Original data contained 2,059,735 files/locations but data cleaning stripped out locations because the data was malformed. Specifically the script_col and script_line values were not integers - that appear to have leaked from location.

This notebook extracts that as a seperate dataset so we can see whether it contains something interesting.

In [4]:
import concurrent.futures
import dask.dataframe as dd
import json
import numpy as np
import pandas as pd

from ast import literal_eval
from dask.diagnostics import ProgressBar
from datetime import datetime
from pandas.api.types import CategoricalDtype
from path import Path
from tqdm import tqdm

PROJECT_DIR = Path.getcwd().parent  # Could assert some things here to check we got the right path
CACHE_OLD_DIR = Path.joinpath(PROJECT_DIR, 'cache')  # Where the originally downloaded files are
EXTRACTED_FILE = Path.joinpath(PROJECT_DIR, 'bad_lines.csv.gz')  # Where the bad files are going to live

fix_data = lambda x:  "[" + x[1:-1] + "]"

operation_categorical_type = CategoricalDtype(categories=["get", "set", "call", "set (failed)"])

In [2]:
index_file_path = Path.joinpath(PROJECT_DIR, 'file_index.txt')
assert index_file_path.exists(), 'Index File is missing'
index_file_path

Path('/home/bird/Dev/mozilla/sb2018/file_index.txt')

In [3]:
with open(index_file_path, 'r') as index_file:
    index = index_file.readlines()
    index = np.array([x.rstrip() for x in index])
index[0:4]

array(['1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json',
       '1_000014b53a60c645e3ac9bde6bae020430c930b3cc5903677e0d5cb2.json',
       '1_00003e3765a73da45db5265de2b22424e025d61380f7cf8080b378aa.json',
       '1_00004636d8310609e710934f194bfb41a5f0ac7ed5e05c0fb9047e48.json'],
      dtype='<U63')

## Process data

In [6]:
def get_data_from_file(file_name):
    fp = Path.joinpath(CACHE_OLD_DIR, file_name)
    with open(fp, 'r') as f:
        raw_data = f.read()
    data = json.loads(fix_data(raw_data))
    return data

def make_df_from_data(data, file_name):
    df = pd.DataFrame.from_records(data)
    df = df.reset_index()
    df['file_name'] = file_name
    # Make a unique call_id
    df = df.rename(columns=dict(index='call_id'))
    call_id_format = '{file_name}__{index:06d}'
    short_file_name = file_name.split('.json')[0]
    df.call_id = df.call_id.apply(lambda x: call_id_format.format(file_name=short_file_name, index=x))
    # Make a value len and initial value
    df['value_len'] = df.value.str.len()
    df['value_1000'] = df.value.str.slice(0, 1000)
    # Make a timestamp
    df['time_stamp'] = pd.to_datetime(df.time_stamp, errors='coerce')
    # Make categorical
    df['operation'] = df.operation.astype(operation_categorical_type)
    # Make script_col and script_line numeric, if errors, then return df
    try:
        df['script_col'] = pd.to_numeric(df.script_col)
        df['script_line'] = pd.to_numeric(df.script_line)
        return None
    except:
        return df

def get_df(file_name):
    data = get_data_from_file(file_name)
    df = make_df_from_data(data, file_name)
    return df

### Test single write and read

In [None]:
%%time
dfs = []
for file_name in index[0:1000]:
    dfs.append(get_df(file_name))

In [12]:
%%time
all_df = pd.concat(dfs, ignore_index=True)

CPU times: user 2.38 ms, sys: 941 µs, total: 3.32 ms
Wall time: 3.63 ms


In [15]:
all_df.head(10)

Unnamed: 0,call_id,call_stack,crawl_id,func_name,in_iframe,location,operation,script_col,script_line,script_loc_eval,script_url,symbol,time_stamp,value,file_name,value_len,value_1000
0,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,,True,http://catfly.gr/,get,11209,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:15.130,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,68,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,Zf,True,http://catfly.gr/,get,27894,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.plugins[Shockwave Flash].desc...,2017-12-16 17:52:15.149,Shockwave Flash 28.0 r0,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,23,Shockwave Flash 28.0 r0
2,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,Zf,True,http://catfly.gr/,get,28026,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.plugins[Shockwave Flash].desc...,2017-12-16 17:52:15.149,Shockwave Flash 28.0 r0,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,23,Shockwave Flash 28.0 r0
3,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,js,True,http://catfly.gr/,get,170425,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:15.153,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,68,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
4,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,js,True,http://catfly.gr/,get,170439,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.platform,2017-12-16 17:52:15.154,Linux x86_64,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,12,Linux x86_64
5,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,js,True,http://catfly.gr/,get,170683,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.product,2017-12-16 17:52:15.154,Gecko,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,5,Gecko
6,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,Cs,True,http://catfly.gr/,get,179602,1,,http://pagead2.googlesyndication.com/pagead/js...,window.screen.colorDepth,2017-12-16 17:52:15.168,24,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,2,24
7,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,mq,True,http://catfly.gr/,get,130500,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:15.207,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,68,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...
8,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,likeUsBackShow,False,http://catfly.gr/,get,//cdn.catfly.com/js/script.js?01318aa,http,,,window.localStorage,2017-12-16 17:52:14.701,"{""google_pub_config"":""{\""sraConfigs\"":{\""2\"":{...",1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,129,"{""google_pub_config"":""{\""sraConfigs\"":{\""2\"":{..."
9,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,$h/<,False,http://catfly.gr/,get,48170,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:14.934,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,68,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...


In [16]:
all_df.dtypes

call_id                    object
call_stack                 object
crawl_id                    int64
func_name                  object
in_iframe                    bool
location                   object
operation                category
script_col                 object
script_line                object
script_loc_eval            object
script_url                 object
symbol                     object
time_stamp         datetime64[ns]
value                      object
file_name                  object
value_len                   int64
value_1000                 object
dtype: object

In [18]:
%%time
all_df.to_parquet(EXTRACTED_FILE, compression='snappy')

CPU times: user 541 ms, sys: 4.75 ms, total: 546 ms
Wall time: 561 ms


### Process all data

In [24]:
dfs = []
for file_name in tqdm(index):
    dfs.append(get_df(file_name))

100%|██████████| 2059735/2059735 [4:13:39<00:00, 135.34it/s]  


In [26]:
'The number of files with bad data is {:,}'.format(len(dfs))

'The number of files with bad data is 2,059,735'

In [28]:
dfs[0:10]

[None, None, None, None, None, None, None, None, None, None]

In [29]:
all_df = pd.concat(dfs, ignore_index=True)

In [37]:
all_df.location.unique().size

2468

In [39]:
all_df.dtypes

arguments                  object
call_id                    object
call_stack                 object
crawl_id                    int64
file_name                  object
func_name                  object
in_iframe                    bool
location                   object
operation                category
script_col                 object
script_line                object
script_loc_eval            object
script_url                 object
symbol                     object
time_stamp         datetime64[ns]
value                      object
value_1000                 object
value_len                   int64
dtype: object

In [41]:
all_df.to_csv(EXTRACTED_FILE, index=False, compression='gzip')

Check it all worked out.


Note there are still some rows with good script_col script_line because we save for the whole json file that had bad lines.

In [9]:
df = pd.read_csv(EXTRACTED_FILE)
df.head(10)

Unnamed: 0,arguments,call_id,call_stack,crawl_id,file_name,func_name,in_iframe,location,operation,script_col,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len
0,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,True,http://catfly.gr/,get,11209,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:15.130,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,Zf,True,http://catfly.gr/,get,27894,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.plugins[Shockwave Flash].desc...,2017-12-16 17:52:15.149,Shockwave Flash 28.0 r0,Shockwave Flash 28.0 r0,23
2,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,Zf,True,http://catfly.gr/,get,28026,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.plugins[Shockwave Flash].desc...,2017-12-16 17:52:15.149,Shockwave Flash 28.0 r0,Shockwave Flash 28.0 r0,23
3,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,js,True,http://catfly.gr/,get,170425,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:15.153,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
4,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,js,True,http://catfly.gr/,get,170439,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.platform,2017-12-16 17:52:15.154,Linux x86_64,Linux x86_64,12
5,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,js,True,http://catfly.gr/,get,170683,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.product,2017-12-16 17:52:15.154,Gecko,Gecko,5
6,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,Cs,True,http://catfly.gr/,get,179602,1,,http://pagead2.googlesyndication.com/pagead/js...,window.screen.colorDepth,2017-12-16 17:52:15.168,24,24,2
7,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,mq,True,http://catfly.gr/,get,130500,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:15.207,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
8,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,likeUsBackShow,False,http://catfly.gr/,get,//cdn.catfly.com/js/script.js?01318aa,http,,,window.localStorage,2017-12-16 17:52:14.701,"{""google_pub_config"":""{\""sraConfigs\"":{\""2\"":{...","{""google_pub_config"":""{\""sraConfigs\"":{\""2\"":{...",129
9,,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,,1,1_0009022e7df583be8d2fb02720e8ff31e49f1f7d811f...,$h/<,False,http://catfly.gr/,get,48170,1,,http://pagead2.googlesyndication.com/pagead/js...,window.navigator.userAgent,2017-12-16 17:52:14.934,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
