In [57]:
import dask.dataframe as dd
import dask.multiprocessing
import json
import numpy as np
import pandas as pd

from ast import literal_eval
from dask.delayed import delayed
from dask.diagnostics import ProgressBar
from datetime import datetime
from pandas.api.types import CategoricalDtype
from path import Path

# Project root is taken to be the parent of the current working directory.
# Nothing below verifies that the notebook was actually started from the expected folder.
PROJECT_DIR = Path.getcwd().parent  # Could assert some things here to check we got the right path
CACHE_OLD_DIR = Path.joinpath(PROJECT_DIR, 'cache')  # Where the originally downloaded files are
CACHE_NEW_DIR = Path.joinpath(PROJECT_DIR, 'cache_new.parquet')  # Where the parquet files are going to live

# Number of partitions the file index is cut into (and the number of delayed chunks built later).
N_PARTITIONS = 2999  # This was decided after trial and error

In [58]:
# Force into multiprocessing - works well for our data
# Called outside a `with` block, so the setting stays active for the computations in later cells.
dask.set_options(get=dask.multiprocessing.get)

<dask.context.set_options at 0x7f489bc531d0>

In [59]:
# Symbol categories come from a pre-computed CSV (no header row: symbol, count).
symbol_counts = pd.read_csv('symbol_counts.csv', names=['symbol', 'count'])

# Fixed category sets, shared by every partition so that the dtypes line up across files.
operation_categories = ["get", "set", "call", "set (failed)"]
operation_categorical_type = CategoricalDtype(categories=operation_categories)
symbol_categorical_type = CategoricalDtype(categories=symbol_counts['symbol'].values)

## Make even(ish) partitions based on call counts.

The partitions are still not exactly even, because the size of the "value" field varies so much from call to call. But it's better than just slicing up the list of files.

In [60]:
# Load the per-file call counts; the count is stored under the column name `crawl_id`,
# so rename it, then order the files by name (the partitioning below relies on this order).
index_with_counts = (
    pd.read_csv('file_index_with_counts.csv.gz')
    .rename(columns={'crawl_id': 'call_count'})
    .sort_values('file_name')
)
index_with_counts.head()

Unnamed: 0,file_name,call_count
0,1_00001314358470f0c99914d5c7af0cd89248e54883ac...,6
1,1_000014b53a60c645e3ac9bde6bae020430c930b3cc59...,30
2,1_00003e3765a73da45db5265de2b22424e025d61380f7...,4
3,1_00004636d8310609e710934f194bfb41a5f0ac7ed5e0...,78
4,1_00004b8315fd1954f06dd80b85ebc61f7ab006785cd3...,64


In [61]:
# Running total of calls in file_name order; the partition boundaries are cut from this below.
index_with_counts['cum_call_count'] = index_with_counts.call_count.cumsum()
index_with_counts.head(10)

Unnamed: 0,file_name,call_count,cum_call_count
0,1_00001314358470f0c99914d5c7af0cd89248e54883ac...,6,6
1,1_000014b53a60c645e3ac9bde6bae020430c930b3cc59...,30,36
2,1_00003e3765a73da45db5265de2b22424e025d61380f7...,4,40
3,1_00004636d8310609e710934f194bfb41a5f0ac7ed5e0...,78,118
4,1_00004b8315fd1954f06dd80b85ebc61f7ab006785cd3...,64,182
5,1_00004cab8dbbf5a10ec8f4fe3d7a816c69a69ff6dcfb...,32,214
6,1_000052c9c1a2bcf00536c9b3f4132222339f74379067...,10,224
7,1_000055875d91ca961b32d7e535548ed87e50de528399...,27,251
8,1_00005baacbbbd2ce9442b6afa0c914506329f669a5b0...,102,353
9,1_00006011493ed94fb8010cead84ee610cdbece5de961...,3,356


In [62]:
# The largest cumulative count is the total number of calls across all files.
total_calls = index_with_counts.cum_call_count.max()
# Target number of calls per partition (a float, not rounded).
n_per_partition = total_calls / N_PARTITIONS
print('Total calls: {:,}'.format(total_calls))
print('Calls per partition: ~{:,.0f}'.format(n_per_partition))

Total calls: 113,790,736
Calls per partition: ~37,943


In [63]:
# Assign each file to a partition by how many target-sized chunks precede it.
# The last file has cum_call_count == total_calls == n_per_partition * N_PARTITIONS, so the
# float floor-division can land on N_PARTITIONS itself depending on rounding. That partition
# number is never visited by range(N_PARTITIONS), so clip to the last valid partition.
index_with_counts['planned_partition'] = np.floor_divide(
    index_with_counts.cum_call_count, n_per_partition
).clip(upper=N_PARTITIONS - 1)
index_with_counts.head()

Unnamed: 0,file_name,call_count,cum_call_count,planned_partition
0,1_00001314358470f0c99914d5c7af0cd89248e54883ac...,6,6,0.0
1,1_000014b53a60c645e3ac9bde6bae020430c930b3cc59...,30,36,0.0
2,1_00003e3765a73da45db5265de2b22424e025d61380f7...,4,40,0.0
3,1_00004636d8310609e710934f194bfb41a5f0ac7ed5e0...,78,118,0.0
4,1_00004b8315fd1954f06dd80b85ebc61f7ab006785cd3...,64,182,0.0


In [64]:
# Inspect the end of the index to see which partition number the last files were given.
index_with_counts.tail()

Unnamed: 0,file_name,call_count,cum_call_count,planned_partition
875367,1_ffffe72029d7bfc78edecbb36680b3bf1684792e0e9c...,9,113790675,2998.0
875368,1_ffffe728e149d0187f4c19aa72e0702399ee4146e631...,2,113790677,2998.0
875369,1_ffffefa6429f4c246f35332bc97c0c611aff1e5f2f87...,8,113790685,2998.0
875370,1_fffff099cd4647207aeb9433939c99d26e85adfa8c7b...,1,113790686,2998.0
875371,1_fffffec2c395d8cad2f6daabc097cb5322148f73e0b3...,50,113790736,2998.0


In [65]:
# Check penultimate break: show the files whose cumulative count lies in a small
# window (-200 / +300 calls) around one partition's worth of calls before the end.
rough_break = index_with_counts.cum_call_count.values[-1] - n_per_partition
window_low = rough_break - 200
window_high = rough_break + 300
near_break = (
    (index_with_counts.cum_call_count > window_low) &
    (index_with_counts.cum_call_count < window_high)
)
index_with_counts[near_break]

Unnamed: 0,file_name,call_count,cum_call_count,planned_partition
874771,1_ffebe26b2675d2bbd4ea87bfb0e96108089183e4e8c3...,80,113752600,2997.0
874772,1_ffebe2c957605eb75994ac5920c276ccb84ccd96af4c...,2,113752602,2997.0
874773,1_ffebe53d510bb0f2697cd57ffac52783584174a2abcf...,5,113752607,2997.0
874774,1_ffebe61fc50a3c8727378eaafc46d12c5b8addbfecca...,1,113752608,2997.0
874775,1_ffebee8e62264d7477d826c885497dfa339a2a719012...,6,113752614,2997.0
874776,1_ffebfb26e0b93fe8af36cc62c8b14167edd4e2e4aa14...,1,113752615,2997.0
874777,1_ffec021028079880f4fa67f6ca30faa7c20ecd12455f...,2,113752617,2997.0
874778,1_ffec0573acf6a05e4fe5ec0c854e499116c45f50b88d...,12,113752629,2997.0
874779,1_ffec18e50e877e5ac018736e6162d6043244ba544d3d...,4,113752633,2997.0
874780,1_ffec222605f05bc6c8a95ff09a8659626c1ae454e77d...,16,113752649,2997.0


## Process data

In [66]:
def fix_data(x):
    """Return ``x`` with its first and last characters replaced by ``[`` and ``]``.

    Used to turn the raw file contents into text that ``json.loads`` can parse.
    Inputs shorter than two characters collapse to ``"[]"``.
    """
    return "".join(("[", x[1:-1], "]"))

def get_files_for_split(split_number):
    """Return the file names (numpy array) planned for partition ``split_number``.

    Reads the module-level ``index_with_counts`` frame; the order of the result
    follows the row order of that frame.
    """
    in_split = index_with_counts.planned_partition == split_number
    return index_with_counts.loc[in_split, 'file_name'].values
    
def get_data_from_file(file_name):
    """Read one downloaded file from ``CACHE_OLD_DIR`` and parse it as JSON.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``CACHE_OLD_DIR``.

    Returns
    -------
    The object produced by ``json.loads`` after ``fix_data`` has replaced the
    first and last characters of the file with square brackets.

    Raises
    ------
    OSError if the file cannot be opened; ``ValueError`` (``json.JSONDecodeError``)
    if the patched text is not valid JSON.
    """
    fp = Path.joinpath(CACHE_OLD_DIR, file_name)
    # Decode explicitly as UTF-8 instead of relying on the platform/locale default,
    # which differs between machines and would mis-decode non-ASCII values.
    # NOTE(review): assumes the downloaded files are UTF-8, as JSON normally is - confirm.
    with open(fp, 'r', encoding='utf-8') as f:
        raw_data = f.read()
    return json.loads(fix_data(raw_data))

def convert_to_dict(item):
    """Parse a serialized ``arguments`` string into a dict.

    The string is first parsed as JSON, which handles ``true``/``false``/``null``
    correctly without touching string values that merely contain those words.
    If that fails, the previous strategy is used as a fallback: rewrite the JSON
    literals to their Python spelling and evaluate with ``ast.literal_eval``
    (this also accepts Python-repr style text such as single-quoted keys).

    Parameters
    ----------
    item : str
        Serialized arguments.

    Returns
    -------
    dict
        The parsed mapping, or ``{}`` when the text cannot be parsed or does not
        describe a dict (callers rely on ``len`` and ``.get`` being available).
    """
    if not isinstance(item, str):
        return {}

    # Preferred path: a real JSON parse.
    try:
        parsed = json.loads(item)
    except (ValueError, RecursionError):
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Fallback path (legacy behaviour): Python-literal evaluation.
    item = item.replace('false', 'False')
    item = item.replace('true', 'True')
    item = item.replace('null', 'None')
    try:
        parsed = literal_eval(item)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}
    return parsed if isinstance(parsed, dict) else {}

def make_df_from_data(data, file_name):
    """Build the cleaned, typed DataFrame for the records of one file.

    Parameters
    ----------
    data
        Records accepted by ``pd.DataFrame.from_records`` (what
        ``get_data_from_file`` returns). The code reads the columns
        ``script_col``, ``script_line``, ``value``, ``time_stamp``,
        ``operation`` and ``symbol``; ``arguments`` is optional.
    file_name : str
        Name of the source file; stored in a column and used as the prefix
        of every ``call_id``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``call_id`` with columns sorted alphabetically. Rows whose
        ``script_col`` or ``script_line`` is not numeric are dropped.
    """
    df = pd.DataFrame.from_records(data)

    # Make script_col and script_line numeric, and drop rows with bad values
    df['script_col'] = pd.to_numeric(df.script_col, errors='coerce')
    df['script_line'] = pd.to_numeric(df.script_line, errors='coerce')
    df = df.dropna(subset=['script_col', 'script_line'])
    df['script_col'] = df['script_col'].astype(int)
    df['script_line'] = df['script_line'].astype(int)
    
    # Make sure arguments column is always present
    if 'arguments' not in df.columns:
        df['arguments'] = "{}"
    
    # Parse arguments
    df['arguments'] = df.arguments.astype(str)
    df['_arg_as_dict'] = df.arguments.apply(convert_to_dict)
    df['arguments_n_keys'] = df._arg_as_dict.apply(len).astype(int)

    # Based on previous computation of max_keys_count
    # One string column per positional argument key "0".."8"; the temporary dict column
    # is removed again afterwards.
    for n in range(9):
        key = 'argument_{}'.format(n)
        df[key] = df._arg_as_dict.apply(lambda x: x.get(str(n))).astype(str)
    df = df.drop('_arg_as_dict', axis=1)
    
    # Add file_name column
    df['file_name'] = file_name
    
    # Make a unique call_id
    # The suffix is the row label, which still holds the record's original position in
    # `data` because dropna above does not renumber the index.
    df['call_id'] = df.file_name.str.cat(df.index.astype(str), sep='__')
    
    # Make a value len and initial value
    df['value_len'] = df.value.str.len()
    df['value_1000'] = df.value.str.slice(0, 1000)
    
    # Make a timestamp
    # Unparseable timestamps become NaT instead of raising (errors='coerce').
    df['time_stamp'] = pd.to_datetime(df.time_stamp, errors='coerce')
    
    # Make categorical
    # Uses the module-level category definitions so every file gets the same dtype.
    df['operation'] = df.operation.astype(operation_categorical_type)
    df['symbol'] = df.symbol.astype(symbol_categorical_type)
    
    # Set call_id as index
    df = df.set_index('call_id')
    
    # Reorder columns so consistent (keeps parquet happy)
    df = df.sort_index(axis='columns')
    return df

def get_df(file_name):
    """Load one file and return its processed DataFrame.

    Thin composition of ``get_data_from_file`` (read + parse) and
    ``make_df_from_data`` (clean + type).
    """
    return make_df_from_data(get_data_from_file(file_name), file_name)

### Test single write and read (with pandas)

In [67]:
%%time
# Build one DataFrame per file for a single sample partition (number 100).
dfs = [get_df(name) for name in get_files_for_split(100)]

CPU times: user 12 s, sys: 173 ms, total: 12.2 s
Wall time: 12.8 s


In [68]:
%%time
# Stack the per-file frames of the sample partition into one pandas DataFrame.
all_df = pd.concat(dfs)

CPU times: user 1.38 s, sys: 17.9 ms, total: 1.4 s
Wall time: 1.4 s


In [69]:
# Row count of the sample partition (compare with the ~37,943 calls-per-partition target).
len(all_df)

37702

In [70]:
# Column dtypes before the parquet round trip.
all_df.dtypes

argument_0                  object
argument_1                  object
argument_2                  object
argument_3                  object
argument_4                  object
argument_5                  object
argument_6                  object
argument_7                  object
argument_8                  object
arguments                   object
arguments_n_keys             int64
call_stack                  object
crawl_id                     int64
file_name                   object
func_name                   object
in_iframe                     bool
location                    object
operation                 category
script_col                   int64
script_line                  int64
script_loc_eval             object
script_url                  object
symbol                    category
time_stamp          datetime64[ns]
value                       object
value_1000                  object
value_len                    int64
dtype: object

In [71]:
# Peek at the first rows of the sample partition.
all_df.head()

Unnamed: 0_level_0,argument_0,argument_1,argument_2,argument_3,argument_4,argument_5,argument_6,argument_7,argument_8,arguments,...,operation,script_col,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len
call_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_08747931a7eb972db94f687affd20af95c8b54af91f8bb979b23f0c4.json__0,,,,,,,,,,{},...,get,7726,2,,http://www.82cook.com/js/jquery-1.6.2.min.js,window.navigator.userAgent,2017-12-16 23:20:39.271,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_08747931a7eb972db94f687affd20af95c8b54af91f8bb979b23f0c4.json__1,,,,,,,,,,{},...,get,5,64,,http://www.82cook.com/entiz/enti.php?bn=23,window.navigator.userAgent,2017-12-16 23:20:39.320,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_08747931a7eb972db94f687affd20af95c8b54af91f8bb979b23f0c4.json__2,,,,,,,,,,{},...,get,5,170,,http://www.82cook.com/entiz/enti.php?bn=23,window.navigator.userAgent,2017-12-16 23:20:39.325,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_08747931a7eb972db94f687affd20af95c8b54af91f8bb979b23f0c4.json__3,,,,,,,,,,{},...,get,832,27,,http://www.google-analytics.com/ga.js,window.document.cookie,2017-12-16 23:20:39.613,PHPSESSID=k5uklrf1t9151jrpcv8q7ig2f2,PHPSESSID=k5uklrf1t9151jrpcv8q7ig2f2,36
1_08747931a7eb972db94f687affd20af95c8b54af91f8bb979b23f0c4.json__4,,,,,,,,,,{},...,get,150,29,,http://www.google-analytics.com/ga.js,window.screen.colorDepth,2017-12-16 23:20:39.614,24,24,2


In [72]:
%%time
# Write the sample partition to a local test file with the same compression and engine
# that the full run uses later, keeping the call_id index.
all_df.to_parquet(
    Path.joinpath('test.parquet'),
    compression='snappy',
    engine='fastparquet',
    write_index=True
)

CPU times: user 755 ms, sys: 16.5 ms, total: 772 ms
Wall time: 775 ms


In [73]:
%%time
# Read the test file back; `df` is compared against `meta` further down.
df = pd.read_parquet(
    Path.joinpath('test.parquet'), 
)

CPU times: user 511 ms, sys: 16.8 ms, total: 528 ms
Wall time: 537 ms


In [74]:
# Check the last rows after the round trip.
df.tail()

Unnamed: 0_level_0,argument_0,argument_1,argument_2,argument_3,argument_4,argument_5,argument_6,argument_7,argument_8,arguments,...,operation,script_col,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len
call_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_08891a2d9c077343b114b865bde7584a7c80294f2c24789c3b186490.json__1,,,,,,,,,,{},...,get,714,71,,https://platform.instagram.com/en_US/embeds.js,window.name,2017-12-16 22:27:59.818,,,0
1_08891a2d9c077343b114b865bde7584a7c80294f2c24789c3b186490.json__2,,,,,,,,,,{},...,get,716,71,,https://platform.instagram.com/en_US/embeds.js,window.name,2017-12-16 22:27:59.819,,,0
1_08891a2d9c077343b114b865bde7584a7c80294f2c24789c3b186490.json__3,,,,,,,,,,{},...,get,57,76,,https://platform.instagram.com/en_US/embeds.js,window.navigator.userAgent,2017-12-16 22:27:59.819,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_08891a2d9c077343b114b865bde7584a7c80294f2c24789c3b186490.json__4,,,,,,,,,,{},...,get,333,122,,https://apis.google.com/_/scs/apps-static/_/js...,window.name,2017-12-16 22:28:00.397,,,0
1_08891a2d9c077343b114b865bde7584a7c80294f2c24789c3b186490.json__5,,,,,,,,,,{},...,get,3798,23,,https://cdns.gigya.com/js/gigya.js?apiKey=3_zF...,window.document.cookie,2017-12-16 22:28:00.523,iduser=172.16.1.150.1513463269507053; __gads=I...,iduser=172.16.1.150.1513463269507053; __gads=I...,621


In [75]:
# Column dtypes after the parquet round trip (should match all_df.dtypes above).
df.dtypes

argument_0                  object
argument_1                  object
argument_2                  object
argument_3                  object
argument_4                  object
argument_5                  object
argument_6                  object
argument_7                  object
argument_8                  object
arguments                   object
arguments_n_keys             int64
call_stack                  object
crawl_id                     int64
file_name                   object
func_name                   object
in_iframe                     bool
location                    object
operation                 category
script_col                   int64
script_line                  int64
script_loc_eval             object
script_url                  object
symbol                    category
time_stamp          datetime64[ns]
value                       object
value_1000                  object
value_len                    int64
dtype: object

### Make meta and divisions

In [76]:
# Empty frame describing the schema every processed partition has:
# alphabetical columns, call_id as index, object dtype unless overridden below.
meta_columns = ['argument_{}'.format(n) for n in range(9)] + [
    'arguments',
    'arguments_n_keys',
    'call_id',
    'call_stack',
    'crawl_id',
    'file_name',
    'func_name',
    'in_iframe',
    'location',
    'operation',
    'script_col',
    'script_line',
    'script_loc_eval',
    'script_url',
    'symbol',
    'time_stamp',
    'value',
    'value_1000',
    'value_len',
]
meta = pd.DataFrame(columns=meta_columns, dtype='object').set_index('call_id')

# Columns that are not plain object/string columns.
meta = meta.astype({
    'arguments_n_keys': np.int64,
    'crawl_id': np.int64,
    'in_iframe': bool,
    'operation': operation_categorical_type,
    'script_col': np.int64,
    'script_line': np.int64,
    'symbol': symbol_categorical_type,
    'time_stamp': 'datetime64[ns]',
    'value_len': np.int64,
})

print(meta.dtypes)
print(meta.index)

argument_0                  object
argument_1                  object
argument_2                  object
argument_3                  object
argument_4                  object
argument_5                  object
argument_6                  object
argument_7                  object
argument_8                  object
arguments                   object
arguments_n_keys             int64
call_stack                  object
crawl_id                     int64
file_name                   object
func_name                   object
in_iframe                     bool
location                    object
operation                 category
script_col                   int64
script_line                  int64
script_loc_eval             object
script_url                  object
symbol                    category
time_stamp          datetime64[ns]
value                       object
value_1000                  object
value_len                    int64
dtype: object
Index([], dtype='object', name='call_id')

In [77]:
# The hand-written meta must describe the data that was actually written:
# same columns in the same order, and the same dtype name for each column.
# Comparing plain lists gives a normal assertion failure (not a length-mismatch
# error) when the two frames have a different number of columns.
assert list(meta.columns) == list(df.columns), 'meta columns differ from the parquet test data'
assert list(meta.dtypes.astype(str)) == list(df.dtypes.astype(str)), 'meta dtypes differ from the parquet test data'

In [78]:
# One division per partition: the first file name of each split, plus a closing sentinel.
# NOTE(review): '2' presumably sorts after every call_id because the file names shown in
# this notebook all start with '1_' - confirm for the full index.
# An empty split would make the [0] lookup fail with an IndexError.
divisions = [get_files_for_split(n)[0] for n in range(N_PARTITIONS)] + ['2']
print('{} divisions'.format(len(divisions)))
divisions[-5:]

3000 divisions


['1_ffa165173a4c48978172af7e6e1f0c5848edb9904dfcc8ce5d5a33bb.json',
 '1_ffba2e877be3a027c6fde6f6fe77d27f61acd59e6832de0282e17a8d.json',
 '1_ffd18cfc37f4d0b14c86ed568b3db6d84f41ee7f237a008ba010fc67.json',
 '1_ffec9cecff312c4106f22ff39db4f86940708eee16d980674875c05a.json',
 '2']

In [82]:
# Save the divisions to divisions.txt in the working directory, one per line ("\r\n" separated).
with open('divisions.txt', 'w') as f:
    f.write("\r\n".join(divisions))

## Run all

In [83]:
def process_chunk(split_number):
    """Return one pandas DataFrame holding every file of partition ``split_number``.

    Each file of the split is processed with ``get_df`` and the results are
    concatenated in file order.
    """
    frames = [get_df(name) for name in get_files_for_split(split_number)]
    return pd.concat(frames)

In [84]:
%%time
# One lazy task per partition; nothing is read or processed until the graph is computed.
dfs = [delayed(process_chunk)(split_number) for split_number in range(N_PARTITIONS)]

CPU times: user 148 ms, sys: 7.03 ms, total: 155 ms
Wall time: 152 ms


In [88]:
# Assemble the delayed chunks into a dask DataFrame, supplying the hand-built schema (meta)
# and the index divisions computed above.
df = dd.from_delayed(dfs, meta=meta, divisions=divisions)
# compute=False: only build the write task here; it is executed in a later cell.
out = df.to_parquet(
    CACHE_NEW_DIR, compression='snappy', engine='fastparquet', write_index=True, compute=False
)

In [89]:
# Number of tasks in the dask graph behind `df`.
len(df.dask)

5998

In [90]:
# Used ~10GB RAM on my machine
# Runs the full conversion (every partition is processed and written); this is the long step.
with ProgressBar():
    out.compute()

[########################################] | 100% Completed |  7hr 34min 30.5s


## Test run data

In [91]:
# Extrapolate total run time from timed samples: (sample size, minutes the sample took).
timed_samples = [
    (5, 0.5),   # Based on sample of 5 - basic data
    (5, 1.8),   # Based on sample of 5 - all data
    (10, 2),    # Based on a sample of 10 - all data (multiprocessing)
]
for sample_size, sample_minutes in timed_samples:
    print('{} hours to finish'.format(((3000 / sample_size) * sample_minutes) / 60))

5.0 hours to finish
18.0 hours to finish
10.0 hours to finish


In [92]:
# Open the converted dataset lazily and look at the first rows.
df = dd.read_parquet(CACHE_NEW_DIR)
df.head()

Unnamed: 0_level_0,argument_0,argument_1,argument_2,argument_3,argument_4,argument_5,argument_6,argument_7,argument_8,arguments,...,operation,script_col,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len
call_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__0,,,,,,,,,,{},...,get,4206,2,,https://mech.iitm.ac.in/meiitm/wp-includes/js/...,window.navigator.userAgent,2017-12-15 23:52:40.662,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__1,,,,,,,,,,{},...,get,2640,11,,https://mech.iitm.ac.in/meiitm/wp-includes/js/...,window.navigator.userAgent,2017-12-15 23:52:40.742,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__2,,,,,,,,,,{},...,get,6226,4,,https://mech.iitm.ac.in/meiitm/wp-admin/js/iri...,window.navigator.userAgent,2017-12-15 23:52:40.751,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko...,68
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__3,,,,,,,,,,{},...,get,6262,4,,https://mech.iitm.ac.in/meiitm/wp-admin/js/iri...,window.navigator.appName,2017-12-15 23:52:40.752,Netscape,Netscape,8
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__4,,,,,,,,,,{},...,get,66,1,,https://mech.iitm.ac.in/meiitm/wp-includes/js/...,window.navigator.appVersion,2017-12-15 23:52:45.707,5.0 (X11),5.0 (X11),9


In [93]:
# Re-open the dataset with only the crawl_id column (a string `columns` value gives a Series,
# indexed by call_id) - a cheap way to scan every row.
df = dd.read_parquet(CACHE_NEW_DIR, columns='crawl_id')
df.head()

call_id
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__0    1
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__1    1
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__2    1
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__3    1
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__4    1
Name: crawl_id, dtype: int64

In [94]:
%%time
# Count rows per crawl_id over the whole dataset; the total is the number of rows written.
df.value_counts().compute()

CPU times: user 18.6 s, sys: 576 ms, total: 19.2 s
Wall time: 21.8 s


1    113718973
Name: crawl_id, dtype: int64

In [5]:
# Write just the call_id values to their own parquet file.
# A separate name is used on purpose: rebinding `df` here would strip the crawl_id column and
# the call_id index, and the cells below (sampling, then dropping 'crawl_id') would fail
# when the notebook is run top to bottom.
with ProgressBar():
    call_id_index = df.reset_index().drop('crawl_id', axis='columns')
    call_id_index.to_parquet('index.parquet', compression='snappy', engine='fastparquet')

[########################################] | 100% Completed | 34.3s


In [95]:
# Draw roughly 0.1% of the rows at random.
# NOTE(review): no random_state is passed, so a re-run selects different rows.
with ProgressBar():
    random_indexes = df.sample(frac=0.001).compute()

[########################################] | 100% Completed | 23.3s


In [97]:
# Keep only the sampled call_id values: move the index into a column and discard crawl_id.
random_indexes = random_indexes.reset_index().drop('crawl_id', axis='columns')
print(len(random_indexes))
random_indexes.head()

113826


Unnamed: 0,call_id
0,1_00177953e32e7046c0739d5edb4974877b3c6f634570...
1,1_0013e00455bedc6adbc37ef23ea98be2fa9b64725511...
2,1_0016f101d8a724a6cff2efc31b4a608556548f9f5a30...
3,1_00171b687c805fa961859d83a562b82d8d289ac41c30...
4,1_0001213aecc8140d73918b7fcd11af181a850ce5b7d2...


In [98]:
# Pick 20 of the sampled call_ids to use as lookup keys (unseeded, so they change per run).
twenty_rows = random_indexes.sample(n=20)
twenty_rows.call_id.values

array(['1_880a50322a54983b7d187546b45e05be77a6d991fc53acfc97aa954a.json__8',
       '1_93937a3129d9f9852b02ad01129e36b2389dcaa2e06da91160a63b3f.json__49',
       '1_345a66f5857331bc0e4f69ee8c6d86fcb07da6b395cc5146746c12b3.json__87',
       '1_c40a615742db891c09e04e19ace734641b1fea8775f49b2066053c9c.json__0',
       '1_b150cf01d6110180f21f09a8cd7d771b274458423a220bd48633e968.json__43',
       '1_7090b764e88efb03cb1665b44a242eb94682d13d7f4e1a3ff45f00e6.json__228',
       '1_a7a39c545b286b1b197c41bf587b254d4dc4f1e8a24aa1424b112255.json__1000',
       '1_bb869d7becb7ad38bfa2869e56d861a0754df06c928f6e33edc882dd.json__19',
       '1_9fecc85901a68ef06fa445419184b24bd503bb88d600d46cfecaf3a8.json__15',
       '1_19714bf818ee2bbcef666e34b363c147d7f99ed1aa616886c630266d.json__456',
       '1_b8a0fe42f7de1b1e65cd1188e9373963c4716c22bb7c955fcc47188c.json__211',
       '1_94cad047dabce97d8f8935b1dcf7294a55c60a9b9bdc01b12d1b550c.json__15',
       '1_5b1758c610443baefdcb4a63b0cf877f5d63af3cf84c16b80a6

### Pick these 20 random rows out of the dataset - getting location and script_url values

In [99]:
# Load only the two columns of interest.
df = dd.read_parquet(CACHE_NEW_DIR, columns=['location', 'script_url'], engine='fastparquet')
# Attach the divisions computed during the conversion.
# NOTE(review): presumably this lets .loc go straight to the partition holding a label
# instead of scanning all of them - confirm against the dask version in use.
df.divisions = divisions
df.head()

Unnamed: 0_level_0,location,script_url
call_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__0,https://mech.iitm.ac.in/meiitm/,https://mech.iitm.ac.in/meiitm/wp-includes/js/...
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__1,https://mech.iitm.ac.in/meiitm/,https://mech.iitm.ac.in/meiitm/wp-includes/js/...
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__2,https://mech.iitm.ac.in/meiitm/,https://mech.iitm.ac.in/meiitm/wp-admin/js/iri...
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__3,https://mech.iitm.ac.in/meiitm/,https://mech.iitm.ac.in/meiitm/wp-admin/js/iri...
1_00001314358470f0c99914d5c7af0cd89248e54883ac3b5083395b61.json__4,https://mech.iitm.ac.in/meiitm/,https://mech.iitm.ac.in/meiitm/wp-includes/js/...


In [100]:
%%time
# Look up each sampled call_id individually and record whether it was found.
success = []
failed = []
for index in twenty_rows.call_id.values:
    print('Looking for {}'.format(index))
    try:
        row = df.loc[index, :].compute()
    except Exception as error:
        # Catch Exception (not a bare except) so that interrupting the kernel still stops
        # the loop, and report the reason instead of hiding it.
        print('  lookup raised {!r}'.format(error))
        failed.append(index)
        continue
    # Depending on the dask version a missing label may come back as an empty frame
    # rather than an error, so an empty result is counted as a failure too.
    if len(row):
        success.append(index)
    else:
        failed.append(index)

Looking for 1_880a50322a54983b7d187546b45e05be77a6d991fc53acfc97aa954a.json__8
Looking for 1_93937a3129d9f9852b02ad01129e36b2389dcaa2e06da91160a63b3f.json__49
Looking for 1_345a66f5857331bc0e4f69ee8c6d86fcb07da6b395cc5146746c12b3.json__87
Looking for 1_c40a615742db891c09e04e19ace734641b1fea8775f49b2066053c9c.json__0
Looking for 1_b150cf01d6110180f21f09a8cd7d771b274458423a220bd48633e968.json__43
Looking for 1_7090b764e88efb03cb1665b44a242eb94682d13d7f4e1a3ff45f00e6.json__228
Looking for 1_a7a39c545b286b1b197c41bf587b254d4dc4f1e8a24aa1424b112255.json__1000
Looking for 1_bb869d7becb7ad38bfa2869e56d861a0754df06c928f6e33edc882dd.json__19
Looking for 1_9fecc85901a68ef06fa445419184b24bd503bb88d600d46cfecaf3a8.json__15
Looking for 1_19714bf818ee2bbcef666e34b363c147d7f99ed1aa616886c630266d.json__456
Looking for 1_b8a0fe42f7de1b1e65cd1188e9373963c4716c22bb7c955fcc47188c.json__211
Looking for 1_94cad047dabce97d8f8935b1dcf7294a55c60a9b9bdc01b12d1b550c.json__15
Looking for 1_5b1758c610443baefdcb4a6

In [104]:
%%time
# Same lookup, but for all 20 labels in a single .loc call.
df.loc[twenty_rows.call_id.values,:].compute()

CPU times: user 166 ms, sys: 169 ms, total: 335 ms
Wall time: 2.19 s


Unnamed: 0_level_0,location,script_url
call_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1_19714bf818ee2bbcef666e34b363c147d7f99ed1aa616886c630266d.json__456,http://askul.co.jp/e/15/,http://assets.adobedtm.com/29bc4dc777738960adb...
1_345a66f5857331bc0e4f69ee8c6d86fcb07da6b395cc5146746c12b3.json__87,https://fr.zopim.com/,https://www.google-analytics.com/analytics.js
1_392cf4ef21b68ddfeafc1bad500b050701d55a142790d8e0e50ee3c2.json__123,http://www.mediamarkt.de/de/category/_streamin...,http://js.redblue.de/fee/js/dist/core.js
1_5b1758c610443baefdcb4a63b0cf877f5d63af3cf84c16b80a65a599.json__187,https://www.zhibo8.cc/zhibo/other/2017/1119113...,https://www.zhibo8.cc/js/2016/ndanmu.js
1_7090b764e88efb03cb1665b44a242eb94682d13d7f4e1a3ff45f00e6.json__228,https://musique.fnac.com/s121859/T-shirts-post...,https://actor-5637.kxcdn.com/actor/3E2C5D6A15C...
1_72e6bc91e0b06000a4427c937ca6e7caba495a9737e0e473c8d636c4.json__309,http://www.xianliao.me/h/9cc46d7c653ee32363f89...,http://cdn.xianliao.me/assets/5334b5d63c4a3db5...
1_7490a65415d870ec5c7716bbe7c48b0de5a45cad50274642ef5fc0df.json__58,http://iitb.ac.in/en/event/quami-ekta-saptah,http://maps.googleapis.com/maps-api-v3/api/js/...
1_7f612e0a69024ff9caccb9ec42753a7712c788a001bd2fbccf722699.json__603,https://www.stoloto.ru/gzhl/game?int=sitemap&l...,https://web.redhelper.ru/container/main.js?ver...
1_87d0bc1745024c44d186a6bc5e6a2ec6b7b19fc2c4744bb4578b2655.json__175,https://bbcdn.go.goldbachpoland.bbelements.com...,https://code.createjs.com/createjs-2015.11.26....
1_880a50322a54983b7d187546b45e05be77a6d991fc53acfc97aa954a.json__8,http://grid.mk/twitter,https://ajax.cloudflare.com/cdn-cgi/scripts/90...


### 2 seconds to pull 20 random rows out of 113 million - yay!