## Getting Started
Import the libraries used throughout this notebook and configure the pandas display settings.

In [4]:
import numpy as np
import pandas as pd
import tldextract
from urllib.parse import urlparse
import os

In [5]:
## Don't limit pandas display.
# None removes each limit. `display.max_colwidth` used to be set to -1 here;
# that value has been deprecated since pandas 1.0 and is rejected by newer
# releases, so None is used to disable column-width truncation instead.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

## Helper functions
Utility functions that are used in the analysis below.

In [6]:
def extract_domain(url):
    """Use tldextract to return the base domain from a url.

    The result is the extracted ``domain`` and ``suffix`` joined by a dot
    (e.g. ``facebook.com``). When one of the two parts is empty, only the
    non-empty part is returned, so no stray leading/trailing dot is produced.

    Returns the string ``'ERROR'`` if extraction raises for any reason; this
    keeps a column-wide ``apply`` running on malformed values.
    """
    try:
        extracted = tldextract.extract(url)
        # Skip empty components so a suffix-less host yields 'localhost'
        # rather than 'localhost.'.
        parts = [part for part in (extracted.domain, extracted.suffix) if part]
        return '.'.join(parts)
    except Exception:
        return 'ERROR'

In [7]:
# Borrowed from 2018_09_biskit1_mordax__canvas_fingerprinting notebook
def parse_base_url(url):
    """Return the netloc component of ``url`` as parsed by ``urlparse``.

    For a URL with a scheme this is the part after ``//`` and before the
    first following ``/``.
    """
    parsed = urlparse(url)
    return parsed.netloc

In [8]:
def write_csv(path, name, df):
    """Write ``df`` as CSV to the file ``name`` inside the directory ``path``."""
    destination = os.path.join(path, name)
    df.to_csv(destination)
    

## Data directory
Change the paths below to point to the location of your data.

In [20]:
# Folder that holds the overscripted data (also used as the CSV output folder).
DATA_DIR = '/home/alvis/Desktop/Richa/overscripted/'
# Parquet dataset to read. I ran this with sample data*
PARQUET_FILE = os.path.join(DATA_DIR, 'sample')
# Columns kept when building the reduced dataframe.
IMP_COLUMNS = [
    'arguments',
    'in_iframe',
    'location',
    'operation',
    'script_url',
    'symbol',
    'time_stamp',
    'value_1000',
    'location_domain',
    'script_domain',
    'location_base_url',
]

In [21]:
# Read the parquet dataset with the pyarrow engine and preview the first rows.
df = pd.read_parquet(path=PARQUET_FILE, engine="pyarrow")
df.head()

Unnamed: 0,argument_0,argument_1,argument_2,argument_3,argument_4,argument_5,argument_6,argument_7,argument_8,arguments,arguments_n_keys,call_id,call_stack,crawl_id,file_name,func_name,in_iframe,location,operation,script_col,script_line,script_loc_eval,script_url,symbol,time_stamp,value,value_1000,value_len,valid,errors
0,,,,,,,,,,{},0,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json__0,,1,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json,a/<,True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,1802,57,,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.name,2017-12-16 02:54:10.079,fb_xdm_frame_https,fb_xdm_frame_https,18,True,
1,,,,,,,,,,{},0,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json__1,,1,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json,a/<,True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,2895,57,,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.name,2017-12-16 02:54:10.080,fb_xdm_frame_https,fb_xdm_frame_https,18,True,
2,,,,,,,,,,{},0,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json__2,A@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:57:2781\nx@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:55:3028\nw@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:55:931\na/<@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:57:2353\na@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:57:114\nrequire@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:36:610\n@https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com:57:3019,1,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json,A,True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,2781,57,,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.document.cookie,2017-12-16 02:54:10.086,,,0,True,
3,,,,,,,,,,{},0,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json__3,,1,1_028048bbce3f7816a5f1277ac3ac2372d6607581a77a4bfb7a1873ab.json,x,True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,156,49,,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.navigator.userAgent,2017-12-16 02:54:10.088,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,68,True,
4,,,,,,,,,,{},0,1_0401c74e1e381c6f4ebd5ca99102f1529c9a843360d6c9211525136f.json__0,,1,1_0401c74e1e381c6f4ebd5ca99102f1529c9a843360d6c9211525136f.json,ra/<,True,https://cas.us.criteo.com/delivery/r/afr.php?did=5a34c73ff17390f0eaeb591979874b00&z=WjTHPwAHZYkKT7BIAAcskrRh4Qozh3b-c-mtZg&u=%7C7J5NcLNwKWZvhHazrdQ0r3pEybQM2VrhNSue519M%2FnU%3D%7C&c1=M5BADJe1UR3zJ2HNju9b10FggySKKMK0AoYTtPDcqDnSIQIZUQPlDupK--OP2eR-eNGQ46cgN3mwCl5UMg4IstlvomsUbHEHUzImPBAbL0KpTFeMsdEkBo28MAQVY_79HvMen3pU9pjoRxbnxk_AxatU3fdvCPtFY7Wzui5q962zi71J5i_HHNmYi7XbHxLl1v3NLOEqWiI-3QfHE1byzwOhuyge44QAJfUpukDSr4X723xUoquihjIy6b6D_yU9AsLHIIxKQk64_ES4G8moUw5dbt7SG3KWRhyjzAZW5acfRwaX8v33UzaCSZKj4O0XffzJaDiMmsprtAOP0J4xHPtfZqvurKt_x3z5y83mK1o&ct0=https://adclick.g.doubleclick.net/aclk%3Fsa%3Dl%26ai%3DCcqaRP8c0WonLHcjgvgKS2ZyADO7lmPBNsu23nZ0BwI23ARABIABgyQaCARdjYS1wdWItNTc4NzU5MjQ4Mzc2Njc2MKABrN3-6APIAQngAgCoAwGqBMUBT9DGnU9Xf5zpWjsp7PXxVDLu7mhvsOzx8jjeTb-wk_FUQNpBqVd4QxwydKBkX31VemFtAuP1QMeGjoHagpA44JfU11OU46ZLmBKcADPeCDg8kDPJvowA7EbbZ6gvml2aRO7nKo1LHNbLoGTBvP6gmhnbhVqThagbrECDM6qxbcRiiWobTKajDG8KeWma5flmrMZiQe5Lu3cyX_WMmu36IIP2lojiMZaZvgiE_ncYb24UZCKxrORb0gO54t1XHQFwzDnBRPXgBAGABufvkeKYhIzL9gGgBiGoB6a-G9gHANIIBQiAYRAB8ggbYWR4LXN1YnN5bi0wOTI1MDI4NTk2NjIxNjE3%26num%3D1%26sig%3DAOD64_1ibrZmc1pVLttz4doqoDmSHOvwXg%26client%3Dca-pub-5787592483766760%26adurl%3D,get,306,25,,https://ajax.googleapis.com/ajax/libs/webfont/1.6.26/webfont.js,window.navigator.userAgent,2017-12-16 07:12:07.104,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,68,True,


In [7]:
# Number of rows in the loaded dataframe.
df.shape[0]

9234

In [22]:
# Some location and script url cleansing. Nice idea from 2018_09_biskit1_mordax__canvas_fingerprinting
# Base domain of the page location and of the script, plus the netloc of the location.
df['location_domain'] = df['location'].apply(extract_domain)
df['script_domain'] = df['script_url'].apply(extract_domain)
df['location_base_url'] = df['location'].apply(parse_base_url)

# Rdf = reduced dataframe: only the columns listed in IMP_COLUMNS.
Rdf = df.loc[:, IMP_COLUMNS]
Rdf.head()

Unnamed: 0,arguments,in_iframe,location,operation,script_url,symbol,time_stamp,value_1000,location_domain,script_domain,location_base_url
0,{},True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.name,2017-12-16 02:54:10.079,fb_xdm_frame_https,facebook.com,facebook.com,staticxx.facebook.com
1,{},True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.name,2017-12-16 02:54:10.080,fb_xdm_frame_https,facebook.com,facebook.com,staticxx.facebook.com
2,{},True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.document.cookie,2017-12-16 02:54:10.086,,facebook.com,facebook.com,staticxx.facebook.com
3,{},True,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,get,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,window.navigator.userAgent,2017-12-16 02:54:10.088,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,facebook.com,facebook.com,staticxx.facebook.com
4,{},True,https://cas.us.criteo.com/delivery/r/afr.php?did=5a34c73ff17390f0eaeb591979874b00&z=WjTHPwAHZYkKT7BIAAcskrRh4Qozh3b-c-mtZg&u=%7C7J5NcLNwKWZvhHazrdQ0r3pEybQM2VrhNSue519M%2FnU%3D%7C&c1=M5BADJe1UR3zJ2HNju9b10FggySKKMK0AoYTtPDcqDnSIQIZUQPlDupK--OP2eR-eNGQ46cgN3mwCl5UMg4IstlvomsUbHEHUzImPBAbL0KpTFeMsdEkBo28MAQVY_79HvMen3pU9pjoRxbnxk_AxatU3fdvCPtFY7Wzui5q962zi71J5i_HHNmYi7XbHxLl1v3NLOEqWiI-3QfHE1byzwOhuyge44QAJfUpukDSr4X723xUoquihjIy6b6D_yU9AsLHIIxKQk64_ES4G8moUw5dbt7SG3KWRhyjzAZW5acfRwaX8v33UzaCSZKj4O0XffzJaDiMmsprtAOP0J4xHPtfZqvurKt_x3z5y83mK1o&ct0=https://adclick.g.doubleclick.net/aclk%3Fsa%3Dl%26ai%3DCcqaRP8c0WonLHcjgvgKS2ZyADO7lmPBNsu23nZ0BwI23ARABIABgyQaCARdjYS1wdWItNTc4NzU5MjQ4Mzc2Njc2MKABrN3-6APIAQngAgCoAwGqBMUBT9DGnU9Xf5zpWjsp7PXxVDLu7mhvsOzx8jjeTb-wk_FUQNpBqVd4QxwydKBkX31VemFtAuP1QMeGjoHagpA44JfU11OU46ZLmBKcADPeCDg8kDPJvowA7EbbZ6gvml2aRO7nKo1LHNbLoGTBvP6gmhnbhVqThagbrECDM6qxbcRiiWobTKajDG8KeWma5flmrMZiQe5Lu3cyX_WMmu36IIP2lojiMZaZvgiE_ncYb24UZCKxrORb0gO54t1XHQFwzDnBRPXgBAGABufvkeKYhIzL9gGgBiGoB6a-G9gHANIIBQiAYRAB8ggbYWR4LXN1YnN5bi0wOTI1MDI4NTk2NjIxNjE3%26num%3D1%26sig%3DAOD64_1ibrZmc1pVLttz4doqoDmSHOvwXg%26client%3Dca-pub-5787592483766760%26adurl%3D,get,https://ajax.googleapis.com/ajax/libs/webfont/1.6.26/webfont.js,window.navigator.userAgent,2017-12-16 07:12:07.104,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,criteo.com,googleapis.com,cas.us.criteo.com


In [25]:
# Both of these symbols are used for checking which plugins are on the browser.
# str.contains interprets the pattern as a regular expression by default, so the
# dots are escaped to match a literal '.' instead of any character.
df_mimeTypes = Rdf[Rdf.symbol.str.contains(r'navigator\.mimeTypes|navigator\.plugins')]
write_csv(DATA_DIR,'df_navigator.csv',df_mimeTypes)

In [None]:
# NOTE(review): this cell repeats the preceding cell (same filter, same output
# file) and can be removed.
# Both of these symbols are used for checking which plugins are on the browser.
# str.contains interprets the pattern as a regular expression by default, so the
# dots are escaped to match a literal '.' instead of any character.
df_mimeTypes = Rdf[Rdf.symbol.str.contains(r'navigator\.mimeTypes|navigator\.plugins')]
write_csv(DATA_DIR,'df_navigator.csv',df_mimeTypes)

In [None]:
# Keep the rows whose script URL contains '/akam/' and preview them.
akam_f = df.loc[df['script_url'].str.contains('/akam/')]
akam_f.head()