In [1]:
import logging
import os
import sqlite3
from urllib.parse import urlparse

import pandas as pd
from tld import get_tld, get_fld

In [2]:
# Path to the sqlite db containing crawl data.
# Set the GEMEENTE_SQLITE_PATH environment variable to point at your own copy;
# when it is unset, the original hard-coded location is used so existing
# setups keep working unchanged.
SQLITE_LOCAL_PATH = os.environ.get(
    'GEMEENTE_SQLITE_PATH',
    '/Users/liam/dev/OpenWPM/data/gemeente_social_complete.sqlite',
)
# NOTE(review): presumably name fragments of social-media domains to look for;
# not referenced by any of the cells visible here -- confirm intended use.
SM_FLDS = ['facebook', 'instagram', 'linkedin', 'google', 'youtube', 'twitter']

In [3]:
# Open the crawl database.
# NOTE(review): `cursor` is not used in the cells shown here and the connection
# is never closed in this cell -- confirm whether later cells still need them.
con = sqlite3.connect(SQLITE_LOCAL_PATH)
cursor = con.cursor()

# Pull the `javascript` and `site_visits` tables (in that order) into DataFrames
javascript, site_visits = (
    pd.read_sql_query(statement, con)
    for statement in ("SELECT * from javascript", "SELECT * from site_visits")
)

In [4]:
# Show which columns each of the two tables provides
print(javascript.columns, site_visits.columns, sep='\n')

# Start with the visits to the parent urls only (i.e. the gemeente sites
# themselves): rows whose parent_url is the same as their site_url
gemeente_site_visits = site_visits[site_visits['parent_url'].eq(site_visits['site_url'])]

# Restrict the javascript events to the visit_ids of those parent-url visits
gemeente_visit_ids = gemeente_site_visits['visit_id']
gemeente_javascript = javascript[javascript['visit_id'].isin(gemeente_visit_ids)]

# Attach the site_visits information to every javascript event, matching on
# visit_id (which becomes the index). Column names present in both tables are
# disambiguated with the suffixes below.
all_columns = gemeente_javascript.set_index('visit_id').join(
    gemeente_site_visits.set_index('visit_id'),
    lsuffix='_javascript',
    rsuffix='_site_visits',
)

# Should equal javascript.columns + site_visits.columns, with suffixes on any
# name the two tables had in common
all_columns.columns

Index(['id', 'incognito', 'crawl_id', 'visit_id', 'extension_session_uuid',
       'event_ordinal', 'page_scoped_event_ordinal', 'window_id', 'tab_id',
       'frame_id', 'script_url', 'script_line', 'script_col', 'func_name',
       'script_loc_eval', 'document_url', 'top_level_url', 'call_stack',
       'symbol', 'operation', 'value', 'arguments', 'time_stamp'],
      dtype='object')
Index(['visit_id', 'crawl_id', 'site_url', 'parent_url'], dtype='object')


Index(['id', 'incognito', 'crawl_id_javascript', 'extension_session_uuid',
       'event_ordinal', 'page_scoped_event_ordinal', 'window_id', 'tab_id',
       'frame_id', 'script_url', 'script_line', 'script_col', 'func_name',
       'script_loc_eval', 'document_url', 'top_level_url', 'call_stack',
       'symbol', 'operation', 'value', 'arguments', 'time_stamp',
       'crawl_id_site_visits', 'site_url', 'parent_url'],
      dtype='object')

In [5]:
# Preview the first rows of the joined frame to see which columns are of interest
all_columns.head()

Unnamed: 0_level_0,id,incognito,crawl_id_javascript,extension_session_uuid,event_ordinal,page_scoped_event_ordinal,window_id,tab_id,frame_id,script_url,...,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp,crawl_id_site_visits,site_url,parent_url
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,42,0,6,dc49b49a-d785-409c-9556-04c70223de91,138,0,3,1,0,https://gemeente.groningen.nl/misc/drupal.js?p...,...,https://gemeente.groningen.nl/,@https://gemeente.groningen.nl/misc/drupal.js?...,window.document.cookie,set,has_js=1; path=/,,2019-05-15T21:49:02.608Z,6,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,81,0,6,dc49b49a-d785-409c-9556-04c70223de91,139,1,3,1,0,https://www.kcmsurvey.com/custom/gemeentegroni...,...,https://gemeente.groningen.nl/,,window.localStorage,get,{},,2019-05-15T21:49:02.824Z,6,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,82,0,6,dc49b49a-d785-409c-9556-04c70223de91,140,2,3,1,0,https://www.kcmsurvey.com/custom/gemeentegroni...,...,https://gemeente.groningen.nl/,,window.Storage.setItem,call,,"[""kcm_test"",""kcm_test""]",2019-05-15T21:49:02.906Z,6,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,83,0,6,dc49b49a-d785-409c-9556-04c70223de91,141,3,3,1,0,https://www.kcmsurvey.com/custom/gemeentegroni...,...,https://gemeente.groningen.nl/,,window.localStorage,get,"{""kcm_test"":""kcm_test""}",,2019-05-15T21:49:02.907Z,6,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,84,0,6,dc49b49a-d785-409c-9556-04c70223de91,142,4,3,1,0,https://www.kcmsurvey.com/custom/gemeentegroni...,...,https://gemeente.groningen.nl/,,window.Storage.removeItem,call,,"[""kcm_test""]",2019-05-15T21:49:02.908Z,6,http://gemeente.groningen.nl,http://gemeente.groningen.nl


In [6]:
# The cookie instrument was not enabled for this crawl, so we work with what
# the javascript table alone can tell us. For reference:
#   Cookie Access (Experimental -- Needs tests)
#     Set browser_params['cookie_instrument'] = True
#     Data is saved to the javascript_cookies table.
#     Will record cookies set both by Javascript and via HTTP Responses

# Narrow the joined frame down to the columns used in the rest of the analysis
javascript_analysis_set = all_columns[[
    "extension_session_uuid",
    "script_url",
    "top_level_url",
    "call_stack",
    "symbol",
    "operation",
    "value",
    "arguments",
    "time_stamp",
    "site_url",
    "parent_url",
]]
javascript_analysis_set.head()
                                      

Unnamed: 0_level_0,extension_session_uuid,script_url,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp,site_url,parent_url
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6,dc49b49a-d785-409c-9556-04c70223de91,https://gemeente.groningen.nl/misc/drupal.js?p...,https://gemeente.groningen.nl/,@https://gemeente.groningen.nl/misc/drupal.js?...,window.document.cookie,set,has_js=1; path=/,,2019-05-15T21:49:02.608Z,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,dc49b49a-d785-409c-9556-04c70223de91,https://www.kcmsurvey.com/custom/gemeentegroni...,https://gemeente.groningen.nl/,,window.localStorage,get,{},,2019-05-15T21:49:02.824Z,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,dc49b49a-d785-409c-9556-04c70223de91,https://www.kcmsurvey.com/custom/gemeentegroni...,https://gemeente.groningen.nl/,,window.Storage.setItem,call,,"[""kcm_test"",""kcm_test""]",2019-05-15T21:49:02.906Z,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,dc49b49a-d785-409c-9556-04c70223de91,https://www.kcmsurvey.com/custom/gemeentegroni...,https://gemeente.groningen.nl/,,window.localStorage,get,"{""kcm_test"":""kcm_test""}",,2019-05-15T21:49:02.907Z,http://gemeente.groningen.nl,http://gemeente.groningen.nl
6,dc49b49a-d785-409c-9556-04c70223de91,https://www.kcmsurvey.com/custom/gemeentegroni...,https://gemeente.groningen.nl/,,window.Storage.removeItem,call,,"[""kcm_test""]",2019-05-15T21:49:02.908Z,http://gemeente.groningen.nl,http://gemeente.groningen.nl


In [7]:
# Which kinds of JS actions happen on the top-level gemeente pages?
# We are mainly interested in cookie reads (window.document.cookie / get), see
# https://github.com/mercator-working-group/gemeente-social/issues/3

print(javascript_analysis_set.shape)

# Keep three columns, group by (symbol, operation), count the rows per group
# and list the ten most frequent combinations first (descending by count).
(
    javascript_analysis_set[['symbol', 'operation', 'parent_url']]
    .groupby(['symbol', 'operation'])
    .count()
    .sort_values(by=['parent_url'], ascending=False)
    .head(10)
)

# Roughly 7k of the 19k javascript events are window.document.cookie 'get'
# Roughly 1.6k of the 19k javascript events are window.document.cookie 'set'


(19225, 11)


Unnamed: 0_level_0,Unnamed: 1_level_0,parent_url
symbol,operation,Unnamed: 2_level_1
window.document.cookie,get,6961
window.navigator.userAgent,get,4552
window.document.cookie,set,1668
window.document.referrer,get,1520
window.navigator.platform,get,727
window.navigator.vendor,get,621
window.name,get,559
window.navigator.cookieEnabled,get,518
window.localStorage,get,421
window.Storage.getItem,call,286


In [8]:
# Keep only the events that touch window.document.cookie
document_cookie = javascript_analysis_set.loc[
    javascript_analysis_set['symbol'].eq('window.document.cookie')
]
# Not the last expression of the cell, so this value is evaluated but not displayed
document_cookie.shape

# Helper to reduce a crawled link to its TLD+1 level.
def parse_tld_url(url):
    """Return the first-level domain of ``url`` as computed by ``tld.get_fld``.

    Despite the name, this yields the first-level domain (e.g.
    ``google-analytics.com``), not just the TLD. ``fail_silently=True`` is
    passed so that URLs ``get_fld`` cannot resolve give ``None`` rather than
    raising.
    """
    first_level_domain = get_fld(url, fail_silently=True)
    return first_level_domain

# Work on a fresh copy (so pandas does not warn about writing to a slice) and
# add a column holding the first-level domain of each script_url
script_url_cookies = document_cookie.assign(
    script_url_tld=document_cookie['script_url'].apply(parse_tld_url)
)


def top_script_url_tld(script_url_cookies, top_n=20):
    """Rank script domains by the number of events they account for.

    Parameters
    ----------
    script_url_cookies : pandas.DataFrame
        Frame containing at least the columns ``script_url_tld`` and
        ``parent_url``.
    top_n : int, default 20
        Maximum number of domains to return. The default matches the
        previously hard-coded limit.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``script_url_tld`` with a single ``parent_url`` column that
        holds the count of non-null ``parent_url`` values per domain, sorted
        from most to least frequent and truncated to ``top_n`` rows.
    """
    counts_per_domain = (
        script_url_cookies[['script_url_tld', 'parent_url']]
        .groupby('script_url_tld')
        .count()
    )
    return counts_per_domain.sort_values(by=['parent_url'], ascending=False).head(top_n)
    

# Rank the script domains across all window.document.cookie events
top_script_url_tld(script_url_cookies)
# OK, so we've got an idea of who is most prevalent in their cookie actions...

Unnamed: 0_level_0,parent_url
script_url_tld,Unnamed: 1_level_1
google-analytics.com,3543
simanalytics.nl,1451
readspeaker.com,830
siteimproveanalytics.com,606
siteimprove.com,215
sooqr.com,147
cobrowser.com,119
livecom.net,57
hotjar.com,34
brummen.nl,29


In [9]:
# Cookie reads only: copy the cookie events, keep the rows whose operation is
# 'get', then rank the script domains responsible for them
get_cookies = script_url_cookies.copy()
get_cookies = get_cookies.loc[get_cookies['operation'].eq('get')]
top_script_url_tld(get_cookies)

Unnamed: 0_level_0,parent_url
script_url_tld,Unnamed: 1_level_1
google-analytics.com,2922
simanalytics.nl,937
readspeaker.com,830
siteimproveanalytics.com,550
siteimprove.com,193
cobrowser.com,117
sooqr.com,114
livecom.net,57
hotjar.com,29
browsealoud.com,27


In [10]:
# Cookie writes only: copy the cookie events, keep the rows whose operation is
# 'set', then rank the script domains responsible for them
set_cookies = script_url_cookies.copy()
set_cookies = set_cookies.loc[set_cookies['operation'].eq('set')]
top_script_url_tld(set_cookies)

Unnamed: 0_level_0,parent_url
script_url_tld,Unnamed: 1_level_1
google-analytics.com,621
simanalytics.nl,514
siteimproveanalytics.com,56
sooqr.com,33
siteimprove.com,22
facebook.net,8
halderberge.nl,7
westervoort.nl,7
brummen.nl,7
purmerend.nl,6


In [11]:
# Isolate the cookie events whose script was served from facebook.net
social_sites_document_cookie = script_url_cookies.loc[
    script_url_cookies['script_url_tld'].eq("facebook.net")
]
social_sites_document_cookie

Unnamed: 0_level_0,extension_session_uuid,script_url,top_level_url,call_stack,symbol,operation,value,arguments,time_stamp,site_url,parent_url,script_url_tld
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,_sce=1; _ga=GA1.2.1340102085.1557963998; _gid=...,,2019-05-15T23:46:38.996Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,_sce=1; _ga=GA1.2.1340102085.1557963998; _gid=...,,2019-05-15T23:46:38.996Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,v@https://connect.facebook.net/signals/config/...,window.document.cookie,set,_fbp=fb.0.1557963998556.1282726948;expires=Tue...,,2019-05-15T23:46:38.997Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,_sce=1; _ga=GA1.2.1340102085.1557963998; _gid=...,,2019-05-15T23:46:38.997Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,v@https://connect.facebook.net/signals/config/...,window.document.cookie,set,_fbp=fb.1.1557963998556.1282726948;expires=Tue...,,2019-05-15T23:46:38.997Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,_sce=1; _ga=GA1.2.1340102085.1557963998; _gid=...,,2019-05-15T23:46:38.998Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,_sce=1; _ga=GA1.2.1340102085.1557963998; _gid=...,,2019-05-15T23:46:40.183Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,_sce=1; _ga=GA1.2.1340102085.1557963998; _gid=...,,2019-05-15T23:46:40.184Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
2493,0086562c-1c61-4ae5-9b34-dc1bc05a7468,https://connect.facebook.net/signals/config/16...,https://www.helmond.nl/Inwoner,v@https://connect.facebook.net/signals/config/...,window.document.cookie,set,_fbp=fb.1.1557963998556.1282726948;expires=Tue...,,2019-05-15T23:46:40.184Z,http://www.helmond.nl,http://www.helmond.nl,facebook.net
3669,e0da16c8-96dd-4ce0-a852-5c93b2aa9a9c,https://connect.facebook.net/signals/config/80...,https://www.visitleiden.nl/nl/landingpage,s@https://connect.facebook.net/signals/config/...,window.document.cookie,get,PHPSESSID=r4vbr4og5jao98isq4qu1uoohe; _ga=GA1....,,2019-05-16T00:39:57.197Z,http://www.leiden.nl,http://www.leiden.nl,facebook.net
