In [29]:
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import re
import s3fs
import six
from pyarrow.filesystem import S3FSWrapper

In [3]:
# import some analysis utilities from https://github.com/englehardt/crawl_utils
import sys
# './crawl_utils/' is a relative path, so it is resolved against the directory
# the notebook kernel is running in. It must be on sys.path before the two
# imports below can succeed.
sys.path.append('./crawl_utils/')
import domain_utils as du  # used below for du.get_ps_plus_1
import analysis_utils as au  # used below for au.get_script_urls_from_call_stack_as_set

In [4]:
# S3 bucket and crawl directory that hold the crawl output.
BUCKET = 'openwpm-crawls'
CRAWL_DIR = '2018-09-11_top_20k_stateless'

# Path template for one table of the crawl. The escaped '%%s' survives this
# formatting step as a single '%s' slot, which later cells fill with the
# table name (e.g. BUCKET_URI % 'http_requests').
BUCKET_URI = '%s/%s/visits/%%s' % (
    BUCKET,
    CRAWL_DIR,
)
# S3 filesystem handle, constructed with default arguments; it is passed as
# `filesystem=` to pq.ParquetDataset when the tables are loaded below.
fs = s3fs.S3FileSystem()

# HTTP Requests

In [10]:
# Read the whole http_requests table from S3 into a pandas DataFrame
table_name = 'http_requests'
reqs = (
    pq.ParquetDataset(BUCKET_URI % table_name, filesystem=fs, metadata_nthreads=4)
    .read_pandas()
    .to_pandas()
)

### Add some additional columns to help with analysis

In [11]:
# Derive the public suffix + 1 (PS+1) for several URL columns of `reqs`
reqs['url_ps1'] = reqs['url'].apply(du.get_ps_plus_1)
# The two columns below are guarded against None: None is passed through
# as-is instead of being handed to du.get_ps_plus_1.
reqs['top_ps1'] = reqs['top_level_url'].apply(
    lambda url: None if url is None else du.get_ps_plus_1(url)
)
reqs['loading_ps1'] = reqs['loading_href'].apply(
    lambda href: None if href is None else du.get_ps_plus_1(href)
)

In [12]:
# Pull the script URLs out of each request's call stack, then map every
# script URL of a stack to its PS+1 and keep the distinct values as a set.
reqs['stack_scripts'] = reqs['req_call_stack'].apply(
    au.get_script_urls_from_call_stack_as_set
)
reqs['stack_ps1s'] = reqs['stack_scripts'].apply(
    lambda scripts: {du.get_ps_plus_1(script) for script in scripts}
)

In [13]:
# Number of distinct top-level URLs in the requests table; used as the
# denominator for the per-domain prevalence figures below.
total_sites = reqs.top_level_url.nunique()

### How many sites is doubleclick.net loaded on?

In [14]:
# Fraction of top-level URLs with at least one request to doubleclick.net.
# float() keeps this a true division under Python 2 as well.
(
    reqs.loc[reqs['url_ps1'] == 'doubleclick.net', 'top_level_url'].nunique()
    / float(total_sites)
)

0.6348133848133848

In [15]:
# Fraction of top-level URLs with at least one request to google-analytics.com.
# float() keeps this a true division under Python 2 as well.
(
    reqs.loc[reqs['url_ps1'] == 'google-analytics.com', 'top_level_url'].nunique()
    / float(total_sites)
)

0.7061347061347061

### What domains does doubleclick.net load other resources from?

In [16]:
# Show one example of a non-empty call stack (the first one in the table)
reqs.loc[reqs.req_call_stack != '', 'req_call_stack'].iloc[0]

u'Y.X/b@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:42031;null\nY.X@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:42107;null\nMe@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:42381;null\nNe@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:43091;null\nnull@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:53260;null\nAc@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:15521;null\nEc@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:18425;null\nnull@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:51995;null\nnull@http://pagead2.googlesyndication.com/pagead/show_ads.js:1:2;null'

In [17]:
# PS+1 sets for every request that has a non-empty call stack
reqs.loc[reqs.req_call_stack.apply(len) > 0, 'stack_ps1s']

7                                {googlesyndication.com}
8                                {googlesyndication.com}
9                                {googlesyndication.com}
10                               {googlesyndication.com}
12                                      {applyloan.club}
13               {applyloan.club, googlesyndication.com}
14               {applyloan.club, googlesyndication.com}
15                                         {histats.com}
16                               {googlesyndication.com}
17               {applyloan.club, googlesyndication.com}
24                                     {doubleclick.net}
27                                     {doubleclick.net}
28                                     {doubleclick.net}
29                                     {doubleclick.net}
30                               {googlesyndication.com}
33                                         {adblade.com}
40                               {googlesyndication.com}
41                             

In [18]:
# For requests whose call stack includes a doubleclick.net script, count the
# requests per requested PS+1 domain, most frequent first.
(
    reqs.loc[reqs.stack_ps1s.apply(lambda ps1s: 'doubleclick.net' in ps1s)]
    .groupby('url_ps1')['top_level_url']
    .count()
    .sort_values(ascending=False)
)

url_ps1
googlesyndication.com            37987
doubleclick.net                  34733
google.com                        6953
ampproject.org                    5771
2mdn.net                          3154
moatads.com                       1367
doubleverify.com                  1046
adnxs.com                         1017
rubiconproject.com                1013
gstatic.com                        967
googletagservices.com              915
adsafeprotected.com                887
advertising.com                    519
openx.net                          452
casalemedia.com                    381
amazon-adsystem.com                335
quantserve.com                     294
criteo.com                         261
google-analytics.com               255
gumgum.com                         246
adblade.com                        221
pubmatic.com                       214
clarium.global.ssl.fastly.net      177
lijit.com                          174
myvisualiq.net                     169
serving-sys.com  

# Javascript Calls

In [5]:
# Read the whole javascript table from S3 into a pandas DataFrame
table_name = 'javascript'
js = (
    pq.ParquetDataset(BUCKET_URI % table_name, filesystem=fs, metadata_nthreads=4)
    .read_pandas()
    .to_pandas()
)

In [8]:
# Derive the public suffix + 1 (PS+1) for the URL columns of `js`.
# Each column is guarded against None: None is passed through as-is instead
# of being handed to du.get_ps_plus_1.
js['script_ps1'] = js['script_url'].apply(
    lambda url: None if url is None else du.get_ps_plus_1(url)
)
js['top_ps1'] = js['top_level_url'].apply(
    lambda url: None if url is None else du.get_ps_plus_1(url)
)
js['document_ps1'] = js['document_url'].apply(
    lambda url: None if url is None else du.get_ps_plus_1(url)
)

#### Canvas font fingerprinting

In [36]:
# Regex for a CSS `font` shorthand value. The adjacent raw-string pieces are
# concatenated by the parser into one pattern. Capture groups:
#   1 = font-style, 2 = font-variant, 3 = font-weight,
#   4 = font-size,  5 = line-height,  6 = font-family list
font_shorthand = re.compile(
    r"^\s*"
    # Lookaheads: find style / variant / weight among the leading keywords
    r"(?=(?:(?:[-a-z]+\s*){0,2}(italic|oblique))?)"
    r"(?=(?:(?:[-a-z]+\s*){0,2}(small-caps))?)"
    r"(?=(?:(?:[-a-z]+\s*){0,2}(bold(?:er)?|lighter|[1-9]00))?)"
    # Consume up to three leading keywords ("normal" or one captured above)
    r"(?:(?:normal|\1|\2|\3)\s*){0,3}"
    # font-size: keyword or number with unit
    r"((?:xx?-)?(?:small|large)|medium|smaller|larger|[.\d]+(?:\%|in|[cem]m|ex|p[ctx]))"
    # optional "/ line-height"
    r"(?:\s*\/\s*(normal|[.\d]+(?:\%|in|[cem]m|ex|p[ctx])))?"
    r"\s*"
    # font-family list (everything that remains, trimmed)
    r"([-_\{\}\(\)\&!\',\*\.\"\sa-zA-Z0-9]+?)"
    r"\s*$"
)

In [None]:
# Select every recorded call whose symbol starts with CanvasRenderingContext2D
js.loc[js.symbol.str.startswith('CanvasRenderingContext2D')]

In [25]:
# Count measureText calls made by scripts whose PS+1 differs from the
# top-level page's PS+1, grouped by script domain, most frequent first.
(
    js.loc[
        (js.symbol == 'CanvasRenderingContext2D.measureText')
        & (js.script_ps1 != js.top_ps1)
    ]
    .groupby('script_ps1')['top_ps1']
    .count()
    .sort_values(ascending=False)
)

script_ps1
mathtag.com                          55265
adbetnet.com                         32241
admicro.vn                            9000
createjs.com                          8827
tradingview.com                       4998
stripe.network                        1682
infernotions.com                      1000
nooncdn.com                            500
radial.com                             500
du3rt6yhb1dqh.cloudfront.net           500
stanza.co                              415
online-metrix.net                      348
c4assets.com                           200
paymentsmb.com                         174
werally.co                             174
signifyd.com                           174
cloudflare.com                         151
clinch.co                              149
codeproject.global.ssl.fastly.net      125
jrjimg.cn                              110
st8fm.com                               95
amap.com                                83
mfilterit.com                           76


In [32]:
# Distinct first arguments ("0" key of the JSON-encoded arguments) that
# admicro.vn scripts pass to measureText on pages of a different PS+1.
(
    js.loc[
        (js.symbol == 'CanvasRenderingContext2D.measureText')
        & (js.script_ps1 != js.top_ps1)
        & (js.script_ps1 == 'admicro.vn'),
        'arguments',
    ]
    .apply(lambda raw_args: json.loads(raw_args)["0"])
    .unique()
)

array([u'abcdefghijklmnopqrstuvxywzABCDEFGHIJKLMNOPQRSTUVXYWZ0123456789'],
      dtype=object)

In [38]:
def font_family_from_shorthand(font_value):
    """Return the font-family part of a CSS `font` shorthand string.

    Returns None when `font_value` does not match `font_shorthand`. Calling
    `.group(6)` directly on the result of `re.match` raises AttributeError for
    such values and would abort the whole `apply`.
    """
    match = font_shorthand.match(font_value)
    # Group 6 of `font_shorthand` is the trailing font-family list.
    return match.group(6) if match is not None else None


# Distinct font families in the `font` values recorded for admicro.vn scripts
# running on pages of a different PS+1.
js[
    (js.symbol == 'CanvasRenderingContext2D.font') &
    (js.script_ps1 != js.top_ps1) &
    (js.script_ps1 == 'admicro.vn')
].value.apply(font_family_from_shorthand).unique()

array([u'sans-serif', u'"book antiqua", sans-serif',
       u'"palatino linotype", sans-serif', u'"constantia", sans-serif',
       u'"ms mincho", sans-serif', u'"browallia new", sans-serif',
       u'".vntime", sans-serif', u'"segoe ui semibold", sans-serif',
       u'"segoe ui", sans-serif', u'"century gothic", sans-serif',
       u'"vni-times", sans-serif', u'"mt extra", sans-serif',
       u'"mt", sans-serif', u'"vni-maria", sans-serif',
       u'"viner hand itc", sans-serif', u'"mistral", sans-serif',
       u'"lucida sans", sans-serif', u'"arial unicode ms", sans-serif',
       u'"vni-linus", sans-serif', u'"vni-awchon", sans-serif',
       u'"rage italic", sans-serif', u'"rage", sans-serif',
       u'"papyrus", sans-serif', u'"french script mt", sans-serif',
       u'"vni-thufap2", sans-serif', u'"wst_swed", sans-serif',
       u'".tmc-ong do", sans-serif', u'"vntimes2", sans-serif',
       u'"calibri light", sans-serif', u'"calibri", sans-serif',
       u'"vni-shellal", sans-se