In [2]:
import json
import os
import re

import dask.dataframe as dd
import pandas as pd
from dask.distributed import Client, progress
from pandas.api.types import CategoricalDtype

# Root directory of the data; replace the placeholder before running.
DATA_DIR = 'YOUR DATA DIRECTORY HERE'
# Parquet location, built by plain string concatenation: no path separator is
# inserted between the two parts.
DATA_DIR_FULL = DATA_DIR + "PATH TO PARQUET FILES"
# NOTE(review): every dd.read_parquet call in this notebook reads DATA_FILE,
# which is not defined in any visible cell. Aliasing it to the parquet path is
# the presumed intent -- confirm.
DATA_FILE = DATA_DIR_FULL

In [3]:
# Create a dask.distributed client with default arguments. Being the last
# expression of the cell, its summary is what the cell displays.
Client()

0,1
Client  Scheduler: tcp://127.0.0.1:64926  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 32.00 GB


# Build candidates

In [3]:
# Distinct script URLs that called HTMLCanvasElement.toDataURL.
df_to_data_urls_df = dd.read_parquet(DATA_FILE, columns=['script_url', 'symbol'])
is_to_data_url = df_to_data_urls_df['symbol'] == 'HTMLCanvasElement.toDataURL'
df_to_data_urls_df = df_to_data_urls_df[is_to_data_url]
to_data_urls = df_to_data_urls_df['script_url'].unique().compute()
# Preview the first five URLs.
to_data_urls.iloc[:5]

0                  http://www.qvc.com/akam/10/2b30e194
1                      http://www.qvc.com/_bm/async.js
2                http://www.coupang.com/akam/10/4f2b47
3     https://www.coches.net/ztkieflaaxcvaiwh121837.js
4    https://a1.alicdn.com/creation/html/2016/06/20...
Name: script_url, dtype: object

In [4]:
def large_enough(row):
    """Return True when both requested dimensions are at least 16.

    Args:
        row: Object exposing ``argument_2`` and ``argument_3`` attributes,
            treated as the width and height respectively. Each must be
            accepted by ``float()``.

    Returns:
        bool: True only if width and height are both >= 16.
    """
    dimensions = (float(row.argument_2), float(row.argument_3))
    return all(side >= 16 for side in dimensions)

# Distinct script URLs that called CanvasRenderingContext2D.getImageData with
# a width and height accepted by large_enough.
df_get_image_data_df = dd.read_parquet(DATA_FILE, columns=['script_url', 'symbol', 'argument_2', 'argument_3'])
df_get_image_data_df = df_get_image_data_df[df_get_image_data_df.symbol == 'CanvasRenderingContext2D.getImageData']
# The row-wise apply yields one boolean per row, so describe the result to
# dask with a (name, dtype) tuple. A bare ('bool') is only the string 'bool'
# inside redundant parentheses, not a tuple.
is_large_enough = df_get_image_data_df.apply(large_enough, axis=1, meta=(None, 'bool'))
df_get_image_data_df = df_get_image_data_df[is_large_enough]
get_image_data_urls = df_get_image_data_df.script_url.unique().compute()
# Preview the first five URLs.
get_image_data_urls[0:5]

0    http://p6.drtst.com/templates/drtuber/js/drtub...
1      https://www.jigsawplanet.com/js/jp.js?v=b177a4b
2    http://p5.vptpsn.com/templates/frontend/viptub...
3    https://code.createjs.com/createjs-2015.11.26....
4           http://cdn.promodj.com/core/core.js?1ce4f0
Name: script_url, dtype: object

In [5]:
print('n to_data_urls', len(to_data_urls))
print('n get_image_data_urls', len(get_image_data_urls))
# Union of both URL sets. Series.append was removed in pandas 2.0, so the two
# Series are joined with pd.concat; .unique() then drops the duplicates while
# keeping first-seen order.
candidate_urls = pd.concat([to_data_urls, get_image_data_urls]).unique()
print('n candidate urls', len(candidate_urls))

n to_data_urls 26481
n get_image_data_urls 559
n candidate urls 27009


In [6]:
# Keep an untouched copy of the full candidate collection; later cells compare
# against it to work out which URLs have been removed.
all_candidate_urls = candidate_urls.copy()

# Start removing

## 1. Remove manually filtered

In [7]:
# Hand-maintained set of script URLs to exclude from the candidates
# (kept in alphabetical order).
false_positive_script_urls = {
    'http://ccmedia.fr/accueil.php',
    'http://cdn02.centraledachats.be/dist/js/holder.js',
    'http://rozup.ir/up/moisrex/themes/space_theme/script.js',
    'http://www.fivola.com/',
}

In [8]:
# Drop every URL listed in false_positive_script_urls, preserving order.
candidate_urls = list(
    filter(lambda url: url not in false_positive_script_urls, candidate_urls)
)
print('n candidate urls', len(candidate_urls))

n candidate urls 27009


In [9]:
# candidate_urls is a list, so hash it once: testing each of the original URLs
# against the list directly is quadratic in the number of URLs.
remaining_urls = set(candidate_urls)
# Number of distinct URLs removed so far.
print(len(set(all_candidate_urls) - remaining_urls))
# Removed URLs, in their original order.
disgarded_urls = [url for url in all_candidate_urls if url not in remaining_urls]
with open('not_canvas_fingerprinting_1.json', 'w') as f:
    json.dump(disgarded_urls, f)

0


## 2. Remove save, restore, addEventListener

In [10]:
# Symbols whose presence marks a script for removal in this step.
valid_call_symbols = [
    'CanvasRenderingContext2D.save',
    'CanvasRenderingContext2D.restore',
    'HTMLCanvasElement.addEventListener',
]
df_valid_calls_df = dd.read_parquet(DATA_FILE, columns=['script_url', 'symbol'])
makes_valid_call = df_valid_calls_df['symbol'].isin(valid_call_symbols)
df_valid_calls_df = df_valid_calls_df[makes_valid_call]
# Distinct URLs of the scripts that made any of those calls.
valid_calls_urls = df_valid_calls_df['script_url'].unique().values.compute()
valid_calls_urls[0:5]

array(['https://tpc.googlesyndication.com/sadbundle/$csp%3Der3%26dns%3Doff$/4134920871885725337/createjs-2015.11.26.min.js',
       'http://pics3.city-data.com/js/maps/CANVAS/boxMap.js',
       'https://code.createjs.com/createjs-2015.11.26.min.js',
       'http://media.ufc.tv/ufc_system_assets/ufc_201707101050/js/cufon-yui.js',
       'https://sale.yhd.com/act/J3oKuL4Izcsvpn.html'], dtype=object)

In [11]:
# `url in valid_calls_urls` would rescan the whole array for every candidate;
# hash the URLs once so each membership test is constant time.
valid_calls_url_set = set(valid_calls_urls)
candidate_urls = [url for url in candidate_urls if url not in valid_calls_url_set]
print('n candidate urls', len(candidate_urls))

n candidate urls 26877


In [12]:
# Number of distinct URLs removed from the original candidates so far.
len(set(all_candidate_urls).difference(candidate_urls))

132

In [13]:
# candidate_urls is a list, so hash it once: testing each of the original URLs
# against the list directly is quadratic in the number of URLs.
remaining_urls = set(candidate_urls)
# Number of distinct URLs removed so far.
print(len(set(all_candidate_urls) - remaining_urls))
# Removed URLs, in their original order.
disgarded_urls = [url for url in all_candidate_urls if url not in remaining_urls]
with open('not_canvas_fingerprinting_2.json', 'w') as f:
    json.dump(disgarded_urls, f)

132


## 3. Must have written 10 or more characters

In [14]:
## Code sourced from: github.com/sensor-js/OpenWPM-mobile/blob/mobile_sensors/feature_extraction/extract_features.py

def text_length(arg_0):
    """Return the number of ASCII characters in ``arg_0``.

    The string is encoded as ASCII with unencodable characters dropped, so
    non-ASCII characters do not contribute to the count.
    """
    ascii_only = arg_0.encode('ascii', errors='ignore')
    return len(ascii_only)

In [15]:
# fillText / strokeText calls made by the remaining candidate scripts.
df_write = dd.read_parquet(DATA_FILE, columns=['script_url', 'symbol', 'argument_0'])
df_write = df_write[df_write.script_url.isin(candidate_urls)]
df_write = df_write[df_write.symbol.isin(['CanvasRenderingContext2D.fillText', 'CanvasRenderingContext2D.strokeText'])]
# Describe the apply result to dask with a (name, dtype) tuple. A bare ('int')
# is only the string 'int' inside redundant parentheses, not a tuple.
df_write['len_arg'] = df_write.argument_0.apply(text_length, meta=('len_arg', 'int'))
# Keep only the calls whose text has at least 10 ASCII characters.
df_write = df_write[df_write.len_arg >= 10]
# From here on df_write is an in-memory pandas DataFrame.
df_write = df_write.compute()
df_write.head()

Unnamed: 0,script_url,symbol,argument_0,len_arg
944,http://www.qvc.com/akam/10/2b30e194,CanvasRenderingContext2D.fillText,Soft Ruddy Foothold 2,21
951,http://www.qvc.com/akam/10/2b30e194,CanvasRenderingContext2D.fillText,!H71JCaj)]# 1@#,15
1007,http://www.qvc.com/_bm/async.js,CanvasRenderingContext2D.fillText,"<@nv45. F1n63r,Pr1n71n6!",24
2824,http://www.coupang.com/akam/10/4f2b47,CanvasRenderingContext2D.fillText,Soft Ruddy Foothold 2,21
2831,http://www.coupang.com/akam/10/4f2b47,CanvasRenderingContext2D.fillText,!H71JCaj)]# 1@#,15


In [16]:
# Distinct script URLs left in df_write after the text-length filter.
too_many_write_urls = df_write['script_url'].unique()
n_long_write_urls = len(too_many_write_urls)
print('n "3 too long writes" urls', n_long_write_urls)

n "3 too long writes" urls 8514


## Apply 3

In [17]:
# The candidates are now exactly the URLs that passed the text-length filter.
text_filter = {url for url in too_many_write_urls}
candidate_urls = [*text_filter]

In [18]:
# candidate_urls is a list, so hash it once: testing each of the original URLs
# against the list directly is quadratic in the number of URLs.
remaining_urls = set(candidate_urls)
# Number of distinct URLs removed so far.
print(len(set(all_candidate_urls) - remaining_urls))
# Removed URLs, in their original order.
disgarded_urls = [url for url in all_candidate_urls if url not in remaining_urls]
with open('not_canvas_fingerprinting_3.json', 'w') as f:
    json.dump(disgarded_urls, f)

18495


In [19]:
# Save the final candidate URLs as a JSON array.
with open('canvas_fingerprinting.json', 'w') as f:
    json.dump(candidate_urls, f)

In [20]:
# Save the discarded URLs as a JSON array.
with open('not_canvas_fingerprinting.json', 'w') as f:
    json.dump(disgarded_urls, f)

## Find Locations

In [32]:
# Reload the saved URL list from disk.
with open('canvas_fingerprinting.json', 'r') as f:
    canvas_fp_urls = json.loads(f.read())

# Print the loaded count next to the expected figure for a visual check.
n_loaded_urls = len(canvas_fp_urls)
print(n_loaded_urls, '== 8514')

8514 == 8514


In [7]:
# Load only the two columns needed to relate scripts to locations.
location_columns = ['script_url', 'location']
df = dd.read_parquet(DATA_FILE, columns=location_columns)

In [34]:
# Rows whose script is one of the saved URLs.
from_fp_script = df['script_url'].isin(canvas_fp_urls)
df_locs = df[from_fp_script]
# Distinct locations on those rows, persisted so the progress bar can track it.
locs = df_locs['location'].unique().persist()
progress(locs, notebook=False)

[########################################] | 100% Completed |  3min  0.8s

In [35]:
# Report how many distinct locations were found.
n_locations = len(locs)
print('# of locations that call canvas fingerprinting scripts:', n_locations)

# of locations that call canvas fingerprinting scripts: 38419
