In [1]:
import dask.dataframe as dd
import json

from dask.distributed import Client, progress

DATA_DIR = 'YOUR DATA DIRECTORY HERE'
DATA_DIR_FULL = DATA_DIR + "PATH TO PARQUET FILES"
Client()

0,1
Client  Scheduler: tcp://127.0.0.1:56640  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 12.00 GB


## Setup

In [6]:
df = dd.read_parquet(DATA_DIR_FULL, columns=['script_url', 'symbol'])

## Build Candidate URLs for `OfflineAudioContext.createOscillator`

In [7]:
create_oscillator_df = df[df.symbol == 'OfflineAudioContext.createOscillator']
create_oscillator_urls = create_oscillator_df.script_url.unique().persist()
progress(create_oscillator_urls, notebook=False)

[########################################] | 100% Completed | 51.8s

In [8]:
create_oscillator_urls = create_oscillator_urls.compute()
create_oscillator_urls[0:5]

0      https://www.alaskaair.com/px/client/main.min.js
1    https://client.perimeterx.net/PXQ76Auu14/main....
2    https://client.perimeterx.net/PXM636Svr4/main....
3    http://client.perimeterx.net/PX0F3091f3/main.m...
4             https://media1.admicro.vn/core/fipmin.js
Name: script_url, dtype: object

## Build Candidate URLs for `OfflineAudioContext.createDynamicsCompressor`

In [9]:
create_dynamics_df = df[df.symbol == 'OfflineAudioContext.createDynamicsCompressor']
create_dynamics_urls = create_dynamics_df.script_url.unique().persist()
progress(create_dynamics_urls, notebook=False)

[########################################] | 100% Completed | 47.3s

In [10]:
create_dynamics_urls = create_dynamics_urls.compute()
create_dynamics_urls[0:5]

0      https://www.alaskaair.com/px/client/main.min.js
1    https://client.perimeterx.net/PXQ76Auu14/main....
2    https://client.perimeterx.net/PXM636Svr4/main....
3    http://client.perimeterx.net/PX0F3091f3/main.m...
4             https://media1.admicro.vn/core/fipmin.js
Name: script_url, dtype: object

## Build Candidate URLs for `OfflineAudioContext.destination`

In [11]:
destination_df = df[df.symbol == 'OfflineAudioContext.destination']
destination_urls = destination_df.script_url.unique().persist()
progress(destination_urls, notebook=False)

[########################################] | 100% Completed | 39.6s

In [12]:
destination_urls = destination_urls.compute()
destination_urls[0:5]

0      https://www.alaskaair.com/px/client/main.min.js
1    https://client.perimeterx.net/PXQ76Auu14/main....
2    https://client.perimeterx.net/PXM636Svr4/main....
3    http://client.perimeterx.net/PX0F3091f3/main.m...
4             https://media1.admicro.vn/core/fipmin.js
Name: script_url, dtype: object

## Build Candidate URLs for `OfflineAudioContext.startRendering`

In [13]:
start_rendering_df = df[df.symbol == 'OfflineAudioContext.startRendering']
start_rendering_urls = start_rendering_df.script_url.unique().persist()
progress(start_rendering_urls, notebook=False)

[########################################] | 100% Completed | 40.3s

In [14]:
start_rendering_urls = start_rendering_urls.compute()
start_rendering_urls[0:5]

0      https://www.alaskaair.com/px/client/main.min.js
1    https://client.perimeterx.net/PXQ76Auu14/main....
2    https://client.perimeterx.net/PXM636Svr4/main....
3    http://client.perimeterx.net/PX0F3091f3/main.m...
4             https://media1.admicro.vn/core/fipmin.js
Name: script_url, dtype: object

## Build Candidate URLs for `OfflineAudioContext.oncomplete`

In [15]:
on_complete_df = df[df.symbol == 'OfflineAudioContext.createOscillator']
on_complete_urls = on_complete_df.script_url.unique().persist()
progress(on_complete_urls, notebook=False)

[########################################] | 100% Completed | 44.8s

In [16]:
on_complete_urls = on_complete_urls.compute()
on_complete_urls[0:5]

0      https://www.alaskaair.com/px/client/main.min.js
1    https://client.perimeterx.net/PXQ76Auu14/main....
2    https://client.perimeterx.net/PXM636Svr4/main....
3    http://client.perimeterx.net/PX0F3091f3/main.m...
4             https://media1.admicro.vn/core/fipmin.js
Name: script_url, dtype: object

## Scripts must call all 5 functions: ["OfflineAudioContext.createOscillator", "OfflineAudioContext.createDynamicsCompressor", "OfflineAudioContext.destination", "OfflineAudioContext.startRendering", "OfflineAudioContext.oncomplete"]

In [17]:
audio_fp_urls = set(create_oscillator_urls) & \
    set(create_dynamics_urls) & \
    set(destination_urls) & \
    set(start_rendering_urls) & \
    set(on_complete_urls)
print('# of script_urls using audio fingerprinting:', len(audio_fp_urls))

# of script_urls using audio fingerprinting: 170


In [18]:
all_candidate_urls = set(create_oscillator_urls) | \
    set(create_dynamics_urls) | \
    set(destination_urls) | \
    set(start_rendering_urls) | \
    set(on_complete_urls)
not_audio_fp_urls = all_candidate_urls - audio_fp_urls
print('# of script_urls that did not call all 5 symbols:', len(not_audio_fp_urls))

# of script_urls that did not call all 5 symbols: 0


## Save URLs

In [19]:
with open('audio_fingerprinting.json', 'w') as f:
    f.write(json.dumps(list(audio_fp_urls)))

In [20]:
with open('not_audio_fingerprinting.json', 'w') as f:
    f.write(json.dumps(list(not_audio_fp_urls)))

## Find Locations

In [2]:
with open('audio_fingerprinting.json', 'r') as f:
    audio_fp_urls = json.load(f)

In [4]:
df = dd.read_parquet(DATA_DIR_FULL, columns=['script_url', 'location'])

In [6]:
df_locs = df[df.script_url.isin(audio_fp_urls)]
locs = df_locs.location.unique().persist()
progress(locs, notebook=False)

[########################################] | 100% Completed | 44.7s

In [8]:
print('# of locations that call audio fingerprinting scripts:', len(locs))

# of locations that call audio fingerprinting scripts: 2006
