In [1]:
import json
import os, os.path
import pandas as pd
from pathlib import Path
from configparser import ConfigParser

# Input directories and config parameters

In [2]:
# Test fixture files
# Input and output CSVs of 'test_generate_url_list_spark.py', used to check that behaviour is consistent
TEST_CSV_INPUT = 'test_urls.csv'
TEST_CSV_PARSED = 'parsed_test_urls.csv'

# Full dataset location
# FULL_PARQUET (joined onto WORKING_DIR) is the output of 'generate_url_list_spark.py' operating on the full dataset
WORKING_DIR = '/mnt/Data/UCOSP_DATA/resources'
FULL_PARQUET = 'full_url_list_parsed/'

# Display options: never truncate column contents, so long URLs are shown in full.
# `None` means "no limit". The older `-1` spelling was deprecated in pandas 1.0 and is
# rejected by pandas 2.x; fall back to it only for old pandas versions that refuse `None`.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Test Dataset (in local directory)
### Loading data

Load the data generated by `test_generate_url_list_spark.py`.

In [3]:
# Read the raw test URLs and the parsed results produced from them.
# The parsed file's first CSV column is used as the DataFrame index.
tests_input = pd.read_csv(filepath_or_buffer=TEST_CSV_INPUT)
tests_parsed = pd.read_csv(filepath_or_buffer=TEST_CSV_PARSED, index_col=0)

### Explore data

The goal of this step is to extract the `.js` file URL from each script URL. There is a lot of junk to strip off the input URLs (for example query strings and fragments), and there are many duplicates. This step reduced the original dataset (millions of URLs) to about 180k unique URLs.

In [4]:
# Preview the first 10 rows of the raw test input
tests_input.head(10)

Unnamed: 0,script_url
0,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com
1,https://ajax.googleapis.com/ajax/libs/webfont/1.6.26/webfont.js
2,http://cpro.baidustatic.com/cpro/ui/noexpire/js/4.0.1/adClosefeedbackUpgrade.min.js
3,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=fe1ad16a94c816&origin=http%3A%2F%2Farabi21.com
4,https://static.dynamicyield.com/scripts/12290/dy-coll-min.js
5,https://www.syracuse.edu/about/
6,https://www.googletagmanager.com/gtm.js?id=GTM-5FC97GL
7,https://www.syracuse.edu/wp-includes/js/wp-emoji-release.min.js?ver=4.9.1
8,https://www.google-analytics.com/analytics.js
9,https://code.jquery.com/jquery-migrate-1.4.1.min.js


In [5]:
# Preview the first 10 rows of the parsed test output
tests_parsed.head(10)

Unnamed: 0,script_url,parsed_url,filename
0,https://securepubads.g.doubleclick.net/gpt/pubads_impl_170.js,https://securepubads.g.doubleclick.net/gpt/pubads_impl_170.js,securepubads-g-doubleclick-net-gpt-pubads-impl-170.txt
1,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js?version=42#channel=f30ef17b61f384&origin=http%3A%2F%2Fwww.ubitennis.com,https://staticxx.facebook.com/connect/xd_arbiter/r/lY4eZXm_YWu.js,staticxx-facebook-com-connect-xd-arbiter-r-ly4ezxm-ywu.txt
2,https://www.syracuse.edu/wp-includes/js/wp-embed.min.js?ver=4.9.1,https://www.syracuse.edu/wp-includes/js/wp-embed.min.js,www-syracuse-edu-wp-includes-js-wp-embed-min.txt
3,https://syr-piwik-prod.syr.edu/piwik.js,https://syr-piwik-prod.syr.edu/piwik.js,syr-piwik-prod-syr-edu-piwik.txt
4,https://code.jquery.com/jquery-migrate-1.4.1.min.js,https://code.jquery.com/jquery-migrate-1.4.1.min.js,code-jquery-com-jquery-migrate-1-4-1-min.txt
5,http://b.scorecardresearch.com/beacon.js,http://b.scorecardresearch.com/beacon.js,b-scorecardresearch-com-beacon.txt
6,http://media.ufc.tv/ufc_system_assets/ufc_201707101050/js/jwplayer7/jwplayer.js,http://media.ufc.tv/ufc_system_assets/ufc_201707101050/js/jwplayer7/jwplayer.js,media-ufc-tv-ufc-system-assets-ufc-201707101050-js-jwplayer7-jwplayer.txt
7,https://apis.google.com/js/plusone.js,https://apis.google.com/js/plusone.js,apis-google-com-js-plusone.txt
8,http://cdn.optimizely.com/js/549871026.js,http://cdn.optimizely.com/js/549871026.js,cdn-optimizely-com-js-549871026.txt
9,https://www.syracuse.edu/wp-content/themes/g6-carbon/js/carbon-all.js?ver=6.3.6,https://www.syracuse.edu/wp-content/themes/g6-carbon/js/carbon-all.js,www-syracuse-edu-wp-content-themes-g6-carbon-js-carbon-all.txt


# Full Dataset to explore

Load the data generated by `generate_url_list_spark.py` after running it over the full dataset. The `parsed_url` column holds the URLs that will be requested in the next step of the pipeline, and the response to each `GET` request will be written to the associated `filename`.

### Loading data

In [9]:
# Locate the Parquet output directory of the Spark job.
url_list = os.path.join(WORKING_DIR, FULL_PARQUET)
parquet_dir = Path(url_list)

# Path.glob yields files in no guaranteed order; sort them so the row order of the
# combined frame is reproducible across runs and machines.
parquet_files = sorted(parquet_dir.glob('*.parquet'))
if not parquet_files:
    # Without this guard pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError('No .parquet files found in {}'.format(parquet_dir))

# Read every part file and stack them into a single DataFrame.
# The index of each part is kept as-is, so index labels may repeat across parts.
input_data = pd.concat(
    [pd.read_parquet(parquet_file) for parquet_file in parquet_files]
)

### Exploring data

In [10]:
# Check size of output: (number of rows, number of columns)
input_data.shape

(183434, 3)

In [11]:
# Check what the output looks like (first 10 rows)
input_data.head(10)

Unnamed: 0,script_url,parsed_url,filename
0,http://115.com/static/js/jquery.js?v=1513353946,http://115.com/static/js/jquery.js,115-com-static-js-jquery-js.txt
1,http://1stream.co.za/wp-content/plugins/revslider/public/assets/js/jquery.themepunch.tools.min.js?ver=5.1.6,http://1stream.co.za/wp-content/plugins/revslider/public/assets/js/jquery.themepunch.tools.min.js,1stream-co-za-wp-content-plugins-revslider-public-assets-js-jquery-themepunch-tools-min-js.txt
2,http://ad-cdn.technoratimedia.com/00/58/88/uat_78858.js?ad_size=728x90,http://ad-cdn.technoratimedia.com/00/58/88/uat_78858.js,ad-cdn-technoratimedia-com-00-58-88-uat-78858-js.txt
3,http://ads.rubiconproject.com/ad/10822.js,http://ads.rubiconproject.com/ad/10822.js,ads-rubiconproject-com-ad-10822-js.txt
4,http://agecom.ufsc.br/wp-content/mu-plugins/videojs/video.min.js,http://agecom.ufsc.br/wp-content/mu-plugins/videojs/video.min.js,agecom-ufsc-br-wp-content-mu-plugins-videojs-video-min-js.txt
5,http://airindia.in/js/general.js,http://airindia.in/js/general.js,airindia-in-js-general-js.txt
6,http://akfs.nspmotion.com/files/htmlcreative/4-6-7-22121.js,http://akfs.nspmotion.com/files/htmlcreative/4-6-7-22121.js,akfs-nspmotion-com-files-htmlcreative-4-6-7-22121-js.txt
7,http://anthemcorporateresponsibility.com/cr/foundation/js/require.js,http://anthemcorporateresponsibility.com/cr/foundation/js/require.js,anthemcorporateresponsibility-com-cr-foundation-js-require-js.txt
8,http://app.medyanetads.com/dfp.js,http://app.medyanetads.com/dfp.js,app-medyanetads-com-dfp-js.txt
9,http://asos-fr.custhelp.com/euf/core/3.2.6/js/6.232/min/RightNow.js,http://asos-fr.custhelp.com/euf/core/3.2.6/js/6.232/min/RightNow.js,asos-fr-custhelp-com-euf-core-3-2-6-js-6-232-min-rightnow-js.txt
