# Explore web crawl dataset

In [1]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
from __future__ import division
import feather
from collections import defaultdict
import sys

#%matplotlib inline
from IPython.display import display as IPdisplay

## Append the path so the mozillametricstools package can be found.
## This is included here as a temporary workaround. It has to run before
## the mozillametricstools imports below, which rely on the extended path.
## NOTE(review): the location is hard-coded; presumably it matches the
## cluster this notebook was run on -- confirm before running elsewhere.
sys.path.append("/home/hadoop/git")

## Helper package used throughout this notebook:
## https://github.com/saptarshiguha/mozillametricstools
import mozillametricstools.common.functions as mmt
import mozillametricstools.common.s3 as s3fun
import mozillametricstools.common.data as mdt
import mozillametricstools.users.dzeber.utils as mymmt
import mozillametricstools.users.dzeber.display as disp

## Shortcut to base S3 location: the "dzeber" path joined onto the
## metrics home path defined by the helper package.
S3_HOME = s3fun.join_s3_path(s3fun.S3_METRICS_HOME_PATH, "dzeber")

## Make display tweaks for pandas output.
## NOTE(review): presumably the source of the "Updated the display CSS" /
## "Patched the pandas module" messages printed by this cell -- confirm.
disp.prettify_pandas()

from moztelemetry.dataset import Dataset
from moztelemetry.spark import get_pings_properties
from pyspark.sql import Row
import pyspark.sql.functions as fun
from pyspark.sql.window import Window
from pyspark.sql.types import *

## If SparkSession `spark` is not pre-defined (as in IPython),
## create one from the SparkContext.
##
## Merely referencing `spark` raises NameError when the name is undefined;
## that is the only failure handled here. The fallback expects the
## SparkContext `sc` to be supplied by the environment.
try:
    spark.version
except NameError:
    from pyspark.sql import SparkSession
    spark = SparkSession(sc)

## Don't limit pandas display.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
## NOTE(review): -1 is used here to mean "no column width limit"; newer
## pandas releases expect None for this option -- confirm before upgrading.
pd.set_option("display.max_colwidth", -1)
## Wider plots
plt.rcParams['figure.figsize'] = (8, 5)

Updated the display CSS.
Patched the pandas module to display with row numbering.


In [2]:
## Print a "Last updated" message with the time at which this cell was run.
disp.time_msg("Last updated")

__Last updated: Sun Jan 21 15:07:48 2018__

In [4]:
## S3 location of the crawl data files. Keep the trailing slash: file
## names are appended to this by plain string concatenation.
DATA_S3_PATH = "s3://safe-ucosp-2017/"

-----

In [104]:
## Load a single sample file. `wholeTextFiles` yields one
## (file path, file contents) pair per file, which is the input
## that `parse_records` unpacks.
raw_data = sc.wholeTextFiles(DATA_S3_PATH + "1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json")

In [109]:
def parse_records(file_record):
    """ Parse the JSON objects in a data file into a list of dicts.

        Each file contains a JSON list of objects, although the
        list is delimited by "{}" rather than "[]". The contents are
        split into one string per JSON object, and each string is
        parsed individually.

        The records in the file are ordered. An index field is added
        to maintain this, and the input file name is retained.

        Args:
            file_record: a `(filename, data)` pair, where `data` is
                the full text contents of the file `filename`.

        Returns:
            A list of dicts, one per record in file order, each with
            `call_index` (0-based position in the file) and
            `file_name` added. If nothing remains once the outer
            delimiters are removed, an empty list is returned.

        Errors raised by `json.loads` for a malformed record are
        propagated to the caller.
    """
    ## Unpack in the body rather than in the signature: tuple
    ## parameters are Python 2-only syntax. The function is still
    ## called with a single (filename, data) argument.
    filename, data = file_record
    ## Split the file contents into a list of records,
    ## stripping away delimiting braces.
    ##
    ## Parsing the entire contents as a JSON list wasn't working.
    ## Split into separate strings, and parse JSON objects
    ## individually.
    ##
    ## Surrounding whitespace (eg. a trailing newline) is dropped
    ## first, so that the fixed-width slice removes exactly the
    ## opening "{{" and the closing "}}".
    data_stripped = data.strip()[2:-2]
    if not data_stripped:
        ## Nothing between the delimiters. Without this check, the
        ## empty string would be rebuilt as "{}" and returned as a
        ## single phantom record.
        return []
    ## NOTE: this assumes the sequence "},{" only occurs between
    ## records, never inside a string value.
    rows = data_stripped.split("},{")
    ## Add back delimiting braces to reconstruct valid JSON objects.
    ## Build a real list (not a lazy `map`), so that the records can be
    ## both annotated and returned on Python 2 and Python 3 alike.
    parsed_rows = [json.loads("{" + s + "}") for s in rows]
    ## Add indexing field.
    for i, r in enumerate(parsed_rows):
        r["call_index"] = i
        r["file_name"] = filename
    return parsed_rows


def dict_to_row(row_dict):
    """ Build a Spark Row from the dict for a single parsed record.

        `call_index` and `file_name` must be present in `row_dict`
        (a missing key raises KeyError). Every other field is
        optional and is set to None when absent. The `timestamp`
        field is read from the record's "time_stamp" key.
    """
    ## (Row field, record key) pairs, in Row field order.
    field_keys = (
        ("location", "location"),
        ("call_index", "call_index"),
        ("timestamp", "time_stamp"),
        ("script_url", "script_url"),
        ("symbol", "symbol"),
        ("operation", "operation"),
        ("value", "value"),
        ("func_name", "func_name"),
        ("arguments", "arguments"),
        ("script_line", "script_line"),
        ("script_col", "script_col"),
        ("script_loc_eval", "script_loc_eval"),
        ("in_iframe", "in_iframe"),
        ("call_stack", "call_stack"),
        ("file_name", "file_name"),
    )
    ## Fields that are looked up by indexing rather than `.get()`.
    required = ("call_index", "file_name")
    values = {}
    for field, key in field_keys:
        if field in required:
            values[field] = row_dict[key]
        else:
            values[field] = row_dict.get(key)
    return Row(**values)


## Spec for the crawl DataFrame schema: one (column name, type name)
## pair per column, in column order.
##
## Every column is read as a string, apart from the two listed in the
## override mapping. To avoid errors in reading data, `script_line` and
## `script_col` are kept as strings for now and converted later.
crawl_data_schema = list(
    (column, {"call_index": "integer", "in_iframe": "boolean"}.get(column, "string"))
    for column in (
        "location",
        "call_index",
        "timestamp",
        "script_url",
        "symbol",
        "operation",
        "value",
        "func_name",
        "arguments",
        "script_line",
        "script_col",
        "script_loc_eval",
        "in_iframe",
        "call_stack",
        "file_name",
    )
)

In [110]:
## Parse each file into a list of record dicts (one list per input file).
raw_data_parsed = raw_data.map(parse_records)

In [111]:
## Flatten the per-file lists into a single collection with one Row per record.
data_rows = raw_data_parsed.flatMap(lambda r: map(dict_to_row, r))

In [112]:
## Create the DataFrame with an explicit schema rather than an inferred one.
## NOTE(review): `build_schema_from_spec` presumably turns the
## (column name, type name) pairs into a Spark schema -- confirm in mymmt.
DF = spark.createDataFrame(data_rows, mymmt.build_schema_from_spec(crawl_data_schema))

In [113]:
## Display the parsed records. `toPandas()` collects the entire DataFrame
## onto the driver; here it only holds the single sample file loaded above.
DF.toPandas()

Unnamed: 0,location,call_index,timestamp,script_url,symbol,operation,value,func_name,arguments,script_line,script_col,script_loc_eval,in_iframe,call_stack,file_name
1,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,0,2017-12-16T09:04:15.484Z,http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js,window.navigator.userAgent,get,Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0,require<[152]</<,,1,159992,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
2,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,1,2017-12-16T09:04:15.501Z,http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js,window.document.cookie,get,,i,,1,349107,,False,i@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:349107\nrequire<[91]<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:349837\no@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:258\no/<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:317\nrequire<[85]<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:347290\no@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:258\no/<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:317\nrequire<[38]<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:70801\no@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:258\no/<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:317\nrequire<[40]<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:68368\no@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:258\no/<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:317\nrequire<[33]<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:21954\no@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:258\no/<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:317\nrequire<[55]<@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:1638\no@http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js:1:258\no@http://jsuol.com.br/c/bp/scripts/all_rooms-788dd7b428.js:1:122\no/<@http://jsuol.com.br/c/bp/scripts/all_rooms-788dd7b428.js:1:317\nrequire<[7]<@http://jsuol.com.br/c/bp/scripts/all_rooms-788dd7b428.js:1:604\no@http://jsuol.com.br/c/bp/scripts/all_rooms-788dd7b428.js:1:258\ne@http://jsuol.com.br/c/bp/scripts/all_rooms-788dd7b428.js:1:437\n@http://jsuol.com.br/c/bp/scripts/all_rooms-788dd7b428.js:1:18,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
3,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,2,2017-12-16T09:04:15.520Z,http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js,window.localStorage,get,{},require<[82]</r.get,,1,354262,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
4,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,3,2017-12-16T09:04:15.521Z,http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js,window.Storage.getItem,call,,require<[82]</r.get,"{""0"":""menu""}",1,354264,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
5,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,4,2017-12-16T09:04:15.526Z,http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js,window.localStorage,get,{},require<[82]</r.get,,1,354262,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
6,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,5,2017-12-16T09:04:15.527Z,http://jsuol.com.br/c/bp/scripts/common-ea8041229f.js,window.Storage.getItem,call,,require<[82]</r.get,"{""0"":""subscriberMenu""}",1,354264,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
7,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,6,2017-12-16T09:04:15.772Z,http://tm.jsuol.com.br/uoltm.js?id=mzjgr6,window.document.cookie,get,,UniqueId/$private.trackUniqueId,,1,132791,,False,UniqueId/$private.trackUniqueId@http://tm.jsuol.com.br/uoltm.js?id=mzjgr6:1:132791\nUniqueId/$public.__constructor<@http://tm.jsuol.com.br/uoltm.js?id=mzjgr6:1:133711\nUniqueId@http://tm.jsuol.com.br/uoltm.js?id=mzjgr6:1:133589\nTagManager@http://tm.jsuol.com.br/uoltm.js?id=mzjgr6:1:146487\n@http://tm.jsuol.com.br/uoltm.js?id=mzjgr6:1:149967\n@http://tm.jsuol.com.br/uoltm.js?id=mzjgr6:1:2,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
8,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,7,2017-12-16T09:04:15.772Z,http://tm.jsuol.com.br/uoltm.js?id=mzjgr6,window.localStorage,get,{},UniqueId/$private.trackUniqueId,,1,132875,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
9,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,8,2017-12-16T09:04:15.773Z,http://tm.jsuol.com.br/uoltm.js?id=mzjgr6,window.Storage.getItem,call,,UniqueId/$private.trackUniqueId,"{""0"":""tt_uid""}",1,132881,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
10,http://tc.batepapo.uol.com.br/todas_as_salas.html?theme=/Cidades-e-regi%C3%B5es,9,2017-12-16T09:04:15.775Z,http://tm.jsuol.com.br/uoltm.js?id=mzjgr6,window.localStorage,get,{},TrackManager/$private.isTrackEnabled,,1,1,,False,,s3://safe-ucosp-2017/1_00018c3bdd6a2811ce0f3157e4ef8e28c0b5d017e5ff5e28866df78c.json
