# Initial Read/JSON Conversions

In [92]:
import pandas as pd
from pandas.io.json import loads, json_normalize
    
df = pd.read_csv('./all/train.csv', nrows=100000, dtype={'fullVisitorId': str})

* Function to parse json columns

In [93]:
# PREPROCESS STEP 1
def parse_json_cols(df):
    json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

    for col in json_cols:
        df_json = json_normalize(df[col].apply(loads), sep='/')
        df_json.columns = [(col + '/' + c) for c in df_json.columns]
        df.drop(columns=col, inplace=True)
        df = pd.concat([df, df_json], axis=1)
    
    return df

In [94]:
df = parse_json_cols(df)

totals_cols = list(filter(lambda c : c.startswith('totals'), df.columns))
geoNetwork_cols = list(filter(lambda c : c.startswith('geoNetwork'), df.columns))
device_cols = list(filter(lambda c : c.startswith('device'), df.columns))
trafficSource_cols = list(filter(lambda c : c.startswith('trafficSource'), df.columns))

# Examine JSON Data

### totals/
* These fields should be treated as numeric values, not objects
* Cast them to fix this issue

In [95]:
# PREPROCESS STEP 2

def cast_totals(df):
    totals_cols = list(filter(lambda c : c.startswith('totals'), df.columns))
    for i in totals_cols:
        df[i] = df[i].astype(float)
    return df

In [96]:
df = cast_totals(df)

In [97]:
df[totals_cols].describe()

Unnamed: 0,totals/bounces,totals/hits,totals/newVisits,totals/pageviews,totals/transactionRevenue,totals/visits
count,48916.0,100000.0,77263.0,99993.0,1399.0,100000.0
mean,1.0,4.62235,1.0,3.868151,130050700.0,1.0
std,0.0,9.620067,0.0,6.981922,303636600.0,0.0
min,1.0,1.0,1.0,1.0,1200000.0,1.0
25%,1.0,1.0,1.0,1.0,24990000.0,1.0
50%,1.0,2.0,1.0,2.0,50000000.0,1.0
75%,1.0,4.0,1.0,4.0,109625000.0,1.0
max,1.0,500.0,1.0,400.0,5498000000.0,1.0


---
* The visits column only has 1 unique value

In [98]:
# Preprocess step 3
def drop_visits(df):
    df.drop(columns=['totals/visits'], inplace=True)
    return df

In [99]:
df = drop_visits(df)

totals_cols = list(filter(lambda c : c.startswith('totals'), df.columns))
df[totals_cols].describe()

Unnamed: 0,totals/bounces,totals/hits,totals/newVisits,totals/pageviews,totals/transactionRevenue
count,48916.0,100000.0,77263.0,99993.0,1399.0
mean,1.0,4.62235,1.0,3.868151,130050700.0
std,0.0,9.620067,0.0,6.981922,303636600.0
min,1.0,1.0,1.0,1.0,1200000.0
25%,1.0,1.0,1.0,1.0,24990000.0
50%,1.0,2.0,1.0,2.0,50000000.0
75%,1.0,4.0,1.0,4.0,109625000.0
max,1.0,500.0,1.0,400.0,5498000000.0


---
### geoNetwork/

In [100]:
df[geoNetwork_cols].describe()

Unnamed: 0,geoNetwork/city,geoNetwork/cityId,geoNetwork/continent,geoNetwork/country,geoNetwork/latitude,geoNetwork/longitude,geoNetwork/metro,geoNetwork/networkDomain,geoNetwork/networkLocation,geoNetwork/region,geoNetwork/subContinent
count,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000
unique,290,1,6,183,1,1,53,7228,1,218,23
top,not available in demo dataset,not available in demo dataset,Americas,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,Northern America
freq,55211,100000,50435,40939,100000,100000,55211,27820,100000,55211,43874


* most columns have the majority (or all of their values) as "not available in demo dataset" or "(not set)"
        * city, cityId, latitude, longitude, metro, networkLocation, region, networkDomain
---
**Dropping listed columns:**

In [101]:
# Preprocess step 4
def drop_geo_cols(df):
    df.drop(columns=['geoNetwork/city', 'geoNetwork/cityId', 'geoNetwork/latitude', 'geoNetwork/longitude', 
                 'geoNetwork/metro', 'geoNetwork/networkLocation','geoNetwork/region', 'geoNetwork/networkDomain'], 
                 inplace=True)
    return df

In [102]:
df = drop_geo_cols(df)

geoNetwork_cols = list(filter(lambda c : c.startswith('geoNetwork'), df.columns))
df[geoNetwork_cols].describe()

Unnamed: 0,geoNetwork/continent,geoNetwork/country,geoNetwork/subContinent
count,100000,100000,100000
unique,6,183,23
top,Americas,United States,Northern America
freq,50435,40939,43874


---
### device/
* Again, mosts columns only have one value: "not available in demo dataset"
        * browserSize, browserVersion, flashVersion, language, mobileDeviceBranding, mobileDeviceInfo, mobileDeviceMarketingName, mobileDeviceModel, mobileInputSelector, operatingSystemVersion, screenColors, screenResolution

In [103]:
df[device_cols].describe()

Unnamed: 0,device/browser,device/browserSize,device/browserVersion,device/deviceCategory,device/flashVersion,device/isMobile,device/language,device/mobileDeviceBranding,device/mobileDeviceInfo,device/mobileDeviceMarketingName,device/mobileDeviceModel,device/mobileInputSelector,device/operatingSystem,device/operatingSystemVersion,device/screenColors,device/screenResolution
count,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000
unique,29,1,1,3,1,2,1,1,1,1,1,1,16,1,1,1
top,Chrome,not available in demo dataset,not available in demo dataset,desktop,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,not available in demo dataset,not available in demo dataset
freq,68984,100000,100000,74135,100000,74141,100000,100000,100000,100000,100000,100000,38001,100000,100000,100000


---
**Dropping listed columns:**

In [105]:
# Preprocess step 5
def drop_device_cols(df):
    df.drop(columns=['device/browserSize', 'device/browserVersion', 'device/flashVersion', 'device/language',
                 'device/mobileDeviceBranding', 'device/mobileDeviceInfo', 'device/mobileDeviceMarketingName', 'device/mobileDeviceModel',
                 'device/mobileInputSelector', 'device/operatingSystemVersion', 'device/screenColors', 'device/screenResolution'],
                 inplace=True)
    return df

In [106]:
df = drop_device_cols(df)

device_cols = list(filter(lambda c : c.startswith('device'), df.columns))
df[device_cols].describe()

Unnamed: 0,device/browser,device/deviceCategory,device/isMobile,device/operatingSystem
count,100000,100000,100000,100000
unique,29,3,2,16
top,Chrome,desktop,False,Windows
freq,68984,74135,74141,38001


---
### trafficSource/
* A few columns here have majority "not available in demo dataset", "(not set)", "(not provided)" values:
        * adwordsClickInfo/criteriaParameters, campaign, keyword
* Column "campaignCode" has only one entry. "gclId" has only 70.

In [107]:
df[trafficSource_cols].describe()

Unnamed: 0,trafficSource/adContent,trafficSource/adwordsClickInfo/adNetworkType,trafficSource/adwordsClickInfo/criteriaParameters,trafficSource/adwordsClickInfo/gclId,trafficSource/adwordsClickInfo/isVideoAd,trafficSource/adwordsClickInfo/page,trafficSource/adwordsClickInfo/slot,trafficSource/campaign,trafficSource/campaignCode,trafficSource/isTrueDirect,trafficSource/keyword,trafficSource/medium,trafficSource/referralPath,trafficSource/source
count,1325,2574,100000,2625,2574,2574,2574,100000,1,30454,44218,100000,36473,100000
unique,31,2,1,2389,1,4,2,8,1,1,643,7,527,161
top,Google Merchandise Collection,Google Search,not available in demo dataset,Cj0KEQjwmIrJBRCRmJ_x7KDo-9oBEiQAuUPKMufMpuG3Zd...,False,1,Top,(not set),11251kjhkvahf,True,(not provided),organic,/,google
freq,662,2573,100000,10,2574,2566,2527,95442,1,30454,40273,42019,8731,44448


---
**Dropping listed columns:**

In [108]:
# Preprocess step 6
def drop_trafficSource_cols(df):
    df.drop(columns=['trafficSource/adwordsClickInfo/criteriaParameters', 'trafficSource/campaign', 'trafficSource/keyword',
                 'trafficSource/campaignCode', 'trafficSource/adwordsClickInfo/gclId'], inplace=True)
    return df

In [109]:
df = drop_trafficSource_cols(df)

trafficSource_cols = list(filter(lambda c : c.startswith('trafficSource'), df.columns))
df[trafficSource_cols].describe()

Unnamed: 0,trafficSource/adContent,trafficSource/adwordsClickInfo/adNetworkType,trafficSource/adwordsClickInfo/isVideoAd,trafficSource/adwordsClickInfo/page,trafficSource/adwordsClickInfo/slot,trafficSource/isTrueDirect,trafficSource/medium,trafficSource/referralPath,trafficSource/source
count,1325,2574,2574,2574,2574,30454,100000,36473,100000
unique,31,2,1,4,2,1,7,527,161
top,Google Merchandise Collection,Google Search,False,1,Top,True,organic,/,google
freq,662,2573,2574,2566,2527,30454,42019,8731,44448


# Examining Remaining Columns (Non JSON)
* Dates are not parsed
* visitStartTime is in POSIX time
* Looking only at qualitative data shows socialEngagementType only has 1 unique value (therefore contributes no information)

In [110]:
others = [c for c in df.columns if c not in (totals_cols + geoNetwork_cols + device_cols + trafficSource_cols)]
df[others].head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600


In [111]:
df[['channelGrouping', 'socialEngagementType']].describe()

Unnamed: 0,channelGrouping,socialEngagementType
count,100000,100000
unique,8,1
top,Organic Search,Not Socially Engaged
freq,42019,100000


In [15]:
from datetime import datetime

df['date'] = df['date'].apply(lambda date: datetime.strptime(date, '%Y%m%d'))
df['date'] = df['date'].apply(lambda date: datetime.strptime(date, '%Y%m%d'))

df['visitStartTime'] = pd.to_datetime(train_data["visitStartTime"],unit='s')

df.drop(columns='socialEngagementType', inplace=True)


others = [c for c in df.columns if c not in (totals_cols + geoNetwork_cols + device_cols + trafficSource_cols)]
df[others].head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,1472830385,1,1472830385
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,1472880147,1,1472880147
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,1472865386,1,1472865386
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,1472881213,1,1472881213
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,1472822600,2,1472822600


---
# Back to the full dataset

In [16]:
tdf = pd.read_csv('./all/test.csv', nrows=100000, dtype={'fullVisitorId': str})

Unnamed: 0,channelGrouping,date,device,fullVisitorId,geoNetwork,sessionId,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6167871330617112363,"{""continent"": ""Asia"", ""subContinent"": ""Southea...",6167871330617112363_1508151024,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""4""}","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508151024,2,1508151024
1,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",0643697640977915618,"{""continent"": ""Europe"", ""subContinent"": ""South...",0643697640977915618_1508175522,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""5"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508175522,1,1508175522
2,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6059383810968229466,"{""continent"": ""Europe"", ""subContinent"": ""Weste...",6059383810968229466_1508143220,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""7"", ""pageviews"": ""7"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508143220,1,1508143220
3,Organic Search,20171016,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",2376720078563423631,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",2376720078563423631_1508193530,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""8"", ""pageviews"": ""4"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508193530,1,1508193530
4,Organic Search,20171016,"{""browser"": ""Safari"", ""browserVersion"": ""not a...",2314544520795440038,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",2314544520795440038_1508217442,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""9"", ""pageviews"": ""4"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508217442,1,1508217442
5,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",4133039884103392367,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",4133039884103392367_1508186358,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""11"", ""pageviews"": ""5""...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508186358,1,1508186358
6,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",4320478850207397557,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",4320478850207397557_1508203650,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""37"", ""pageviews"": ""15...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508203650,1,1508203650
7,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",5876438247590157131,"{""continent"": ""Americas"", ""subContinent"": ""Nor...",5876438247590157131_1508184397,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""52"", ""pageviews"": ""22...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508184397,1,1508184397
8,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",0514591268737702944,"{""continent"": ""Europe"", ""subContinent"": ""South...",0514591268737702944_1508189652,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""5""}","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508189652,6,1508189652
9,Organic Search,20171016,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",6430567031531677212,"{""continent"": ""Europe"", ""subContinent"": ""South...",6430567031531677212_1508175502,Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""6"", ""pageviews"": ""6"",...","{""campaign"": ""(not set)"", ""source"": ""google"", ...",1508175502,1,1508175502
