# Set Up

In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
import json
from pandas.io.json import json_normalize
from datetime import datetime
from sklearn import preprocessing


In [3]:
# load data and convert columns w JSON data into normal columns
def load_df(csv_path, nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)

    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded. Shape: {df.shape}")
    return df

In [4]:
# main code

# load data
data = load_df('train.csv')

print('Loaded :)')

Loaded. Shape: (903653, 25)
Loaded :)


# Checking out Data

In [5]:
data.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,socialEngagementType,visitId,visitNumber,visitStartTime,device.device,geoNetwork.geoNetwork,...,trafficSource.adwordsClickInfo.isVideoAd,trafficSource.adwordsClickInfo.page,trafficSource.adwordsClickInfo.slot,trafficSource.campaign,trafficSource.campaignCode,trafficSource.isTrueDirect,trafficSource.keyword,trafficSource.medium,trafficSource.referralPath,trafficSource.source
0,Organic Search,20160902,1131660440785968503,1131660440785968503_1472830385,Not Socially Engaged,1472830385,1,1472830385,"{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Asia', 'subContinent': 'Western...",...,,,,(not set),,,(not provided),organic,,google
1,Organic Search,20160902,377306020877927890,377306020877927890_1472880147,Not Socially Engaged,1472880147,1,1472880147,"{'browser': 'Firefox', 'browserVersion': 'not ...","{'continent': 'Oceania', 'subContinent': 'Aust...",...,,,,(not set),,,(not provided),organic,,google
2,Organic Search,20160902,3895546263509774583,3895546263509774583_1472865386,Not Socially Engaged,1472865386,1,1472865386,"{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'South...",...,,,,(not set),,,(not provided),organic,,google
3,Organic Search,20160902,4763447161404445595,4763447161404445595_1472881213,Not Socially Engaged,1472881213,1,1472881213,"{'browser': 'UC Browser', 'browserVersion': 'n...","{'continent': 'Asia', 'subContinent': 'Southea...",...,,,,(not set),,,google + online,organic,,google
4,Organic Search,20160902,27294437909732085,27294437909732085_1472822600,Not Socially Engaged,1472822600,2,1472822600,"{'browser': 'Chrome', 'browserVersion': 'not a...","{'continent': 'Europe', 'subContinent': 'North...",...,,,,(not set),,True,(not provided),organic,,google


In [6]:
data.describe()

Unnamed: 0,date,visitId,visitNumber,visitStartTime
count,903653.0,903653.0,903653.0,903653.0
mean,20165890.0,1485007000.0,2.264897,1485007000.0
std,4697.698,9022124.0,9.283735,9022124.0
min,20160800.0,1470035000.0,1.0,1470035000.0
25%,20161030.0,1477561000.0,1.0,1477561000.0
50%,20170110.0,1483949000.0,1.0,1483949000.0
75%,20170420.0,1492759000.0,1.0,1492759000.0
max,20170800.0,1501657000.0,395.0,1501657000.0


In [7]:
list(data.columns.values)

['channelGrouping',
 'date',
 'fullVisitorId',
 'sessionId',
 'socialEngagementType',
 'visitId',
 'visitNumber',
 'visitStartTime',
 'device.device',
 'geoNetwork.geoNetwork',
 'totals.totals',
 'trafficSource.adContent',
 'trafficSource.adwordsClickInfo.adNetworkType',
 'trafficSource.adwordsClickInfo.criteriaParameters',
 'trafficSource.adwordsClickInfo.gclId',
 'trafficSource.adwordsClickInfo.isVideoAd',
 'trafficSource.adwordsClickInfo.page',
 'trafficSource.adwordsClickInfo.slot',
 'trafficSource.campaign',
 'trafficSource.campaignCode',
 'trafficSource.isTrueDirect',
 'trafficSource.keyword',
 'trafficSource.medium',
 'trafficSource.referralPath',
 'trafficSource.source']

# Visualizing diff vars

In [8]:
print(data["channelGrouping"][0])

Organic Search


In [None]:
plt.hist(data["channelGrouping"])