In [239]:
import pandas as pd
from pandas.io.json import loads, json_normalize

# The challenge specifies that fullVisitorId must be loaded as a string to guarantee uniqueness.
id_to_str = {'fullVisitorId': str}

# Only the first 100000 rows of the training file are read.
train = pd.read_csv(
    './all/train.csv',
    dtype=id_to_str,
    nrows=100000,
)
#test  = pd.read_csv('./all/test.csv', dtype=id_to_str)

In [240]:
def parse_json_cols(df):
    """Expand the JSON-encoded columns of ``df`` into one flat column per key.

    Each of ``device``, ``geoNetwork``, ``totals`` and ``trafficSource`` must
    hold JSON object strings. Every key becomes a column named
    ``<source column>/<key>``; nested objects are flattened with ``/`` too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four JSON string columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the JSON columns removed and the flattened columns
        appended in the order device, geoNetwork, totals, trafficSource.
        The row index of ``df`` is preserved.

    Raises
    ------
    KeyError
        If one of the four JSON columns is missing from ``df``.
    """
    json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']
    flattened = []
    for col in json_cols:
        # Decode to a plain list of dicts so json_normalize receives the same
        # input type regardless of how a given pandas version treats a Series.
        records = df[col].apply(loads).tolist()
        df_json = json_normalize(records, sep='/')
        df_json.columns = [(col + '/' + c) for c in df_json.columns]
        # json_normalize numbers its rows 0..n-1; restore the caller's index so
        # the column-wise concat below lines rows up even when df has a
        # non-default index (e.g. after filtering).
        df_json.index = df.index
        flattened.append(df_json)

    # drop() without inplace returns a new frame, leaving the caller's df intact;
    # a single concat avoids rebuilding the frame once per JSON column.
    return pd.concat([df.drop(columns=json_cols)] + flattened, axis=1)

# Flatten the JSON columns of the training frame into '<column>/<key>' columns.
# NOTE: `train` is rebound to the result, so re-running this cell without
# reloading the CSV fails because the JSON columns no longer exist.
train = parse_json_cols(train)
#test  = parse_json_cols(test)

# Examine data

### Several columns have NaN values
* Every value in `totals` has a lower bound of 0, so missing `totals` values will be filled with 0.
* The remaining columns will be filled with their most frequent value.

In [241]:
# Flag each column that contains at least one NaN, then keep only the flagged ones.
nan_series = train.isna().any(axis=0)
nan_true   = nan_series.loc[nan_series]
nan_true

totals/bounces                                  True
totals/newVisits                                True
totals/pageviews                                True
totals/transactionRevenue                       True
trafficSource/adContent                         True
trafficSource/adwordsClickInfo/adNetworkType    True
trafficSource/adwordsClickInfo/gclId            True
trafficSource/adwordsClickInfo/isVideoAd        True
trafficSource/adwordsClickInfo/page             True
trafficSource/adwordsClickInfo/slot             True
trafficSource/campaignCode                      True
trafficSource/isTrueDirect                      True
trafficSource/keyword                           True
trafficSource/referralPath                      True
dtype: bool

In [242]:
def clean_nans(df):
    """Fill the missing values of ``df``.

    * Columns whose name starts with ``totals`` are cast to float and their
      NaNs are replaced by 0.
    * Every other column that contains NaN is filled with its most frequent
      non-missing value. A column that is entirely NaN has no such value and
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    totals_cols = [c for c in df.columns if c.startswith('totals')]
    for col in totals_cols:
        # Assign the result back: ``df[col].fillna(..., inplace=True)`` works on
        # a temporary Series and is not guaranteed to write through to ``df``.
        df[col] = df[col].astype(float).fillna(0)

    has_nan = df.isna().any()
    remaining_nan_cols = [c for c in has_nan[has_nan].index if c not in totals_cols]

    for col in remaining_nan_cols:
        counts = df[col].value_counts()  # sorted by frequency, NaN excluded
        if counts.empty:
            # Entirely-NaN column: there is no most frequent value to fill with.
            continue
        df[col] = df[col].fillna(counts.index[0])

    return df

# Fill missing values in the training frame: 0 for the totals columns,
# the most frequent value for every other column that has NaNs.
train = clean_nans(train)
#test  = clean_nans(test)

### Several columns have 1 unique value
* They provide no information, so they will be dropped.

In [247]:
# Print every column that holds one distinct value (NaN counts as a value).
for col in train.columns:
    if train[col].nunique(dropna=False) == 1:
        print(col)

socialEngagementType
device/browserSize
device/browserVersion
device/flashVersion
device/language
device/mobileDeviceBranding
device/mobileDeviceInfo
device/mobileDeviceMarketingName
device/mobileDeviceModel
device/mobileInputSelector
device/operatingSystemVersion
device/screenColors
device/screenResolution
geoNetwork/cityId
geoNetwork/latitude
geoNetwork/longitude
geoNetwork/networkLocation
totals/visits
trafficSource/adwordsClickInfo/criteriaParameters
trafficSource/adwordsClickInfo/isVideoAd
trafficSource/campaignCode
trafficSource/isTrueDirect


In [248]:
def drop_single_val_cols(df):
    """Remove every column of ``df`` that holds exactly one distinct value.

    NaN counts as a distinct value, so a column that mixes one value with NaN
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. The columns are dropped from it in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, without its single-valued columns.
    """
    # Decide first, drop once: the frame is not altered while it is being
    # iterated, and one drop replaces one drop per column.
    constant_cols = [col for col in df.columns if len(df[col].unique()) == 1]
    df.drop(columns=constant_cols, inplace=True)
    return df

# Drop the columns of the training frame that hold a single distinct value.
train = drop_single_val_cols(train)
#test  = drop_single_val_cols(test)

In [249]:
# Re-run the single-value scan on the current `train`; any column printed here
# still holds only one distinct value (NaN counts as a value).
for col in train.columns:
    if train[col].nunique(dropna=False) == 1:
        print(col)

In [159]:
# Preview the first rows of every column whose name starts with 'totals'.
# NOTE(review): the execution count and the displayed totals/visits column look
# like they predate the single-value drop; re-run to refresh the output.
train[[c for c in train.columns if c.startswith('totals')]].head()

Unnamed: 0,totals/bounces,totals/hits,totals/newVisits,totals/pageviews,totals/transactionRevenue,totals/visits
0,1.0,1.0,1.0,1.0,0.0,1.0
1,1.0,1.0,1.0,1.0,0.0,1.0
2,1.0,1.0,1.0,1.0,0.0,1.0
3,1.0,1.0,1.0,1.0,0.0,1.0
4,1.0,1.0,0.0,1.0,0.0,1.0
