In [1]:
import pandas as pd
from pandas.io.json import loads, json_normalize

# challenge specifies to load fullVisitorId as a string to guarantee uniqueness
id_to_str = {'fullVisitorId': str}

train = pd.read_csv('./all/train.csv', nrows=100000, dtype=id_to_str)
test  = pd.read_csv('./all/test.csv', nrows=100000, dtype=id_to_str)

In [2]:
def parse_json_cols(df):
    for col in ['device', 'geoNetwork', 'totals', 'trafficSource']:
        # load the json and separate
        df_json = json_normalize(df[col].apply(loads), sep='/')
        df_json.columns = [(col + '/' + c) for c in df_json.columns]
        df.drop(columns=col, inplace=True)
        df = pd.concat([df, df_json], axis=1)
    return df

#
train = parse_json_cols(train)
test  = parse_json_cols(test)

---
# Examine data

### Dates are not parsed and visitStartTime is in POSIX time
    * visitStartTime makes Date redundant (more information encoded)
    * need to get continuous time data (hour, minute)

In [3]:
from datetime import datetime

def parse_dates(df):
    df['visitStartTime'] = df['visitStartTime'].apply(lambda time: datetime.fromtimestamp(time))
    df['year']  = df['visitStartTime'].dt.year
    df['month'] = df['visitStartTime'].dt.month
    df['day']   = df['visitStartTime'].dt.day
    df['hour']  = df['visitStartTime'].dt.hour + (df['visitStartTime'].dt.minute/60)
    df.drop(columns=['date', 'visitStartTime'], inplace=True)

    return df

#
train = parse_dates(train)
test  = parse_dates(test)

### Several columns have NaN values
* Every value in totals should have a lower bound of 0.
* Will fill remainder with most frequent value

In [4]:
nan_series = train.isna().any()
nan_true   = nan_series[nan_series == True]
nan_true

totals/bounces                                  True
totals/newVisits                                True
totals/pageviews                                True
totals/transactionRevenue                       True
trafficSource/adContent                         True
trafficSource/adwordsClickInfo/adNetworkType    True
trafficSource/adwordsClickInfo/gclId            True
trafficSource/adwordsClickInfo/isVideoAd        True
trafficSource/adwordsClickInfo/page             True
trafficSource/adwordsClickInfo/slot             True
trafficSource/campaignCode                      True
trafficSource/isTrueDirect                      True
trafficSource/keyword                           True
trafficSource/referralPath                      True
dtype: bool

In [5]:
def clean_nans(df):
    totals_cols = list(filter(lambda c : c.startswith('totals'), df.columns))
    for i in totals_cols:
        df[i] = df[i].astype(float)
        df[i].fillna(0, inplace=True)
   
    nan_series = df.isna().any()
    nan_true   = nan_series[nan_series == True]
    remaining_nan_cols = list(set(nan_true.index) - set(totals_cols))
    
    for i in remaining_nan_cols:
        # fill with most frequent value
        df[i].fillna(df[i].value_counts().index[0], inplace=True)
        
    return df

#
train = clean_nans(train)
test  = clean_nans(test)

### Several columns have 1 unique value
    * Provide no information, so will drop

In [6]:
for col in train:
    if len(train[col].unique()) == 1:
        print(col)

socialEngagementType
device/browserSize
device/browserVersion
device/flashVersion
device/language
device/mobileDeviceBranding
device/mobileDeviceInfo
device/mobileDeviceMarketingName
device/mobileDeviceModel
device/mobileInputSelector
device/operatingSystemVersion
device/screenColors
device/screenResolution
geoNetwork/cityId
geoNetwork/latitude
geoNetwork/longitude
geoNetwork/networkLocation
totals/visits
trafficSource/adwordsClickInfo/criteriaParameters
trafficSource/adwordsClickInfo/isVideoAd
trafficSource/campaignCode
trafficSource/isTrueDirect


In [7]:
def drop_single_val_cols(df):
    for col in df:
        if len(df[col].unique()) == 1:
            df.drop(columns=[col], inplace=True)
    return df

#
train = drop_single_val_cols(train)
test  = drop_single_val_cols(test)

### A lot of data is categorical
* Thinks like referal links, locations, and keywords are categorical and need to be represented as such
* Will use sklearn's LabelEncoder
    * fit on all possible values for a column, then transform

In [8]:
list(train.select_dtypes(include='object').columns)

['channelGrouping',
 'fullVisitorId',
 'sessionId',
 'device/browser',
 'device/deviceCategory',
 'device/operatingSystem',
 'geoNetwork/city',
 'geoNetwork/continent',
 'geoNetwork/country',
 'geoNetwork/metro',
 'geoNetwork/networkDomain',
 'geoNetwork/region',
 'geoNetwork/subContinent',
 'trafficSource/adContent',
 'trafficSource/adwordsClickInfo/adNetworkType',
 'trafficSource/adwordsClickInfo/gclId',
 'trafficSource/adwordsClickInfo/page',
 'trafficSource/adwordsClickInfo/slot',
 'trafficSource/campaign',
 'trafficSource/keyword',
 'trafficSource/medium',
 'trafficSource/referralPath',
 'trafficSource/source']

In [9]:
from sklearn.preprocessing import LabelEncoder

for i in train.select_dtypes(include='object').columns:
    # get list of all possible categories for each column
    all_categories = list(set(list(train[i].unique()))\
                     | set(list(test[i].unique())))

    encoder = LabelEncoder()
    # fit on column
    encoder.fit(all_categories)
    train[i] = encoder.transform(train[i])
    test[i]  = encoder.transform(test[i])

### Predict natural log of revenue
* Target specified as ln(total transaction revenue + 1)
    * Need to convert transactionRevenue column to accomodate
* Evaluate with RMSE

In [10]:
from numpy import log1p

# to make the line more readable
i = 'totals/transactionRevenue'

train[i] = train[i].apply(lambda revenue: log1p(revenue))

---
# Train models

In [70]:
def get_tt_split(df):
    X = df.drop(columns=['totals/transactionRevenue'])
    y = df['totals/transactionRevenue']
    
    return train_test_split(X, y, test_size = 0.2, random_state = 3)

### Linear Regression

In [66]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def train_lr(df):
    X_train, X_test, y_train, y_test = get_tt_split(df)
    
    model = LinearRegression()
    print('Training model: {0}...\n'.format('lr'))
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    
    mse = mean_squared_error(y_test, test_predictions)
    rmse = sqrt(mse)
    
    print('=====')
    print('RMSE:', rmse)
    print('=====\n\n==========')
    print('INTERCEPT:', model.intercept_)
    print('==========\n\n==========================')
    print('COEFFICIENTS (decreasing):')
    print('==========================')
    coefficients = list(zip(X_test.columns, model.coef_))
    coefficients.sort(key = lambda c: c[1])
    for i in coefficients[::-1]:
        print('{0}:\n\t{1}\n'.format(i[0], i[1]))
    
    
    return model

In [67]:
lr_model = train_lr(train)

Training model: lr...

=====
RMSE: 1.8582894507821344
=====

INTERCEPT: -5468.556876986922

COEFFICIENTS (decreasing):
year:
	2.770301931422214

totals/bounces:
	0.3631650026659217

trafficSource/adwordsClickInfo/slot:
	0.3041925545137706

totals/pageviews:
	0.2854396660639576

month:
	0.2393481316736069

channelGrouping:
	0.060844239372899234

trafficSource/adContent:
	0.011471181134603516

day:
	0.006537429214795349

device/deviceCategory:
	0.005030980732452793

trafficSource/source:
	0.003094612189183585

hour:
	0.0016087721429387215

geoNetwork/subContinent:
	0.0013255093235927592

device/browser:
	0.0009856930027360355

geoNetwork/country:
	0.0009206174651419316

geoNetwork/region:
	0.0007015852458719579

fullVisitorId:
	0.00011935583816971003

trafficSource/adwordsClickInfo/gclId:
	1.3133051772880416e-05

trafficSource/adwordsClickInfo/adNetworkType:
	-9.842474057997208e-12

visitId:
	-8.13514509770333e-08

geoNetwork/networkDomain:
	-3.0433784000468515e-06

trafficSource/keyword

In [71]:
import xgboost as xgb
from xgboost import DMatrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def train_xgb(df):
    X_train, X_test, y_train, y_test = get_tt_split(df)
    
    print('Training model: {0}...'.format('xgb'))
    model = xgb.train({}, DMatrix(X_train, y_train))
    test_predictions = model.predict(DMatrix(X_test))
    
    mse = mean_squared_error(y_test, test_predictions)
    rmse = sqrt(mse)
    
    print('=====')
    print('RMSE:', rmse)
    return model

In [72]:
xgb_model = train_xgb(train)

Training model: xgb...
=====
RMSE: 1.7147403852259484
