# Google Analytics Customer Revenue Prediction

An analysis of a Google Merchandise Store (also known as GStore, where Google swag is sold) customer dataset, used to predict revenue per customer.

Link to Data: https://www.kaggle.com/c/ga-customer-revenue-prediction/data

We are predicting the natural log of the sum of all transactions per user, computed as log(1 + sum) so that users with no revenue map to 0.

In [None]:
import time
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

def load_df(csv_path='train.csv', nrows=None, json_columns=None):
    """Load a sessions CSV and flatten its JSON-encoded columns.

    Each JSON column is parsed while the file is read, expanded into one
    column per key named ``"<column>.<key>"``, and the original column is
    dropped.

    Args:
        csv_path: Path of the CSV file to read.
        nrows: Optional maximum number of rows to read (``None`` reads all).
        json_columns: Names of the columns that hold JSON objects. Defaults
            to ``['device', 'geoNetwork', 'totals', 'trafficSource']``.

    Returns:
        The flattened DataFrame. Its shape is also printed.
    """
    if json_columns is None:
        json_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in json_columns},
        # Keep the visitor ID as text so it is never parsed as a number.
        dtype={'fullVisitorId': 'str'},
        nrows=nrows,
    )

    for column in json_columns:
        # pd.json_normalize is the supported top-level entry point; the
        # pandas.io.json import location is deprecated. A plain list of dicts
        # yields a 0..n-1 index that lines up with the frame from read_csv.
        column_as_df = pd.json_normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:
# Read both competition files from the working directory.
df_train = load_df(csv_path='train.csv')
df_test = load_df(csv_path='test.csv')

## Preprocessing

Custom function to extract more information from dates.

In [None]:
def custom_time(POSIX):
    """Break a POSIX timestamp into calendar features.

    The timestamp is interpreted with ``time.localtime``, so the result
    depends on the time zone of the machine running the code.

    Args:
        POSIX: Seconds since the epoch.

    Returns:
        A 13-element list: year, month, day, hour, day of week (``%w``,
        Sunday is 0), day of year, week of year (``%W``), followed by six
        booleans flagging the start/end of the month, quarter and year using
        the fixed day thresholds below.
    """
    moment = time.localtime(POSIX)
    # One formatted string carries all seven numeric fields.
    stamp = time.strftime('%Y %m %d %H %w %j %W', moment)
    year, month, day, hour, dayofweek, dayofyear, weekofyear = (int(part) for part in stamp.split())

    near_month_start = day < 5
    near_month_end = day > 25
    near_quarter_start = month in [1, 4, 7, 10] and day < 15
    near_quarter_end = month in [3, 6, 9, 12] and day > 15
    near_year_start = dayofyear < 50
    near_year_end = dayofyear > 300

    return [
        year, month, day, hour,
        dayofweek, dayofyear, weekofyear,
        near_month_start, near_month_end,
        near_quarter_start, near_quarter_end,
        near_year_start, near_year_end,
    ]

In [None]:
# Column names for the values returned by custom_time, in the same order.
time_columns = [
    'year', 'month', 'day', 'hour',
    'dayofweek', 'dayofyear', 'weekofyear',
    'Is_month_start', 'Is_month_end',
    'Is_quarter_start', 'Is_quarter_end',
    'Is_year_start', 'Is_year_end',
]

In [None]:
def custom_time_columns(df):
    """Replace ``visitStartTime`` with the calendar features from ``custom_time``.

    Args:
        df: DataFrame with a ``visitStartTime`` column; every value in it is
            passed to ``custom_time``.

    Returns:
        A new DataFrame holding the columns of ``df`` without
        ``visitStartTime``, followed by one column per name in
        ``time_columns``. ``df`` itself is not modified.
    """
    custom_times = [custom_time(starttime) for starttime in df['visitStartTime']]
    # Reuse df's index: a column-wise concat aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows whenever df is indexed otherwise.
    df_time = pd.DataFrame(custom_times, columns=time_columns, index=df.index)
    return pd.concat([df, df_time], axis=1).drop('visitStartTime', axis=1)

In [None]:
# Bind the results to the *_time names that every later cell reads; the raw
# df_train / df_test frames stay available for the constant-column scan.
df_train_time = custom_time_columns(df_train)
df_test_time = custom_time_columns(df_test)

Some columns contain only one unique value and are not useful for making predictions.

In [None]:
# A column is constant when it has exactly one distinct value, counting NaN.
const_cols = [
    name
    for name, values in df_train.items()
    if values.nunique(dropna=False) == 1
]

In [None]:
# Remove the constant columns found in the training data from both frames.
df_train_time.drop(columns=const_cols, inplace=True)
df_test_time.drop(columns=const_cols, inplace=True)

Removing some columns that don't contain any useful information.

In [None]:
# The campaign code column is dropped from the training frame only.
df_train_time.drop(
    columns=['sessionId', 'visitId', 'date', 'trafficSource.campaignCode'],
    inplace=True,
)
df_test_time.drop(
    columns=['sessionId', 'visitId', 'date'],
    inplace=True,
)

Encoding categorical columns.

In [None]:
# Object-typed columns to label-encode; the visitor ID and the revenue target
# are object-typed too but must stay as they are.
cat_cols = [
    name
    for name in df_train_time.columns
    if df_train_time[name].dtype == 'object'
    and name not in ('fullVisitorId', 'totals.transactionRevenue')
]

In [None]:
from sklearn.preprocessing import LabelEncoder

for col in cat_cols:
    # Stringify both frames once so missing values become ordinary labels.
    train_labels = df_train_time[col].values.astype('str')
    test_labels = df_test_time[col].values.astype('str')
    # Fit on train and test together so every label seen in either has a code.
    lbl = LabelEncoder()
    lbl.fit(np.concatenate([train_labels, test_labels]))
    df_train_time[col] = lbl.transform(train_labels)
    df_test_time[col] = lbl.transform(test_labels)

In [None]:
# Names of the boolean-typed columns in the training frame.
bool_cols = [
    name
    for name, kind in df_train_time.dtypes.items()
    if kind == 'bool'
]

In [None]:
# Cast each boolean flag to float (True -> 1.0, False -> 0.0) in both frames.
for col in bool_cols:
    for frame in (df_train_time, df_test_time):
        frame[col] = frame[col].astype(float)

Filling missing values with zeroes.

In [None]:
# Replace every remaining missing value with 0, modifying the frames in place.
for frame in (df_train_time, df_test_time):
    frame.fillna(0, inplace=True)

In [None]:
# Move the visitor ID out of the columns and into the index of both frames.
for frame in (df_train_time, df_test_time):
    frame.set_index('fullVisitorId', inplace=True)

Creating training set and validation set.

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the revenue target is a model input.
features = list(df_train_time.columns)
features.remove("totals.transactionRevenue")

# Train on log(1 + revenue) rather than on raw revenue.
df_train_time["totals.transactionRevenue"] = np.log1p(
    df_train_time["totals.transactionRevenue"].astype(float)
)

# Hold out 25% of the rows for validation, with a fixed seed.
train_x, valid_x, train_y, valid_y = train_test_split(
    df_train_time[features],
    df_train_time["totals.transactionRevenue"],
    test_size=0.25,
    random_state=20,
)

## Model Training

In [None]:
import lightgbm as lgb

# "bagging_freq" is LightGBM's name for the bagging interval. The previous
# "bagging_frequency" key is not a recognised parameter, so bagging was never
# enabled and "bagging_fraction" had no effect.
params = {
    "objective": "regression",
    "metric": "rmse",
    "num_leaves": 50,
    "learning_rate": 0.02,
    "bagging_fraction": 0.75,
    "feature_fraction": 0.8,
    "bagging_freq": 9,
}

lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_val = lgb.Dataset(valid_x, label=valid_y)
# Up to 1000 boosting rounds, stopping once the validation metric has not
# improved for 150 rounds; progress is logged every 20 rounds.
# NOTE(review): newer LightGBM releases appear to expect callbacks in place of
# early_stopping_rounds / verbose_eval - confirm against the installed version.
model = lgb.train(params, lgb_train, 1000, valid_sets=[lgb_val], early_stopping_rounds=150, verbose_eval=20)

## Making Predictions

In [None]:
# Predict per-row log revenue with the best iteration, flooring at zero.
preds = model.predict(df_test_time[features], num_iteration=model.best_iteration)
preds = np.maximum(preds, 0)

# Undo the log so rows can be summed per visitor; the column temporarily holds
# plain revenue and is converted back to log(1 + total) after the sum.
df_test_time["PredictedLogRevenue"] = np.expm1(preds)
submission = df_test_time.groupby("fullVisitorId")["PredictedLogRevenue"].sum().reset_index()
submission["PredictedLogRevenue"] = np.log1p(submission["PredictedLogRevenue"])
submission.to_csv("baseline.csv", index=False)