# Import Transcation level data

In [14]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [8]:
df_trans = pd.read_csv('../Project/data/train_trans.csv',index_col=0)

In [9]:
df_trans.columns

Index(['fullVisitorId', 'is_transaction', 'transactionRevenue', 'bounces',
       'hits', 'pageviews', 'visitNumber', 'is_source_googleplex',
       'is_source_direct', 'is_source_google', 'is_medium_referral',
       'is_medium_none', 'is_medium_organic', 'is_bayarea', 'Africa',
       'Americas', 'Asia', 'Europe', 'Oceania', 'next_session_1',
       'last_session_1', 'q1', 'q2', 'q3', 'q4', 'weekday', 'h_0_6', 'h_7_12',
       'h_13_17', 'h_18_23', 'browser_chrome', 'is_mobile', 'system_google'],
      dtype='object')

### Split the data into Train and test sets

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_trans.drop(['is_transaction', 'fullVisitorId', 'transactionRevenue'], axis=1), 
    df_trans['transactionRevenue'], test_size=0.33, random_state=42)

# Tree Based Regression: RandomForest

In [12]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42,n_estimators = 100)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)

# Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)
reg_y_pred = reg.predict(X_test)
np.place(reg_y_pred, reg_y_pred<0, 0)

# Regularization: Lasso

In [17]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha = 0.01)
lasso_reg.fit(X_train,y_train)
lasso_y_pred = lasso_reg.predict(X_test)
np.place(reg_y_pred, reg_y_pred<0, 0)

# Bagging + SVR

In [19]:
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR
bg_svr = BaggingRegressor(base_estimator=SVR(),
                  bootstrap=True, bootstrap_features=False, max_features=1.0,
                  max_samples=0.01, n_estimators=100, n_jobs=-1)
bg_svr.fit(X_train, y_train)
svr_y_pred = bg_svr.predict(X_test)

# Compare metrics for all Four models

$\bullet$ RMSE

In [21]:
from sklearn.metrics import mean_squared_error
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_y_pred))
reg_rmse = np.sqrt(mean_squared_error(y_test, reg_y_pred))
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_y_pred))
svr_rmse = np.sqrt(mean_squared_error(y_test, svr_y_pred))
rmse = [('random_forest', rf_rmse),('linear regression', reg_rmse), ('lasso', lasso_rmse), ('SVR', svr_rmse)]
for i in rmse:
    print('{}: {}'.format(i[0],i[1]))

random_forest: 45685138.141020775
linear regression: 42678498.61433511
lasso: 42710009.996953
SVR: 43526740.04373704


$\bullet R^2$

In [23]:
from sklearn.metrics import r2_score
rf_r2 = r2_score(y_test, rf_y_pred)
reg_r2 = r2_score(y_test, reg_y_pred)
lasso_r2 = r2_score(y_test, lasso_y_pred)
svr_r2 = r2_score(y_test, svr_y_pred)
r2 = [('random_forest', rf_r2), ('linear regression', reg_r2), ('lasso', reg_r2), ('SVR', svr_r2)]
for i in r2:
    print('{}: {}'.format(i[0],i[1]))

random_forest: -0.10334518563468653
linear regression: 0.037103106894960325
lasso: 0.037103106894960325
SVR: -0.0015526924443989287


$\bullet$Explained Variance

In [24]:
from sklearn.metrics import explained_variance_score
rf_ev = explained_variance_score(y_test, rf_y_pred)
reg_ev = explained_variance_score(y_test, reg_y_pred)
lasso_ev = explained_variance_score(y_test, lasso_y_pred)
svr_ev = explained_variance_score(y_test, svr_y_pred)
ev = [('random_forest', rf_ev), ('linear regression', reg_ev), ('lasso', lasso_ev), ('svr', svr_ev) ]
for i in r2:
    print('{}: {}'.format(i[0],i[1]))

random_forest: -0.10334518563468653
linear regression: 0.037103106894960325
lasso: 0.037103106894960325
SVR: -0.0015526924443989287


Overall, these models are not suitable for our transcation level data. The negative explained variance / r2 is an indicater that this is actually doing worse than a mean-prediction.