In [1]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

# Transaction Level

In [4]:
#Get data
raw_trans_csv = pd.read_csv('../Data/train_trans.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
classification_labels = raw_trans_csv.iloc[:,1] #Unused here - regression only
regression_labels = raw_trans_csv.iloc[:,2]
transaction_data = raw_trans_csv.iloc[:,3:]

In [6]:
#Split test and train
X_train_1, X_test, y_train_1, y_test = train_test_split(transaction_data, regression_labels, test_size=0.33)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_1, y_train_1, test_size=0.33)

In [7]:
#Create lgb datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

## Hyperparameter selection

Search Space:

num_leaves : 31 (default), 50, 100

learning_rate: 0.1 (default), 0.05

max_bin: 255 (default), 300

Total iterations required:  12

### Manual Grid Search

In [13]:
#All params
param1 = {'num_leaves':31, 'learning_rate':0.1, 'max_bin':255,'objective':'rmse'}
param2 = {'num_leaves':50, 'learning_rate':0.1, 'max_bin':255,'objective':'rmse'}
param3 = {'num_leaves':100, 'learning_rate':0.1, 'max_bin':255,'objective':'rmse'}
param4 = {'num_leaves':31, 'learning_rate':0.05, 'max_bin':255,'objective':'rmse'}
param5 = {'num_leaves':50, 'learning_rate':0.05, 'max_bin':255,'objective':'rmse'}
param6 = {'num_leaves':100, 'learning_rate':0.05, 'max_bin':255,'objective':'rmse'}
param7 = {'num_leaves':31, 'learning_rate':0.1, 'max_bin':300,'objective':'rmse'}
param8 = {'num_leaves':50, 'learning_rate':0.1, 'max_bin':300,'objective':'rmse'}
param9 = {'num_leaves':100, 'learning_rate':0.1, 'max_bin':300,'objective':'rmse'}
param10 = {'num_leaves':31, 'learning_rate':0.05, 'max_bin':300,'objective':'rmse'}
param11 = {'num_leaves':50, 'learning_rate':0.05, 'max_bin':300,'objective':'rmse'}
param12 = {'num_leaves':100, 'learning_rate':0.05, 'max_bin':300,'objective':'rmse'}

In [14]:
#Custom code for grid search
def train_model_print_metrics(param_dict, train_data, X_test, y_test):
    model = lgb.train(param_dict, train_data)
    ypred = model.predict(X_test)
    
    rmse = mean_squared_error(y_test,ypred)**(1/2)
    psuedo_r2 = r2_score(y_test, ypred)
    explained_variance = explained_variance_score(y_test,ypred)
    print("RMSE: "+str(rmse)+"\n")
    print("Psuedo R2: "+str(psuedo_r2)+"\n")
    print("Explained Variance: "+str(explained_variance)+"\n")
    
    return [rmse, psuedo_r2, explained_variance]

In [10]:
metrics_grid = []

In [11]:
metrics_grid.append(train_model_print_metrics(param1, train_data, X_valid, y_valid))

RMSE: 50926370.31321675

Psuedo R2: -0.047743702266721755

Explained Variance: -0.047743459423267964



In [12]:
metrics_grid.append(train_model_print_metrics(param2, train_data, X_valid, y_valid))

RMSE: 50957469.26743807

Psuedo R2: -0.049023733892990196

Explained Variance: -0.0490227561846075



In [13]:
metrics_grid.append(train_model_print_metrics(param3, train_data, X_valid, y_valid))

RMSE: 51287110.642113

Psuedo R2: -0.06263979911720452

Explained Variance: -0.06263894423677474



In [14]:
metrics_grid.append(train_model_print_metrics(param4, train_data, X_valid, y_valid))

RMSE: 49647262.613002636

Psuedo R2: 0.004227272565601736

Explained Variance: 0.004227317994338731



In [15]:
metrics_grid.append(train_model_print_metrics(param5, train_data, X_valid, y_valid))

RMSE: 49552962.32314937

Psuedo R2: 0.00800643271341217

Explained Variance: 0.008006571298355691



In [16]:
metrics_grid.append(train_model_print_metrics(param6, train_data, X_valid, y_valid))

RMSE: 49609265.143288925

Psuedo R2: 0.005750916080727642

Explained Variance: 0.005751306386997945



In [17]:
metrics_grid.append(train_model_print_metrics(param7, train_data, X_valid, y_valid))

RMSE: 50926370.31321675

Psuedo R2: -0.047743702266721755

Explained Variance: -0.047743459423267964



In [18]:
metrics_grid.append(train_model_print_metrics(param8, train_data, X_valid, y_valid))

RMSE: 50957469.26743807

Psuedo R2: -0.049023733892990196

Explained Variance: -0.0490227561846075



In [19]:
metrics_grid.append(train_model_print_metrics(param9, train_data, X_valid, y_valid))

RMSE: 51287110.642113

Psuedo R2: -0.06263979911720452

Explained Variance: -0.06263894423677474



In [20]:
metrics_grid.append(train_model_print_metrics(param10, train_data, X_valid, y_valid))

RMSE: 49647262.613002636

Psuedo R2: 0.004227272565601736

Explained Variance: 0.004227317994338731



In [21]:
metrics_grid.append(train_model_print_metrics(param11, train_data, X_valid, y_valid))

RMSE: 49552962.32314937

Psuedo R2: 0.00800643271341217

Explained Variance: 0.008006571298355691



In [22]:
metrics_grid.append(train_model_print_metrics(param12, train_data, X_valid, y_valid))

RMSE: 49609265.143288925

Psuedo R2: 0.005750916080727642

Explained Variance: 0.005751306386997945



In [23]:
metrics_grid

[[50926370.31321675, -0.047743702266721755, -0.047743459423267964],
 [50957469.26743807, -0.049023733892990196, -0.0490227561846075],
 [51287110.642113, -0.06263979911720452, -0.06263894423677474],
 [49647262.613002636, 0.004227272565601736, 0.004227317994338731],
 [49552962.32314937, 0.00800643271341217, 0.008006571298355691],
 [49609265.143288925, 0.005750916080727642, 0.005751306386997945],
 [50926370.31321675, -0.047743702266721755, -0.047743459423267964],
 [50957469.26743807, -0.049023733892990196, -0.0490227561846075],
 [51287110.642113, -0.06263979911720452, -0.06263894423677474],
 [49647262.613002636, 0.004227272565601736, 0.004227317994338731],
 [49552962.32314937, 0.00800643271341217, 0.008006571298355691],
 [49609265.143288925, 0.005750916080727642, 0.005751306386997945]]

Lowest RMSE: Parameters 5 and 11

Highest Psuedo R2: Parameters 5 and 11

Highest Explained Variance: Parameters 5 and 11

I'm choosing p10 as my final model: 

num_leaves: 31

learning_rate: 0.05

max_bin: 300



## Final model fit and metrics

In [24]:
#Fit final model, predict, metrics
train_model_print_metrics(param11, train_data, X_test, y_test)



RMSE: 42661156.43881862

Psuedo R2: -0.010275758822533287

Explained Variance: -0.010273749115046638



[42661156.43881862, -0.010275758822533287, -0.010273749115046638]

Overall, this model is struggling - the negative explained variance / r2 is an indicater that this is actually doing worse than a mean-prediction

# User Level

In [8]:
#Get data, split test and train
raw_user_csv = pd.read_csv('../Data/train_user.csv', index_col=0)

In [9]:
user_classification_labels = raw_user_csv.iloc[:,1] #Unused here - regression only
user_regression_labels = raw_user_csv.iloc[:,2]
user_data = raw_user_csv.iloc[:,3:]

In [10]:
#Split test and train
X_train_1, X_test, y_train_1, y_test = train_test_split(user_data, user_regression_labels, test_size=0.33)

X_train, X_valid, y_train, y_valid = train_test_split(X_train_1, y_train_1, test_size=0.33)

In [11]:
#Create lgb datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

## Hyperparameter Selection

Search Space:

num_leaves : 31 (default), 50, 100

learning_rate: 0.1 (default), 0.05

max_bin: 255 (default), 300

Total iterations required:  12

I will be using the same param dicts and helper function from the transactions data

In [29]:
metrics_grid_user = []

In [30]:
metrics_grid_user.append(train_model_print_metrics(param1, train_data, X_valid, y_valid))

RMSE: 1.6348454970703414

Psuedo R2: 0.3858928666909015

Explained Variance: 0.3858936012634171



In [31]:
metrics_grid_user.append(train_model_print_metrics(param2, train_data, X_valid, y_valid))

RMSE: 1.6375186034645695

Psuedo R2: 0.3838829938784353

Explained Variance: 0.3838832845184944



In [32]:
metrics_grid_user.append(train_model_print_metrics(param3, train_data, X_valid, y_valid))

RMSE: 1.653283751369526

Psuedo R2: 0.3719626012525894

Explained Variance: 0.3719635483979076



In [33]:
metrics_grid_user.append(train_model_print_metrics(param4, train_data, X_valid, y_valid))

RMSE: 1.6345452544390613

Psuedo R2: 0.3861184099742774

Explained Variance: 0.38611924565219224



In [34]:
metrics_grid_user.append(train_model_print_metrics(param5, train_data, X_valid, y_valid))

RMSE: 1.6315890296656872

Psuedo R2: 0.38833692410692333

Explained Variance: 0.38833778614872605



In [35]:
metrics_grid_user.append(train_model_print_metrics(param6, train_data, X_valid, y_valid))

RMSE: 1.633182221286418

Psuedo R2: 0.3871418041950321

Explained Variance: 0.3871427432530763



In [36]:
metrics_grid_user.append(train_model_print_metrics(param7, train_data, X_valid, y_valid))

RMSE: 1.6348454970703414

Psuedo R2: 0.3858928666909015

Explained Variance: 0.3858936012634171



In [37]:
metrics_grid_user.append(train_model_print_metrics(param8, train_data, X_valid, y_valid))

RMSE: 1.6375186034645695

Psuedo R2: 0.3838829938784353

Explained Variance: 0.3838832845184944



In [38]:
metrics_grid_user.append(train_model_print_metrics(param9, train_data, X_valid, y_valid))

RMSE: 1.653283751369526

Psuedo R2: 0.3719626012525894

Explained Variance: 0.3719635483979076



In [39]:
metrics_grid_user.append(train_model_print_metrics(param10, train_data, X_valid, y_valid))

RMSE: 1.6345452544390613

Psuedo R2: 0.3861184099742774

Explained Variance: 0.38611924565219224



In [40]:
metrics_grid_user.append(train_model_print_metrics(param11, train_data, X_valid, y_valid))

RMSE: 1.6315890296656872

Psuedo R2: 0.38833692410692333

Explained Variance: 0.38833778614872605



In [41]:
metrics_grid_user.append(train_model_print_metrics(param12, train_data, X_valid, y_valid))

RMSE: 1.633182221286418

Psuedo R2: 0.3871418041950321

Explained Variance: 0.3871427432530763



In [42]:
metrics_grid_user

[[1.6348454970703414, 0.3858928666909015, 0.3858936012634171],
 [1.6375186034645695, 0.3838829938784353, 0.3838832845184944],
 [1.653283751369526, 0.3719626012525894, 0.3719635483979076],
 [1.6345452544390613, 0.3861184099742774, 0.38611924565219224],
 [1.6315890296656872, 0.38833692410692333, 0.38833778614872605],
 [1.633182221286418, 0.3871418041950321, 0.3871427432530763],
 [1.6348454970703414, 0.3858928666909015, 0.3858936012634171],
 [1.6375186034645695, 0.3838829938784353, 0.3838832845184944],
 [1.653283751369526, 0.3719626012525894, 0.3719635483979076],
 [1.6345452544390613, 0.3861184099742774, 0.38611924565219224],
 [1.6315890296656872, 0.38833692410692333, 0.38833778614872605],
 [1.633182221286418, 0.3871418041950321, 0.3871427432530763]]

Lowest RMSE: Parameters 5 and 11

Highest Psuedo R2: Parameters 5 and 11

Highest Explained Variance: Parameters 5 and 11

## Final model fit and metrics

In [15]:
#Fit final model, predict, metrics
train_model_print_metrics(param11, train_data, X_test, y_test)


RMSE: 1.6340376865887687

Psuedo R2: 0.399714042176322

Explained Variance: 0.3997148914892392



[1.6340376865887687, 0.399714042176322, 0.3997148914892392]

This model is much better - very low RMSE

In [16]:
model = lgb.train(param11, train_data)


In [27]:
d = dict(zip(X_train.columns.values,model.feature_importance()))
sorted(d.items(), key=lambda kv: -kv[1])

[('pageviews_mean', 893),
 ('hits_mean', 601),
 ('next_session_1_adjmean', 459),
 ('Americas_freq', 247),
 ('h_13_17_rate', 165),
 ('is_source_googleplex_freq', 136),
 ('h_18_23_rate', 134),
 ('is_mobile_freq', 121),
 ('visitNumber_total', 119),
 ('q2_rate', 112),
 ('h_0_6_rate', 112),
 ('q2_freq', 110),
 ('is_bayarea_freq', 106),
 ('browser_chrome_freq', 106),
 ('is_source_googleplex_rate', 103),
 ('weekday_freq', 96),
 ('h_13_17_freq', 91),
 ('weekday_rate', 90),
 ('q4_freq', 85),
 ('q3_rate', 80),
 ('bounces_mean', 75),
 ('is_source_direct_rate', 73),
 ('h_18_23_freq', 68),
 ('Americas_rate', 64),
 ('is_medium_organic_rate', 61),
 ('q3_freq', 59),
 ('q1_rate', 58),
 ('h_7_12_rate', 58),
 ('is_medium_organic_freq', 52),
 ('is_source_direct_freq', 50),
 ('is_bayarea_rate', 46),
 ('q4_rate', 46),
 ('is_medium_referral_freq', 45),
 ('q1_freq', 44),
 ('is_medium_referral_rate', 42),
 ('h_0_6_freq', 38),
 ('system_google_freq', 37),
 ('h_7_12_freq', 11),
 ('browser_chrome_rate', 3),
 ('As