In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import lightgbm as lgb
from funs import rmse, clip20, clip40
import pickle

## Data

In [10]:
X_train = pd.read_parquet('data/X_train.parquet')
X_val = pd.read_parquet('data/X_val.parquet')
y_train = np.load('data/y_train.npy')
y_val = np.load('data/y_val.npy')

## Models

Grid Search for Hyperparameter tuning

### Light Gradient Boosting

In [12]:
learning_rates = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
best_rmse = 9999999999999
for lr in learning_rates:
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': lr, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

    lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=clip40(y_train)), int(100 * (lr / 0.03)))
    pred_lgb_val = lgb_model.predict(X_val)
    score = rmse(clip20(y_val), clip20(pred_lgb_val))

    if score < best_rmse:
        best_rmse = score
        best_lr = lr
        best_lgb = lgb_model

In [13]:
best_lr

0.05

We train the best model with all the data.

In [14]:
X = X_train.append(X_val)
y = np.append(y_train, y_val)

In [16]:
best_lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': best_lr, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }
best_lgb = lgb.train(lgb_params, lgb.Dataset(X, label=clip40(y)), int(100 * (lr / 0.03)))

Save the model

In [17]:
filename = 'models/best_lgb.sav'
pickle.dump(best_lgb, open(filename, 'wb'))

### Random Forest

CV Iterator

In [8]:
X = X_train.append(X_val)
Y = np.concatenate([y_train, y_val])
train_ind=np.zeros(X.shape[0])
for i in range(0, len(X_train)):
    train_ind[i] = -1
ps = PredefinedSplit(test_fold=(train_ind))

Grid Search for Hyperparameter tuning

In [9]:
param_grid={'bootstrap':[0.7, 0.8], 'max_features':[4, 6, 8], 
            'max_depth' : [None, 4, 6, 8, 10, 12]}
gs = GridSearchCV(cv = ps, 
                  estimator = RandomForestRegressor(n_estimators=300, n_jobs=4), 
                  param_grid=param_grid, scoring='neg_mean_squared_error')

In [None]:
gs.fit(X, clip40(Y))
best_rf = gs.best_estimator_

In [12]:
best_rf = gs.best_estimator_
best_rf

RandomForestRegressor(bootstrap=0.7, criterion='mse', max_depth=12,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

We train the best model with all the data.

In [6]:
best_rf = pickle.load(open('models/best_rf.sav', 'rb'))

In [8]:
best_rf.fit(X, clip40(y))

RandomForestRegressor(bootstrap=0.7, criterion='mse', max_depth=12,
           max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=4, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

Save the model

In [9]:
filename = 'models/best_rf.sav'
pickle.dump(best_rf, open(filename, 'wb'))