In [179]:
%load_ext autoreload
%autoreload 2

import sys
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier 
from importlib.machinery import SourceFileLoader
# For Hyperparameter optimization
from hpsklearn import HyperoptEstimator
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials

# TODO Jupyter working directory is /notebooks. Therefore importing "from src.data" isn't working. Manually passing --notebook-dir
# when running the notebook didn't work. Needs to be investigated and fixed. This is a workaround
process_test_data = SourceFileLoader('process_test_data', '../src/data/process_test_data.py').load_module()
visualize = SourceFileLoader('visualize', '../src/visualization/visualize.py').load_module()

# True if you want to run with the Kaggle train and test set for submission. Otherwise it will use split kaggle train data for model 
# optimization to calculate stats and parameter optimization
use_kaggle_data = False 
run_parameter_optimization = True
run_feature_selection = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


If not using Kaggle data set for submission, split train datasets for training (80%), testing (10%) and validation (10%)
and normalize features using MinMaxScaler. Else load full Kaggle data and predict using Kaggle test set for submission

In [180]:
if use_kaggle_data:
    X_train, y_train, X_test = process_test_data.load_kaggle_train_and_test_data('../data/raw/train.csv', '../data/raw/test.csv')
else:    
    X_train, y_train, X_test, y_test, X_valid, y_valid = \
    process_test_data.split_and_normalize('../data/raw/train.csv', '../data/processed')

Original train shape: (8000, 21)
Concat shape: (8000, 20)
Files written to: ../data/processed
X_train shape: (6400, 19)
y_train shape: (6400, 1)
X_test shape: (800, 20)
y_test shape: (800, 1)
X_valid shape: (800, 20)
y_valid shape: (800, 1)


Check details of the data if required

In [181]:
 # X_test.describe()

Selecting features using sequential feature selection if required

In [182]:
if run_feature_selection==True:
    num_of_features_to_select = 7
    features = process_test_data.sequential_feature_selection('../data/raw/train.csv', num_of_features_to_select) # ['GP', 'MIN', 'FGM', '3P Made', 'OREB', 'BLK', 'TOV']
    X_train = X_train[features]
    # Appending Id column since it should be kept
    features.append('Id')
    X_test = X_test[features]

Running parameter optimization if required

In [183]:
# Defining pre-identified best parameters if parameter optimization is not going to run
hyp_params = {
'n_estimators': 200, 
'max_depth': 12, 
'criterion': 'entropy',
'class_weight': None,
'max_features': 'auto'
}

def hyperparameter_tuning(params):
    from sklearn.model_selection import cross_val_score
    clf = RandomForestClassifier(**params, n_jobs=-1)
    acc = cross_val_score(clf, X_train, y_train.values.ravel(), scoring="accuracy", cv=10).mean()
    return {"loss": -acc, "status": STATUS_OK}

if run_parameter_optimization==True:
    n_estimators=[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    criterian=["gini", "entropy"]
    class_weight=["balanced_subsample", "balanced", None]
    max_features=["auto", "sqrt", "log2"]
    space = {
    "n_estimators": hp.choice("n_estimators", n_estimators),
    "max_depth": hp.quniform("max_depth", 1, 30,1),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "class_weight": hp.choice("class_weight", class_weight),
    "max_features": hp.choice("max_features", max_features),    
    }
    
    # Initialize trials object
    trials = Trials()
    
    best = fmin(fn=hyperparameter_tuning, space = space, algo=tpe.suggest, max_evals=100, trials=trials)
    
    hyp_params['n_estimators'] = n_estimators[best['n_estimators']]
    hyp_params['max_depth'] = best['max_depth']
    hyp_params['criterion'] = criterian[best['criterion']]
    hyp_params['class_weight'] = class_weight[best['class_weight']]
    hyp_params['max_features'] = max_features[best['max_features']]
    
    print("Best: {}".format(best))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [1:06:40<00:00, 40.00s/trial, best loss: -0.8359375]
Best: {'class_weight': 2, 'criterion': 1, 'max_depth': 28.0, 'max_features': 1, 'n_estimators': 7}


Training the random forest

In [184]:
rf = RandomForestClassifier(n_estimators=hyp_params['n_estimators'], n_jobs=1, random_state = 44, max_features=hyp_params['max_features'], \
                            oob_score=True, class_weight=hyp_params['class_weight'], max_depth=hyp_params['max_depth'], criterion=hyp_params['criterion'])

# Converting column y values to 1d array
rf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(criterion='entropy', max_depth=28.0, max_features='sqrt',
                       n_estimators=800, n_jobs=1, oob_score=True,
                       random_state=44)

Predicting using trained random forest 

In [185]:
# Selecting columns to train
test_X = X_test.loc[:, 'GP':'TOV']
# Selecting Ids for CSV
test_X_Ids = X_test.loc[:,'Id']

if use_kaggle_data==True:
    # Predicting probabilities for kaggle submission and selecting probability of class 1.
    pred = rf.predict_proba(test_X)[:,1]  
else:
    # Predicting classes (1 or 0) for calculating accuracy
    pred = rf.predict(test_X) 
    # Probabilities for calculating ROC
    rf_probs = rf.predict_proba(test_X)[:,1]

# Data frame with ID for csv writing. In Kaggle mode pred will contains probabilities and else contains classes
result = pd.DataFrame(data = {'Id': test_X_Ids, 'TARGET_5Yrs': pred}) 
# Extracting values for calculating stats
result_values = result[['TARGET_5Yrs']] 

Saving the trainned model and writing result to a CSV file

In [186]:
joblib.dump(rf, "../models/nuwan_random_forest_v13.joblib", compress=3)

['../models/nuwan_random_forest_v13.joblib']

Show stats related to performance of the model if not using Kaggle dataset

In [187]:
if use_kaggle_data==False:
    visualize.show_random_forest_stats(rf, test_X, y_test, rf_probs)
    # visualize.show_feature_importance(rf, X_train) # Uncomment to see feature importance if required
else:
    result.to_csv("../data/external/submission_nuwan_v13.csv", index = False)
    print("Kaggle dataset and no stats. Writing to a file.")

Average absolute error: 16.75%
ROC: 0.69202
