In [21]:
import sys
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier 
from importlib.machinery import SourceFileLoader

# TODO: the Jupyter working directory is /notebooks, so "from src.data import ..." does not resolve, and passing
# --notebook-dir when launching the notebook did not help. Needs to be investigated and fixed. Until then the
# project modules are loaded directly from their file paths as a workaround.
def _load_module_from_path(module_name, file_path):
    """Load the Python source file at ``file_path`` and return it as a module named ``module_name``.

    Uses the spec-based importlib API instead of ``SourceFileLoader.load_module()``, which is deprecated.
    The module is registered in ``sys.modules`` under ``module_name`` (as ``load_module()`` did) and the
    entry is removed again if executing the file fails.
    """
    import importlib.util  # local import: only this workaround needs it

    loader = SourceFileLoader(module_name, file_path)
    spec = importlib.util.spec_from_loader(module_name, loader)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind
        sys.modules.pop(module_name, None)
        raise
    return module


process_test_data = _load_module_from_path('process_test_data', '../src/data/process_test_data.py')
visualize = _load_module_from_path('visualize', '../src/visualization/visualize.py')

# Run-mode switch:
#   True  -> run with the Kaggle train and test set for submission.
#   False -> use the split Kaggle train data for model optimization (calculating stats and parameter optimization).
use_kaggle_data = False

If the Kaggle data set is not being used for submission, split the training data into training (80%), testing (10%) and validation (10%) sets
and normalize the features using MinMaxScaler. Otherwise, load the full Kaggle data and predict on the Kaggle test set for submission.

In [22]:
if not use_kaggle_data:
    # Optimization mode: split the raw train file into train / test / validation parts;
    # '../data/processed' is passed as the second argument of split_and_normalize.
    (X_train, y_train, X_test, y_test, X_valid, y_valid) = process_test_data.split_and_normalize(
        '../data/raw/train.csv',
        '../data/processed',
    )
else:
    # Submission mode: only train features, train labels and test features are returned
    # (no y_test / validation variables are defined in this mode).
    (X_train, y_train, X_test) = process_test_data.load_kaggle_train_and_test_data(
        '../data/raw/train.csv',
        '../data/raw/test.csv',
    )

Original train shape: (8000, 21)
Concat shape: (8000, 20)
Files written to: ../data/processed
X_train shape: (6400, 19)
y_train shape: (6400, 1)
X_test shape: (800, 20)
y_test shape: (800, 1)
X_valid shape: (800, 20)
y_valid shape: (800, 1)


Check details of the data if required

In [23]:
# X_train.describe()

Dropping features that have low importance

In [24]:
# Drop the low-importance feature '3P Made' from both the training and the test features.
# The frames are rebound instead of mutated with inplace=True, and errors='ignore' is used,
# so re-running this cell after the column is already gone does not raise a KeyError.
X_train = X_train.drop(columns=['3P Made'], errors='ignore')
X_test = X_test.drop(columns=['3P Made'], errors='ignore')

Training the random forest

In [25]:
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was deprecated in
# scikit-learn 1.1 and removed in 1.3, so "sqrt" keeps the same model on every version.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=1,
    random_state=44,
    max_features="sqrt",
    oob_score=True,
    class_weight="balanced_subsample",
)
# Flatten the y_train column into a 1d array before fitting
rf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(class_weight='balanced_subsample', n_jobs=1,
                       oob_score=True, random_state=44)

Predicting using trained random forest 

In [26]:
# Model inputs: the columns from 'GP' through 'TOV' (label-based slice, both ends included).
# X_test also carries an 'Id' column, which is used below for the result frame.
X_test_features = X_test.loc[:, 'GP':'TOV']

# Plain truthiness test, matching the check used when the data was loaded, so every cell
# takes the same branch for any value of the flag.
if use_kaggle_data:
    # Predicting probabilities for kaggle submission and selecting probability of class 1.
    pred = rf.predict_proba(X_test_features)[:, 1]
else:
    # Predicting classes (1 or 0) for calculating accuracy
    pred = rf.predict(X_test_features)
    # Class-1 probabilities, passed to the stats call later in the notebook
    rf_probs = rf.predict_proba(X_test_features)[:, 1]

# Data frame with ID for csv writing
result = pd.DataFrame(data={'Id': X_test.loc[:, 'Id'], 'TARGET_5Yrs': pred})
# Extracting values for calculating stats
result_values = result[['TARGET_5Yrs']]

Saving the trained model. The result is written to a CSV file in the final cell when the Kaggle data set is used.

In [27]:
# Persist the fitted forest with joblib (compression level 3); the call is kept as the
# last expression so its return value is still shown as the cell output.
joblib.dump(
    value=rf,
    filename="../models/nuwan_random_forest_v7.joblib",
    compress=3,
)

['../models/nuwan_random_forest_v7.joblib']

Show performance statistics for the model if not using the Kaggle data set; otherwise write the submission CSV file

In [28]:
# Plain truthiness test, matching the check used when the data was loaded, so this cell
# takes the same branch as the loading cell for any value of the flag.
if use_kaggle_data:
    # Submission mode: there are no labels for the Kaggle test set, so only the CSV is written
    result.to_csv("../data/external/submission_nuwan_v7.csv", index=False)
    print("Kaggle dataset and no stats. Writing to a file.")
else:
    # Optimization mode: report stats on the held-out test split
    visualize.show_random_forest_stats(rf, X_test.loc[:, 'GP':'TOV'], y_test, rf_probs)
    # visualize.show_feature_importance(rf, X_train) # Uncomment to see feature importance if required

Average absolute error: 16.625%
ROC: 0.66966
