** Experiment - 2**

Goal of this experiment is to see how far off can we go with RandomForest Model with only a single predictor.

1. Set up cross-validation scheme
2. Grid search to find out optimal values ( hyperopt or bayes opt )
3. Report results.

In [38]:
%matplotlib inline

import pandas as pd
import numpy as np
import gc
import scipy as sp

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.externals import joblib

from bayes_opt import BayesianOptimization

pd.set_option('max_columns', None)

sns.set_style('dark')

SEED = 213123
np.random.seed(SEED)

import warnings
warnings.filterwarnings('ignore')

%run ../src/data/make_dataset.py
%run ../src/models/cross_validation.py

In [5]:
dataset = Dataset('../data/raw/4b699168-4-here_dataset/')

dataset.load_files()\
       .encode_target()\
       .rename_target()\
       .concat_data()\
       .save_data('../data/processed/processed.feather')

<__main__.Dataset at 0x7f8a02d65518>

In [6]:
data       = dataset.data
train_mask = dataset.get_train_mask() 

In [10]:
features = ['AngleOfSign']
label    = 'Target'

X = data.loc[train_mask, features]
y = data.loc[train_mask, label]

Xtest = data.loc[~train_mask, features]

In [11]:
params = {
    'stratify': y,
    'test_size': .3,
    'random_state': SEED
}

X_train, X_test, y_train, y_test = get_train_test_split(X, y, **params)

In [19]:
y_train.value_counts(normalize=True)

0.0    0.554178
2.0    0.406956
3.0    0.021345
1.0    0.017521
Name: Target, dtype: float64

In [21]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=SEED)
ll_scores = cross_validation(X_train, y_train, rf, SEED)




In [22]:
print('Mean ll score: {0} and std: {1}'.format(np.mean(ll_scores), np.std(ll_scores)))

Mean ll score: 0.29389675048673886 and std: 0.0019488475691916876


In [36]:
def rfccv(n_estimators, min_samples_split, max_depth):
    skf = StratifiedKFold(n_splits=3, random_state=SEED)
    val = cross_val_score(
        RandomForestClassifier(n_estimators=int(n_estimators),
                               min_samples_split=int(min_samples_split),
                               max_depth=int(max_depth),
                               random_state=SEED
                              ),
        X_train, y_train, scoring='neg_log_loss', cv=skf
    ).mean()
    
    return val

def parameter_search():
    gp_params = {
        'alpha': 1e-5
    }
    
    rfcBO = BayesianOptimization(
        rfccv,
        {
            'n_estimators': (10, 250),
            'min_samples_split': (2, 25),
            'max_depth': (5, 30)
        }
    )
    
    rfcBO.maximize(n_iter=10, **gp_params)
    print('RFC: %f' % rfcBO.res['max']['max_val'])

In [37]:
parameter_search()

[31mInitialization[0m
[94m----------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   min_samples_split |   n_estimators | 
    1 | 00m05s | [35m  -0.25019[0m | [32m    19.1442[0m | [32m            14.9380[0m | [32m      188.4220[0m | 
    2 | 00m05s |   -0.26961 |     20.7204 |              8.8383 |       200.7449 | 
    3 | 00m05s | [35m  -0.21208[0m | [32m    16.1495[0m | [32m            19.7687[0m | [32m      185.5168[0m | 
    4 | 00m01s |   -0.26320 |     26.5018 |             16.6650 |        62.3570 | 
    5 | 00m01s |   -0.27414 |     17.3623 |              4.5441 |        43.7993 | 
[31mBayesian Optimization[0m
[94m----------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   max_depth |   min_samples_split |   n_estimators | 
    6 | 00m16s | [35m  -0.13777[0m | [32m     5.0000[0m | [32m            25.0000[0m | [32m   

In [39]:
def test_model(X_train, y_train, X_test, y_test):
    rf = RandomForestClassifier(n_estimators=250, 
                                max_depth=5, 
                                min_samples_split=25, 
                                random_state=SEED)
    
    rf.fit(X_train, y_train)
    preds = rf.predict_proba(X_test)
    print('Log Loss on test set: {}'.format(log_loss(y_test, preds)))

In [40]:
test_model(X_train, y_train, X_test, y_test)

Log Loss on test set: 0.12788299648110937


In [42]:
def full_training(X, y, Xtest, save=True):
    rf = RandomForestClassifier(n_estimators=250, 
                                max_depth=5, 
                                min_samples_split=25, 
                                random_state=SEED)
    
    rf.fit(X, y)
    final_preds = rf.predict_proba(Xtest)
    
    if save:
        joblib.dump(rf, '../models/rf_model_angle_of_sign.pkl')
        
    return final_preds

In [44]:
final_preds = full_training(X, y, Xtest)

In [46]:
data.loc[~train_mask, :].head(2)

Unnamed: 0,AngleOfSign,DetectedCamera,Id,SignAspectRatio,SignHeight,SignWidth,Target
0,67,Right,2c9180975a056a64015a1e10d3f270fe,0.63,169,107,
1,16,Front,2c9180975a056a64015a1de4deb16bdc,0.88,69,61,


In [47]:
sample_sub = dataset.sub
sample_sub.loc[:, ['Front', 'Left', 'Rear', 'Right']] = final_preds

In [49]:
sample_sub.to_csv('../submissions/predict_sign/rf_angle_of_sign.csv', index=False)