### References 

- 

#### Load Libraries

In [207]:
#### Load Libraries
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [203]:
#### K-Fold Cross Validation - Random Forest
####
#### Fits one random forest per fold of the training partition and scores the
#### shared hold-out set (X_val) with every fitted model, giving one probability
#### column per fold in results_df.
#### NOTE(review): X and y are not defined in any cell shown here -- they must be
#### created earlier in the notebook for this cell to survive Restart & Run All.

#### Define parameters
num_trees = 100
num_splits = 12
random_seed = 42  # shared by the hold-out split and the forests so reruns reproduce


def _take_rows(data, row_positions):
    """Return the rows of `data` found at the given integer positions.

    KFold.split yields positional indices. Plain `[]` on a pandas DataFrame
    looks up columns and on a Series looks up labels, so pandas objects are
    indexed through `.iloc`; anything else (e.g. a NumPy array) is indexed
    directly, exactly as before.
    """
    if hasattr(data, "iloc"):
        return data.iloc[row_positions]
    return data[row_positions]


#### Train-test sets (10% held out as the validation set every fold model scores)
X_training, X_val, y_training, y_val = train_test_split(
    X, y, test_size=0.10, random_state=random_seed
)

#### K - Folds
kf = KFold(n_splits=num_splits)
print(kf)

#### Per-fold outputs, keyed by "rf_fold_<n>"
models = {}            # fitted classifier
fold_scores = {}       # mean accuracy on the fold's own held-out part
fold_probabilities = {}  # probability column for X_val

#### Loop through each fold and fit the model
for fold_number, (train_index, test_index) in enumerate(kf.split(X_training), start=1):
    fold_name = "rf_fold_" + str(fold_number)

    #### Train test data for this fold
    X_train, X_test = _take_rows(X_training, train_index), _take_rows(X_training, test_index)
    y_train, y_test = _take_rows(y_training, train_index), _take_rows(y_training, test_index)

    #### Define classifier -- seeded per fold so every run gives the same forests
    rf_model = RandomForestClassifier(
        n_estimators=num_trees, random_state=random_seed + fold_number
    )

    #### fit model
    rf_model.fit(X_train, y_train)

    #### evaluate on the held-out part of the fold (previously built but never used)
    fold_scores[fold_name] = rf_model.score(X_test, y_test)

    #### predict probabilities for the validation set
    #### column 1 is the second entry of rf_model.classes_ -- assumes a binary
    #### target whose positive class sorts last; TODO confirm
    fold_probabilities[fold_name] = rf_model.predict_proba(X_val)[:, 1]

    #### Store model instance in models dict
    models[fold_name] = rf_model

#### One column per fold, built once instead of inserting column-by-column
results_df = pd.DataFrame(fold_probabilities)

KFold(n_splits=12, random_state=None, shuffle=False)


In [204]:
#### Generate new features -- score related
####
#### Adds row-wise summary columns to results_df from the per-fold probability
#### columns. Safe to re-run: only the "rf_fold_*" columns feed the statistics
#### and the derived columns are assigned by name, so a second run overwrites
#### them instead of failing on a duplicate insert.

#### Per-fold probability columns only (derived columns must not feed the stats)
fold_columns = [col for col in results_df.columns if str(col).startswith("rf_fold_")]
if not fold_columns:
    raise ValueError(
        "results_df has no 'rf_fold_*' columns; run the cross-validation cell first"
    )
fold_probs = results_df[fold_columns]
num_folds = len(fold_columns)  # taken from the data rather than the num_splits global

#### Mean and standard deviation for probabilities
mean_score = fold_probs.mean(axis=1)
score_std = fold_probs.std(axis=1)

#### num values greater than threshold (strictly greater: exactly 0.5 does not count)
threshold = 0.5
num_val_gt_than_thres = fold_probs.gt(threshold).sum(axis=1)

#### Calculate final prediction based on model votes
#### A row is labelled 1 when at least ceil(num_folds / 2) folds vote 1, so with
#### an even number of folds a tie resolves to 1.
pred_threshold = np.ceil(num_folds / 2) - 1
prediction = np.where(num_val_gt_than_thres > pred_threshold, 1, 0)

#### fraction of folds voting 1 (not an accuracy figure)
percent_ones = num_val_gt_than_thres / num_folds

#### Append the derived columns; assignment keeps this order on the first run
results_df["mean_score"] = np.round(mean_score, 2)
results_df["score_std"] = np.round(score_std, 2)
results_df["gt_than_thres"] = num_val_gt_than_thres
results_df["prediction"] = prediction
results_df["percent_ones"] = np.round(percent_ones, 2)

In [206]:
#### Preview the first 10 rows of results_df
results_df.head(10)

Unnamed: 0,rf_fold_1,rf_fold_2,rf_fold_3,rf_fold_4,rf_fold_5,rf_fold_6,rf_fold_7,rf_fold_8,rf_fold_9,rf_fold_10,rf_fold_11,rf_fold_12,mean_score,score_std,gt_than_thres,prediction,percent_ones
0,0.42,0.39,0.47,0.39,0.5,0.43,0.47,0.44,0.56,0.36,0.43,0.49,0.45,0.06,1,0,0.08
1,0.7,0.85,0.75,0.74,0.74,0.78,0.7,0.85,0.84,0.78,0.8,0.8,0.78,0.05,12,1,1.0
2,0.2,0.11,0.15,0.17,0.21,0.14,0.13,0.16,0.16,0.19,0.19,0.12,0.16,0.03,0,0,0.0
3,0.95,0.96,0.97,0.94,0.96,0.9,0.9,0.93,0.93,0.9,0.95,0.96,0.94,0.03,12,1,1.0
4,0.97,0.99,0.98,0.99,0.99,0.99,0.98,1.0,0.98,0.96,1.0,0.99,0.98,0.01,12,1,1.0
5,0.97,0.98,0.96,0.98,0.99,1.0,0.96,0.99,0.97,0.96,0.99,0.97,0.98,0.01,12,1,1.0
6,0.95,0.89,0.92,0.94,0.92,0.95,0.9,0.91,0.96,0.91,0.96,0.92,0.93,0.02,12,1,1.0
7,1.0,0.93,0.97,0.96,0.98,0.98,0.98,0.98,0.96,0.97,0.97,0.98,0.97,0.02,12,1,1.0
8,0.14,0.1,0.16,0.08,0.14,0.09,0.08,0.05,0.07,0.05,0.09,0.09,0.1,0.04,0,0,0.0
9,0.03,0.03,0.02,0.05,0.03,0.0,0.03,0.03,0.04,0.03,0.02,0.01,0.03,0.01,0,0,0.0
