In this notebook we randomly undersample the majority class (label 0, i.e. happy) to create balanced training sets.

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# We import custom utility functions for data processing and random forest training
from process_data import process
from train_forest import trainForests

In [10]:
# Read the training CSV into a DataFrame.
data = pd.read_csv('data/train.csv')

In [3]:
# Run the project's `process` helper on the training frame. The return value is
# discarded, so it presumably modifies `data` in place — TODO confirm in process_data.py.
process(data)

In [4]:
def select_features(n_trees, data, min_importance=0.1):
    """Return the names of the columns a random forest ranks as important.

    A single random forest is fitted using every column of ``data`` except
    the last as predictors and the last column as the class label. A column
    is kept when its feature importance is strictly greater than
    ``min_importance``.

    Parameters
    ----------
    n_trees : int
        Number of trees in the forest.
    data : pandas.DataFrame
        Predictor columns followed by the label in the last column.
    min_importance : float, default 0.1
        Importance cut-off; the default reproduces the previously
        hard-coded value.

    Returns
    -------
    list
        Names of the selected predictor columns, in their original order.
    """
    # Imported locally: the notebook-level import of this class is commented
    # out, so the name would otherwise be undefined on a fresh kernel.
    from sklearn.ensemble import RandomForestClassifier

    # `.ix` was removed from pandas; `.iloc` gives the same positional split.
    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1]

    rf = RandomForestClassifier(n_estimators=n_trees)
    rf.fit(features, labels)

    return [
        name
        for name, importance in zip(features.columns, rf.feature_importances_)
        if importance > min_importance
    ]

In [None]:
# Optional pre-filtering step: fit one large forest and keep only the columns
# it ranks as important, plus the label. Disabled by default.
if False:
    selected = select_features(1000, data) + ["TARGET"]
    data = data[selected]
    len(selected)

The performance is similar whether we take 3 features or 300...

In [14]:
# Mode switch: False holds out part of the data for local evaluation,
# True trains on the full training set (see the split cell below).
submission = False

In [15]:
# For local evaluation keep 20% of the rows aside as a test set (fixed seed
# for reproducibility); in submission mode train on everything.
if not submission:
    train, test = train_test_split(data, test_size=0.2, random_state=42)
else:
    train = data

According to the cross-validation analysis, the best results are obtained with N=4, w=1, N_forest=60, n_trees=300

In [58]:
# Hyper-parameters passed to trainForests in the next cell.
# NOTE(review): the markdown above quotes N_forest=60 and n_trees=300 as the
# best setting, while this cell uses 500 and 5 — confirm which is intended.
a = 0.25 # a can also be > 1 to additionally oversample the minority class (1, i.e. unhappy)
w = 1
N_forest = 500
n_trees = 5

In [59]:
# Train the ensemble. `rfs` is later handed to mean_ensemble and `Y_prob` is
# scored with roc_auc_score, so they are presumably the fitted forests and the
# predicted probabilities.
# NOTE(review): `test` is not passed in although Y_prob is compared against
# test['TARGET'] below — confirm how trainForests obtains the evaluation rows.
rfs, Y_prob = trainForests(train, a, w, N_forest,n_trees,submission)

In [60]:
# ROC AUC of the predicted probabilities against the held-out labels.
# `test` is only created by the split cell when `submission` is False, so this
# cell cannot run in submission mode.
roc_auc_score(test['TARGET'],Y_prob)

0.83483121884515321

In [None]:
# If desired, transform probabilities into class labels.
def threshold(Y_prob, threshold = 0.5):
    """Convert probabilities into 0/1 class labels.

    Each value in ``Y_prob`` that is less than or equal to ``threshold``
    maps to 0; every other value maps to 1. Returns a new list with one
    label per input value, in the same order.
    """
    return [0 if prob <= threshold else 1 for prob in Y_prob]

In [57]:
# Evaluate class labels obtained by cutting the probabilities at 0.5.
# NOTE(review): eval_classification is neither defined nor imported in the
# visible cells — it needs an import for Restart & Run All to succeed.
Y_pred = threshold(Y_prob, threshold = 0.5)
_ = eval_classification(test['TARGET'],Y_pred, print_results = True)

TP=454, FP=3164, FN=153, TN=11433
[[11433  3164]
 [  153   454]]
Missclasification error: 0.218166272034
Accuracy: 0.781833727966
Recall (Sensitivity, TPR): 0.747940691928
Specificity (TNR): 0.78324313215
Precision: 0.125483692648
F1-score: 0.214911242604
Area under ROC curve: 0.765591912039


In [None]:
# Plot feature importance
def plot_features(forest, n):
    """Bar-plot the ``n`` largest feature importances of a fitted forest.

    Parameters
    ----------
    forest : fitted ensemble
        Must expose ``feature_importances_`` and ``estimators_`` (each
        estimator exposing its own ``feature_importances_``).
    n : int
        Number of features to show. Values larger than the number of
        features are clamped to it.

    The bars are ordered from most to least important, labelled on the
    x-axis with the feature's column position, and carry the standard
    deviation of the importance across the individual trees as error bars.
    """
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)

    # Never request more bars than there are features.
    n = min(n, len(importances))
    # Keep only the n most important positions; without the slice the bar
    # heights would not match the n x-positions whenever n < n_features.
    indices = np.argsort(importances)[::-1][:n]

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(n), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(n), indices)
    plt.xlim([-1, n])
    plt.show()

## Only for submission

In [None]:
# Load the test CSV used for the submission (this rebinds `test`).
test = pd.read_csv('data/test.csv')
# Save the IDs before `process` runs on the frame; `.loc` replaces the
# `.ix` indexer, which no longer exists in current pandas.
test_id = test.loc[:, 'ID'].values
process(test)

In [None]:
# Predict on the submission test set with the trained forests.
# NOTE(review): mean_ensemble is not imported in the visible cells (only
# trainForests is imported from train_forest) — confirm and add the import.
Y_prob = mean_ensemble(rfs,test)

In [None]:
# Write the ID / TARGET pairs to the submission CSV, omitting the index column.
pd.DataFrame({"ID": test_id, "TARGET": Y_prob}).to_csv('submissions/rforest_ensemble.csv',index=False)