In this notebook we take the approach of randomly undersampling the majority class (0 i.e. happy) to create balanced training data sets.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from process_data import process
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the labelled training set into `data`.
data = pd.read_csv('data/train.csv')

In [3]:
# Preprocess the training frame with the project helper. The return value is
# discarded, so this presumably modifies `data` in place — TODO confirm.
process(data)

In [4]:
def select_features(n_trees, data, min_importance=0.1):
    """Return the names of the features a random forest considers important.

    A forest with ``n_trees`` trees is fitted using every column of ``data``
    except the last as features and the last column as the label.

    Parameters
    ----------
    n_trees : int
        Number of trees in the forest used to rank the features.
    data : pandas.DataFrame
        Frame whose last column is the label and whose other columns are the
        candidate features.
    min_importance : float, default 0.1
        A feature is kept only when its importance is strictly greater than
        this value.

    Returns
    -------
    list
        Column names of the selected features, in their original order.
    """
    # `.ix` was removed from pandas; `.iloc` is the positional equivalent.
    features = data.iloc[:, :-1]
    labels = data.iloc[:, -1]

    rf = RandomForestClassifier(n_estimators=n_trees)
    rf.fit(features, labels)

    return [
        name
        for name, importance in zip(features.columns, rf.feature_importances_)
        if importance > min_importance
    ]

In [None]:
# Optional feature-selection step, switched off by the constant-False guard.
# When enabled it keeps only the columns returned by select_features
# (importance > 0.1) plus the TARGET label.
if False:
    selected = select_features(1000, data)
    selected+= ["TARGET"]  # label is appended so it stays the last column
    data = data[selected]
    len(selected)  # NOTE(review): a bare expression inside an `if` block is not displayed

The performance is similar whether we take 3 features or 300...

In [5]:
# True: train on all labelled rows (no hold-out evaluation).
# False: hold out part of the training data as `test` for evaluation.
submission = False

In [6]:
# In submission mode train on every labelled row; otherwise hold out 20% of
# the rows as `test` (fixed random_state so the split is repeatable).
if submission:
    train = data
else:
    train, test = train_test_split(data, test_size = 0.2, random_state = 42)

In [7]:
# Split the training rows by label so each class can be sampled independently:
# TARGET == 0 is the majority ("happy") class, TARGET == 1 the minority ("unhappy").
happy = train[train.TARGET == 0]
unhappy = train[train.TARGET == 1]

In [23]:
def generate_split(N,w):
    """Draw one randomly resampled training set from the two class frames.

    Reads the notebook-level frames ``unhappy`` (TARGET == 1) and ``happy``
    (TARGET == 0).

    Parameters
    ----------
    N : float
        A fraction ``1/N`` of the ``unhappy`` rows is sampled. With ``N < 1``
        the fraction exceeds 1, so sampling is done with replacement
        (oversampling).
    w : float
        The number of ``happy`` rows drawn is ``floor(w * n_unhappy_sampled)``.
        Sampling is with replacement only when more rows are requested than
        ``happy`` contains.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The shuffled sample without its last column (assumed to be the
        label column), and the ``TARGET`` column of the same rows.
    """
    minority = unhappy.sample(frac=1/N, replace=N < 1)

    n_majority = int(np.floor(w * minority.shape[0]))
    majority = happy.sample(n_majority, replace=n_majority > happy.shape[0])

    # DataFrame.append and `.ix` were removed from pandas; pd.concat and
    # .iloc/.loc are the supported equivalents.
    combined = pd.concat([majority, minority], ignore_index=True)
    shuffled = combined.sample(frac=1).reset_index(drop=True)
    return shuffled.iloc[:, :-1], shuffled.loc[:, 'TARGET']

In [24]:
def mean_ensemble(rfs,X_test):
    """Average the second ``predict_proba`` column over a list of models.

    Parameters
    ----------
    rfs : sequence
        Fitted models exposing ``predict_proba``.
    X_test : array-like
        Feature rows passed unchanged to every model.

    Returns
    -------
    pandas.Series
        One value per row of ``X_test``: the mean over all models of column
        index 1 of ``predict_proba`` (presumably the TARGET == 1 probability
        for 0/1 labels). An empty series is returned when ``rfs`` is empty.
    """
    if len(rfs) == 0:
        return pd.Series(dtype=float)
    # Collect every model's column once and average in a single step rather
    # than growing a DataFrame with pd.concat on each iteration.
    positive = [np.asarray(rf.predict_proba(X_test))[:, 1] for rf in rfs]
    return pd.Series(np.mean(positive, axis=0))

In [25]:
def trainForests(N, w, N_forest, n_trees, submission):
    """Train an ensemble of random forests, each on its own resampled split.

    Parameters
    ----------
    N, w :
        Forwarded to ``generate_split`` for every forest.
    N_forest : int
        Number of forests to train.
    n_trees : int
        Number of trees in each forest.
    submission : bool
        When true no hold-out prediction is made.

    Returns
    -------
    (list, pandas.Series or None)
        The fitted forests, and the ``mean_ensemble`` prediction on the
        notebook-level ``test`` frame without its last column (``None`` when
        ``submission`` is true).
    """
    rfs = []
    for _ in range(N_forest):
        X_train, Y_train = generate_split(N, w)
        forest = RandomForestClassifier(n_estimators=n_trees, class_weight='balanced')
        forest.fit(X_train, Y_train)
        rfs.append(forest)

    if submission:
        return rfs, None
    # `.ix` was removed from pandas; `.iloc` drops the last (label) column
    # positionally, exactly as before.
    return rfs, mean_ensemble(rfs, test.iloc[:, :-1])
In [26]:
def _safe_ratio(numerator, denominator):
    """Return numerator/denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def eval_classification(Y_test, Y_pred, print_results = False):
    """Compute binary-classification metrics from hard 0/1 predictions.

    Parameters
    ----------
    Y_test : array-like
        True labels (0 or 1).
    Y_pred : array-like
        Predicted labels. These must be 0/1 values, not probabilities.
    print_results : bool, default False
        When true, also print the confusion matrix and every metric.

    Returns
    -------
    list
        ``[miss, recall, spec, prec, f1, auc]``. A ratio whose denominator is
        zero is reported as NaN.

    Notes
    -----
    The TP/FP/FN/TN summary line is printed on every call, regardless of
    ``print_results``.
    """
    n = len(Y_test)
    # Fix the label order so the matrix is always 2x2, even when only one
    # class is present in the inputs (otherwise cm[1][1] raises IndexError).
    cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
    tn, fp = cm[0][0], cm[0][1]  # true negatives, false positives
    fn, tp = cm[1][0], cm[1][1]  # false negatives, true positives
    print('TP={}, FP={}, FN={}, TN={}'.format(tp,fp,fn,tn))

    miss = _safe_ratio(fp + fn, n)       # misclassification error
    accu = 1 - miss                      # accuracy
    recall = _safe_ratio(tp, tp + fn)    # true positive rate (sensitivity)
    spec = _safe_ratio(tn, tn + fp)      # true negative rate (specificity)
    prec = _safe_ratio(tp, tp + fp)      # precision
    f1 = _safe_ratio(2 * (prec * recall), prec + recall)  # F1 score
    auc = roc_auc_score(Y_test, Y_pred)  # area under the ROC curve

    if print_results:
        print(cm)
        print("Missclasification error:", miss)
        print("Accuracy:",accu)
        print("Recall (Sensitivity, TPR):", recall)
        print("Specificity (TNR):", spec)
        print("Precision:", prec)
        print("F1-score:", f1)
        print("Area under ROC curve:", auc)

    return [miss, recall, spec, prec, f1, auc]

In [27]:
def threshold(Y_prob, threshold = 0.5):
    """Convert probabilities into hard 0/1 labels.

    A value maps to 0 when it is less than or equal to ``threshold`` and to 1
    otherwise. The result is a plain list in the order of ``Y_prob``.
    """
    return [0 if prob <= threshold else 1 for prob in Y_prob]

According to the cross-validation analysis, the best results are obtained with N=4, w=1, N_forest=60, n_trees=300

In [58]:
# Parameters passed to trainForests below.
N = 4          # a fraction 1/N of the minority class (1 i.e. unhappy) is sampled; N < 1 oversamples it
w = 1          # majority sample size = floor(w * minority sample size)
N_forest = 500 # number of forests in the ensemble
n_trees = 5    # trees per forest
# NOTE(review): the text above this cell cites N_forest=60, n_trees=300 as the
# best cross-validated setting, which differs from the values used here — confirm.

In [59]:
# Train the ensemble. Y_prob is the averaged hold-out prediction, or None when
# `submission` is True.
rfs, Y_prob = trainForests(N,w,N_forest,n_trees,submission)

In [60]:
# ROC AUC of the averaged ensemble probabilities on the hold-out split
# (requires submission == False, otherwise Y_prob is None).
roc_auc_score(test['TARGET'],Y_prob)

0.83483121884515321

In [57]:
# Turn the ensemble probabilities into hard 0/1 class labels, then report the
# classification metrics on the hold-out split.
Y_pred = threshold(Y_prob, threshold = 0.5)
_ = eval_classification(test['TARGET'],Y_pred, print_results = True)

TP=454, FP=3164, FN=153, TN=11433
[[11433  3164]
 [  153   454]]
Missclasification error: 0.218166272034
Accuracy: 0.781833727966
Recall (Sensitivity, TPR): 0.747940691928
Specificity (TNR): 0.78324313215
Precision: 0.125483692648
F1-score: 0.214911242604
Area under ROC curve: 0.765591912039


In [None]:
# Plot feature importance
def plot_features(forest, n):
    """Bar-plot the ``n`` most important features of a fitted forest.

    Parameters
    ----------
    forest :
        Fitted ensemble exposing ``feature_importances_`` and ``estimators_``.
    n : int
        Number of top-ranked features to draw. Values larger than the number
        of features are clamped.

    Bars are ordered by decreasing importance, labelled with the feature's
    column index, and carry the standard deviation of that feature's
    importance across the individual trees as an error bar.
    """
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)

    n = min(n, len(importances))
    # Keep only the n best features: the bar heights, error bars and tick
    # labels must all have the same length as the x positions.
    top = np.argsort(importances)[::-1][:n]
    positions = range(n)

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(positions, importances[top],
            color="r", yerr=std[top], align="center")
    plt.xticks(positions, top)
    plt.xlim([-1, n])
    plt.show()

## Only for submission

In [None]:
# Load the competition set. NOTE: this rebinds `test`, replacing the hold-out
# split created earlier in the notebook.
test = pd.read_csv('data/test.csv')
# Capture the IDs before preprocessing. `.ix` was removed from pandas; `.loc`
# is the label-based equivalent.
test_id = test.loc[:, 'ID'].values
process(test)  # presumably modifies `test` in place, like the training frame — TODO confirm

In [None]:
# Averaged ensemble prediction for the submission rows.
# NOTE(review): unlike the hold-out evaluation, no column is dropped here, so
# this assumes process(test) leaves exactly the training feature columns — verify.
Y_pred = mean_ensemble(rfs,test)

In [None]:
# Write the ID and predicted probability columns to the submission CSV (no index column).
pd.DataFrame({"ID": test_id, "TARGET": Y_pred}).to_csv('submissions/rforest_ensemble_feature.csv',index=False)