In [57]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from scipy import sparse

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [103]:
# external scripts
%run scripts/helper.py

In [3]:
# load data
survey_train_df = pd.read_csv('./data/train_FBFog7d.csv')
survey_test_df = pd.read_csv('./data/Test_L4P23N3.csv')

In [4]:
# shape of dataset
print 'shape of training set {0} and test set {1} '.format(survey_train_df.shape, survey_test_df.shape)

shape of training set (10357, 18) and test set (3387, 17) 


In [5]:
# load alcohol dataset
alcohol_df = pd.read_csv('./data/NewVariable_Alcohol.csv')

In [6]:
# merge this dataset with both training and test set
survey_train_df_merged = pd.merge(survey_train_df, alcohol_df, on='ID', how='left')
survey_test_df_merged = pd.merge(survey_test_df, alcohol_df, on='ID', how='left')

In [7]:
# shape of dataset
print 'shape of training set {0} and test set {1} '.format(survey_train_df_merged.shape, survey_test_df_merged.shape)

shape of training set (10357, 19) and test set (3387, 18) 


In [8]:
# map labels to int
emotion_dict = {'Very Happy': 0, 'Pretty Happy': 1, 'Not Happy': 2}

def map_emotions_to_int(emotion):
    return emotion_dict[emotion]

train_labels = survey_train_df_merged.Happy.map(map_emotions_to_int)

## Two Stage Modelling

In first stage we would predict whether person is happy or not and further then we would try to predict if he is very happy or not. 

* Labels in first stage ( 1 - happy, 0 - not happy )

In [9]:
train_labels_first_stage = (train_labels<2)*1

In [10]:
# list of columns we want
cols_we_want = survey_train_df_merged.columns.drop(['ID', 'Happy', 'babies', 'teens', 'preteen', 'TVhours'])

In [11]:
# data frame with only those columns that we want
train_df = survey_train_df_merged[cols_we_want]
test_df = survey_test_df_merged[cols_we_want]

In [12]:
# fill in the missing values
train_df.fillna('-999', inplace=True)
test_df.fillna('-999', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [13]:
from sklearn.feature_extraction import DictVectorizer

train = train_df.T.to_dict().values()
test = test_df.T.to_dict().values()

vec = DictVectorizer()
train = vec.fit_transform(train)
test = vec.transform(test)

## Split dataset

In [84]:
mask = split_dataset(train, train_labels_first_stage)

In [85]:
Xtrain = train[mask]
ytrain = train_labels_first_stage[mask]

Xtest = train[~mask]
ytest = train_labels_first_stage[~mask]

## Modelling

In [86]:
print('Ratio of happy class over all instances: {:.2f}'.format(float(train_labels_first_stage.sum()) / train_labels_first_stage.shape[0]))

Ratio of happy class over all instances: 0.87


** Pretty unbalanced dataset with 87% of the classes as being marked as happy **

In [87]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn import metrics

In [88]:
scorer_func = metrics.make_scorer(score, greater_is_better=True)

In [89]:
skf = cross_validation.StratifiedKFold(ytrain, n_folds=5, shuffle=True)
rf = RandomForestClassifier()

cv_scores = cross_validation.cross_val_score(rf, Xtrain, ytrain, scoring='accuracy', n_jobs=-1, cv=skf)

In [90]:
print 'Mean cross validation score across 5-folds {0}'.format(np.mean(cv_scores))

Mean cross validation score across 5-folds 0.934337295203


In [91]:
# test it out on separate examples
clf = xgb.XGBClassifier(learning_rate=0.003, objective='binary:logistic', subsample=0.7, colsample_bytree=0.7, min_child_weight=30)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=3)
calibrated_clf.fit(Xtrain, ytrain)


print 'Accuracy on unseen examples {0} '.format(metrics.accuracy_score(ytest, calibrated_clf.predict(Xtest)))

Accuracy on unseen examples 0.936293436293 


In [51]:
# spit out probabilities
train_probs = calibrated_clf.predict_proba(Xtrain)
test_probs = calibrated_clf.predict_proba(Xtest)

In [60]:
# add these probs as features
Xtrain_with_probs = sparse.hstack([Xtrain, train_probs])
Xtest_with_probs = sparse.hstack([Xtest, test_probs])

In [61]:
# target values
ytrain = train_labels[mask]
ytest = train_labels[~mask]

## Second Stage
Takes in the features along with predicted probabilities from first stage classifier

## Cross validation

In [80]:
skf = cross_validation.StratifiedKFold(ytrain, n_folds=5, shuffle=True)
clf_second_stage = RandomForestClassifier(min_samples_leaf=10, n_estimators=50)

cv_scores = cross_validation.cross_val_score(clf_second_stage, Xtrain_with_probs, ytrain, scoring=scorer_func, n_jobs=-1, cv=skf)

In [81]:
print 'Mean cross validation score across 5-folds {0}'.format(np.mean(cv_scores))

Mean cross validation score across 5-folds 0.696645859321


In [83]:
clf_second_stage.fit(Xtrain_with_probs, ytrain)
print 'score on the test set {0} '.format(score(ytest, clf_second_stage.predict(Xtest_with_probs)))

score on the test set 0.706692406692 


## Run on full training set

In [92]:
# Run calibrated classifier on full training set
clf = xgb.XGBClassifier(learning_rate=0.003, objective='binary:logistic', subsample=0.7, colsample_bytree=0.7, min_child_weight=30)
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=3)
calibrated_clf.fit(train, train_labels_first_stage)

CalibratedClassifierCV(base_estimator=XGBClassifier(base_score=0.5, colsample_bytree=0.7, gamma=0,
       learning_rate=0.003, max_delta_step=0, max_depth=3,
       min_child_weight=30, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=0.7),
            cv=3, method='isotonic')

In [93]:
train_probs = calibrated_clf.predict_proba(train)
test_probs = calibrated_clf.predict_proba(test)

In [94]:
train_with_probs = sparse.hstack([train, train_probs])
test_with_probs = sparse.hstack([test, test_probs])

In [95]:
# train second stage classifier
clf_second_stage.fit(train_with_probs, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [96]:
# predictions from second stage classifier
second_stage_preds = clf_second_stage.predict(test_with_probs)

In [101]:
prediction_labels = inverse_mapping_func(second_stage_preds)

In [106]:
create_submission_file('./submissions_week_long/multi_stage.csv', survey_test_df_merged.ID.values, prediction_labels)