In [1]:
%matplotlib inline 

import numpy as np
import pandas as pd

from skfeature.function.information_theoretical_based import CMIM
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from collections import Counter
from datetime import datetime

%run ../predict_voting_outcomes/data_preparation.py
%run ../predict_voting_outcomes/feature_preprocessing.py
%run ../predict_voting_outcomes/cross_validation_setup.py

# Seed NumPy's global RNG so later np.random calls (e.g. the per-estimator
# XGBoost seeds drawn further down) repeat when the cells are run in order.
np.random.seed(2016)

In [2]:
# Load the full table from disk. It holds both labelled rows and rows whose
# `Party` is missing; the latter are the ones predicted at the end.
data = pd.read_csv('../data/imputed_data.csv')

In [3]:
# Fill missing values. `fill_missing_values` is not defined in this notebook;
# presumably it comes from one of the scripts %run in the first cell -- check
# there for the fill strategy.
data = fill_missing_values(data)

In [4]:
# Encode the categorical feature columns. `encode_cat_features` is not defined
# in this notebook (presumably provided by a %run script above).
# NOTE(review): `Party` seems to be left as text, since the next cell still
# compares it with 'Democrat' -- confirm in the helper.
data = encode_cat_features(data)

In [5]:
# Every column except the target is a candidate feature.
features = data.columns.drop('Party')

# Rows with a known Party are used for modelling; rows without one are the
# ones that need predictions.
mask = data['Party'].notnull()

# Binary target: 1 for 'Democrat', 0 for anything else.
X = data.loc[mask, features]
y = data.loc[mask, 'Party'].eq('Democrat').astype(int)

Xtest = data.loc[~mask, features]

**Only include discrete (int64) variables for now**

In [6]:
# Apply the same int64-only column filter to the labelled and unlabelled frames.
X, Xtest = (frame.select_dtypes(include=['int64']) for frame in (X, Xtest))

**Split into training and held-out sets**

In [7]:
# Hold out part of the labelled rows for evaluation. `split_dataset` is not
# defined in this notebook -- presumably it comes from a %run script above;
# check there for the split ratio and whether it stratifies on y.
Xtr, Xte, ytr, yte = split_dataset(X, y)

**Feature selection**

In [180]:
# Pick 25 features with CMIM using the training split only.
# NOTE(review): this assumes CMIM.cmim returns just an array of column indices;
# some scikit-feature releases may return a tuple instead -- confirm for the
# installed version.
feature_indices = CMIM.cmim(Xtr.values, ytr.values, n_selected_features=25)
# The indices are positions in Xtr's columns; indexing X.columns with them is
# only valid if split_dataset leaves the column order unchanged -- verify.
selected_features = X.columns[feature_indices]

**Cross-validation**

In [21]:
# Baseline: a shallow random forest, scored with the project's
# cross-validation helper on the training split only.
est = RandomForestClassifier(
    n_estimators=1000,
    max_depth=5,
    max_features='sqrt',
    random_state=12313,
)
cv_scores = perform_cross_validation(Xtr, ytr, est)
print('Mean cv score: %f and std: %f' % (cv_scores.mean(), cv_scores.std()))

Mean cv score: 0.620119 and std: 0.010071


In [22]:
# Fit the forest on the whole training split and check it against the
# held-out split (the saved run printed 0.612208).
yhat = est.fit(Xtr, ytr).predict(Xte)
print('Accuracy on held out set: %f' % accuracy_score(yte, yhat))

Accuracy on held out set: 0.612208


**XGBClassifier**

In [114]:
# Single gradient-boosted model. n_estimators is only an upper bound here:
# the fit cell below passes early_stopping_rounds.
model = xgb.XGBClassifier(
    max_depth=3,
    learning_rate=0.09,
    gamma=2,
    min_child_weight=1,
    n_estimators=1000,
    colsample_bytree=0.6,
    subsample=0.9,
    seed=1231,
)

In [115]:
%%time 

# Train with early stopping: boosting halts once the eval metric has not
# improved for 50 rounds.
# NOTE(review): the eval set is the held-out split that the next cell also
# scores on, so that accuracy is tuned on its own test data and is likely
# optimistic.
model.fit(Xtr.values, ytr.values,
          eval_set=[(Xte.values, yte.values)],
          early_stopping_rounds=50,
          verbose=False)

CPU times: user 2.36 s, sys: 36 ms, total: 2.4 s
Wall time: 670 ms


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=2, learning_rate=0.09, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1231, silent=True, subsample=0.9)

In [116]:
# Accuracy of the model fitted above on the held-out split.
# NOTE(review): this is the same split that was passed as eval_set for early
# stopping, so the figure is not an unbiased estimate.
p = model.predict(Xte.values)
score = accuracy_score(yte, p)
score

0.63195691202872528

In [117]:
# Scale the early-stopping round count up by 1/0.9.
# NOTE(review): presumably an estimate of how many rounds to use when refitting
# on more data (0.9 may mirror a training fraction or `subsample`); the value is
# not used by any later cell shown here -- confirm intent.
model.best_iteration / 0.9

132.22222222222223

**Bagging different classifiers**

In [111]:
class BaggingClassifier:
    """
    Hard-voting ensemble over a list of already-constructed estimators.

    Every estimator is fitted on the same data passed to ``fit`` (no bootstrap
    resampling happens here), and ``predict`` returns, for each sample, the
    label predicted by the most estimators.
    """

    @staticmethod
    def majority_vote(preds):
        """
        Given an array of predictions from various classifiers
        return single list with the ensemble prediction based on
        simple majority voting

        Input: list of list [[y1, y2, y3, ..], [y1, y2, y3, ...], ..],
               one inner sequence per classifier, all of the same length
        Output: final prediction [y1, y2, y3, ..]

        Ties go to the tied label that is seen first, i.e. the vote of the
        earliest classifier in ``preds`` (this relies on Counter preserving
        insertion order, which holds on Python 3.7+).

        Raises ValueError when ``preds`` is empty or when the inner sequences
        differ in length.
        """
        if len(preds) == 0:
            raise ValueError('At least one set of predictions is required')

        if len({len(pred) for pred in preds}) != 1:
            raise ValueError('Predictions must be of the same length')

        # Transpose so that each row holds all classifiers' votes for one
        # sample. A plain ndarray is used instead of the discouraged np.matrix.
        votes_by_sample = np.asarray(preds).T

        return [Counter(votes).most_common(1)[0][0] for votes in votes_by_sample]

    def __init__(self, estimators, voting='hard'):
        """
        estimators: list of estimator objects exposing ``fit(X, y)`` and
                    ``predict(X)``; they are fitted in place by ``fit``
        voting:     kept for reference only -- hard (majority) voting is the
                    only scheme implemented
        """
        self.estimators = estimators
        self.voting = voting
        self.fitted_models = []

    def fit(self, X, y=None):
        """
        Fit every estimator on (X, y) and return self.

        The list of fitted models is rebuilt on each call, so fitting the same
        instance again does not leave duplicate entries that would each vote
        twice in ``predict``.
        """
        fitted = []

        for model in self.estimators:
            model.fit(X, y)
            fitted.append(model)

        self.fitted_models = fitted
        return self

    def predict(self, X):
        """
        Return the majority-vote prediction for every row of X as a list.

        Raises ValueError if ``fit`` has not produced any fitted model yet.
        """
        if not self.fitted_models:
            raise ValueError('No fitted models available; call fit first')

        preds = [model.predict(X) for model in self.fitted_models]
        return self.majority_vote(preds)

In [118]:
# pipe1 = (15, xgb.XGBClassifier(max_depth=3, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=150, colsample_bytree=0.8, subsample=0.8, seed=231))
# pipe2 = (15, xgb.XGBClassifier(max_depth=4, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=120, colsample_bytree=0.8, subsample=0.8, seed=1231))
# pipe3 = (25, xgb.XGBClassifier(max_depth=3, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=53, colsample_bytree=0.8, subsample=0.8, seed=2231))
# pipe4 = (25, xgb.XGBClassifier(max_depth=4, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=78, colsample_bytree=0.8, subsample=0.8, seed=233331))

# 226.66666666666666

# Ten XGBoost models with identical hyper-parameters; each one gets its own
# seed drawn from NumPy's global RNG.
estimators = [
    xgb.XGBClassifier(
        max_depth=3,
        learning_rate=0.09,
        gamma=2,
        min_child_weight=1,
        n_estimators=1000,
        colsample_bytree=0.6,
        subsample=0.9,
        seed=np.random.randint(0, 1000),
    )
    for _ in range(10)
]

In [119]:
# Fit the ensemble on the training split. `fit` returns the ensemble itself,
# which is why the cell echoes the object's repr.
bag = BaggingClassifier(estimators)
bag.fit(Xtr, ytr)

<__main__.BaggingClassifier at 0x7fe76a1baa90>

In [120]:
# Majority-vote predictions for the held-out split and their accuracy.
# Note that `yhat` is reused here; it previously held the random forest's output.
yhat = bag.predict(Xte)
print('Accuracy on the held out set:%f '%(accuracy_score(yte, yhat)))

Accuracy on the held out set:0.625673 


**Full training**

In [121]:
# Refit on every labelled row, then predict the rows whose Party is missing.
# The `estimators` objects already fitted on the training split are reused and
# refitted in place; only the wrapper object is new.
bag = BaggingClassifier(estimators)
bag.fit(X, y)
predictions = bag.predict(Xtest)

In [None]:
# Optional comparison model: the random forest restricted to the CMIM-selected
# features, refitted on every labelled row. Its output is stored under its own
# name so that running the notebook top to bottom does not overwrite the bagged
# XGBoost `predictions`, which the submission cell writes to a file named
# 'bag_10_xgbs_without_cmim...'.
est.fit(X[selected_features], y)
rf_predictions = est.predict(Xtest[selected_features])

In [123]:
# Build the submission: turn the 0/1 predictions back into party names and
# write them to a timestamped file so earlier submissions are not overwritten.
sub = pd.read_csv('../data/sampleSubmission2016.csv')
prediction_labels = ['Democrat' if label == 1 else 'Republican' for label in predictions]
sub['Predictions'] = prediction_labels
# Same text as str(datetime.now()) with the date/time space replaced by '_'.
timestamp = datetime.now().isoformat('_')
sub.to_csv('../submissions/bag_10_xgbs_without_cmim%s.csv' % timestamp, index=False)