In [1]:
%matplotlib inline 

import numpy as np
import pandas as pd

from skfeature.function.information_theoretical_based import CMIM
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import xgboost as xgb

from collections import Counter
from datetime import datetime

# Execute the project's helper scripts and load their definitions into the
# notebook namespace. The helpers called below but not defined in this
# notebook (missing_value_features, create_age, create_age_mapping,
# remove_outliers, fill_missing_values_with_flag, encode_cat_features)
# presumably come from these two files -- TODO confirm which script
# provides which.
%run ../predict_voting_outcomes/data_preparation.py
%run ../predict_voting_outcomes/feature_preprocessing.py

# Seed NumPy's global RNG; the estimators below are additionally given their
# own explicit random_state / seed values.
np.random.seed(2016) # set seed

In [2]:
# Read the training data, the test data and the sample-submission template
# (the template receives the predictions in the final cell).
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train2016.csv',
        '../data/test2016.csv',
        '../data/sampleSubmission2016.csv',
    )
)

In [3]:
# Stack train and test rows into a single frame so the same feature
# engineering and encoding are applied to both. Row labels are not reset, so
# index values can repeat across the two sources.
data = pd.concat([train, test])

In [4]:
# feature engineering
def engineer_features(data):
    """Derive the engineered feature columns used by this notebook.

    Passes ``data`` through ``missing_value_features`` and then adds two
    columns to the result: ``Age`` (``create_age`` mapped over ``YOB``) and
    ``Age_Mapping`` (``create_age_mapping`` mapped over ``Age``).

    Returns the augmented frame.
    """
    enriched = missing_value_features(data)
    enriched['Age'] = enriched['YOB'].map(create_age)
    enriched['Age_Mapping'] = enriched['Age'].map(create_age_mapping)
    return enriched

# Add the engineered columns, then apply remove_outliers to the result.
data = remove_outliers(engineer_features(data))

# Every column except USER_ID (presumably a row identifier) is carried forward.
features = data.columns.drop(['USER_ID'])

# fill_missing_values_with_flag runs first, encode_cat_features second.
data = encode_cat_features(fill_missing_values_with_flag(data[features]))

In [6]:
# Rows with a non-null Party form the labelled set; the remaining rows are the
# ones to predict. Every column except Party is used as a feature.
mask = data['Party'].notnull()
features = data.columns.drop('Party')

X = data.loc[mask, features]
# Binary target: 1 for 'Democrat', 0 for any other non-null Party value.
y = data.loc[mask, 'Party'].eq('Democrat').astype(int)

Xtest = data.loc[~mask, features]

**Only include discrete variables for now**

In [102]:
# Keep only the int64 columns in both the labelled and the unlabelled frame.
X, Xtest = (frame.select_dtypes(include=['int64']) for frame in (X, Xtest))

**Split into training and test splits**

In [7]:
# Stratified 80/20 split of the labelled rows; (Xte, yte) is the held-out set.
Xtr, Xte, ytr, yte = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=12386,
)

**Feature Selection**

In [180]:
# Ask CMIM for 25 features, using the training split only.
# NOTE(review): this assumes CMIM.cmim returns just an array of column
# positions -- confirm for the installed skfeature version.
feature_indices = CMIM.cmim(Xtr.values, ytr.values, n_selected_features=25)
# Xtr is a row subset of X, so positional indices map onto X.columns.
selected_features = X.columns[feature_indices]

**Fit Models**

In [31]:
# Random-forest baseline, scored with 5-fold stratified CV on the training
# split using all columns of Xtr.
est = RandomForestClassifier(
    n_estimators=175,
    max_depth=5,
    max_features='sqrt',
    random_state=1231,
)
# NOTE(review): no shuffle is requested, so random_state presumably has no
# effect on the folds -- confirm against the sklearn.cross_validation docs.
skf = StratifiedKFold(ytr, n_folds=5, random_state=123)

cv_scores = cross_val_score(est, Xtr, ytr, cv=skf, scoring='accuracy', n_jobs=-1)

In [32]:
# Mean and standard deviation of the per-fold accuracies.
print('Mean cv-score: %f and std: %f'%(cv_scores.mean(), cv_scores.std()))

Mean cv-score: 0.613330 and std: 0.008747


In [33]:
# Refit the forest on the whole training split and score the held-out split.
yhat = est.fit(Xtr, ytr).predict(Xte)
print('Accuracy on held out set: %f'%(accuracy_score(yte, yhat)))

Accuracy on held out set: 0.618433


**XGBClassifier**

In [70]:
# Gradient-boosted trees. n_estimators acts as an upper bound because the fit
# in the next cell requests early stopping.
model = xgb.XGBClassifier(
    max_depth=3,
    learning_rate=0.03,
    min_child_weight=3,
    n_estimators=1000,
    colsample_bytree=0.5,
    subsample=0.9,
    seed=231,
)

In [71]:
%%time 

# NOTE(review): early stopping monitors the same held-out split that the next
# cell scores, so the accuracy reported there is likely optimistic.
model.fit(
    Xtr.values, ytr.values,
    eval_set=[(Xte.values, yte.values)],
    early_stopping_rounds=50,
    verbose=False,
)

CPU times: user 4.14 s, sys: 24 ms, total: 4.16 s
Wall time: 1.11 s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=3,
       min_child_weight=3, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=231, silent=True, subsample=0.9)

In [72]:
# Accuracy of the early-stopped model on the held-out split; the bare `score`
# expression makes the notebook display the value.
p = model.predict(Xte.values)
score = accuracy_score(yte, p)
score

0.63041474654377883

In [73]:
# NOTE(review): scales the early-stopped iteration count up, presumably to
# choose n_estimators when refitting on more data. The training split is 80%
# of the labelled rows (test_size=0.2), so confirm 0.9 is the intended divisor.
model.best_iteration / 0.9

181.11111111111111

**Bagging Different Classifiers**

In [72]:
class BaggingClassifier:
    """Hard-voting ensemble over a list of already-configured classifiers.

    Every estimator is fitted on the same data; ``predict`` returns, for each
    sample, the label predicted by the largest number of estimators.
    """

    @staticmethod
    def majority_vote(preds):
        """
        Given an array of predictions from various classifiers
        return single list with ensemble of predictions based on
        simple majority voting

        Input: list of list [[y1, y2, y3, ..], [y1, y2, y3, ...], ..],
               one inner sequence per classifier, all of the same length
        Output: final prediction [y1, y2, y3, ..]

        Ties go to the label that occurs first among the tied ones, i.e. the
        label predicted by the earliest classifier in ``preds``.

        Raises ValueError if ``preds`` is empty or if the inner sequences
        differ in length.
        """
        if len(preds) == 0:
            raise ValueError('At least one set of predictions is required')

        if len({len(pred) for pred in preds}) != 1:
            raise ValueError('Predictions must be of the same length')

        # Rows are classifiers, columns are samples. A plain ndarray is used
        # instead of np.matrix, which NumPy no longer recommends.
        pred_matrix = np.asarray(preds)

        # Transposing yields one row of votes per sample.
        return [Counter(votes).most_common(1)[0][0] for votes in pred_matrix.T]

    def __init__(self, estimators, voting='hard'):
        """
        estimators: list of classifier objects exposing ``fit(X, y)`` and
                    ``predict(X)``
        voting: kept on the instance for reference; only hard (majority)
                voting is implemented, so the value does not change results
        """
        self.estimators = estimators
        self.voting = voting
        self.fitted_models = []

    def fit(self, X, y=None):
        """
        Fit every estimator on (X, y) and return ``self``.

        The estimator objects are fitted in place; they are not copied.
        """
        # Start from an empty list so that calling fit() again does not
        # register the same models a second time.
        self.fitted_models = []

        for model in self.estimators:
            model.fit(X, y)
            self.fitted_models.append(model)

        return self

    def predict(self, X):
        """
        Return the majority-vote prediction for every row of X as a list.

        Raises ValueError when no model has been fitted yet.
        """
        preds = [model.predict(X) for model in self.fitted_models]
        return self.majority_vote(preds)

In [267]:
# Earlier configuration, kept for reference: each entry paired a number of
# selected features with an estimator.
# pipe1 = (15, xgb.XGBClassifier(max_depth=3, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=150, colsample_bytree=0.8, subsample=0.8, seed=231))
# pipe2 = (15, xgb.XGBClassifier(max_depth=4, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=120, colsample_bytree=0.8, subsample=0.8, seed=1231))
# pipe3 = (25, xgb.XGBClassifier(max_depth=3, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=53, colsample_bytree=0.8, subsample=0.8, seed=2231))
# pipe4 = (25, xgb.XGBClassifier(max_depth=4, learning_rate=0.08, min_child_weight=1, \
#                           n_estimators=78, colsample_bytree=0.8, subsample=0.8, seed=233331))

# 226.66666666666666

# Current ensemble members: four XGBoost classifiers that differ in depth,
# learning rate, min_child_weight, tree count and column subsampling. Each
# name is bound directly to a classifier (not to a tuple).
pipe1 = xgb.XGBClassifier(
    max_depth=4, learning_rate=0.05, min_child_weight=10,
    n_estimators=96, colsample_bytree=0.6, subsample=0.9, seed=231,
)

pipe2 = xgb.XGBClassifier(
    max_depth=4, learning_rate=0.05, min_child_weight=10,
    n_estimators=60, colsample_bytree=0.8, subsample=0.9, seed=231,
)

pipe3 = xgb.XGBClassifier(
    max_depth=3, learning_rate=0.03, min_child_weight=3,
    n_estimators=197, colsample_bytree=0.7, subsample=0.9, seed=231,
)

pipe4 = xgb.XGBClassifier(
    max_depth=3, learning_rate=0.03, min_child_weight=1,
    n_estimators=200, colsample_bytree=0.6, subsample=0.9, seed=231,
)

estimators = [pipe1, pipe2, pipe3, pipe4]

In [268]:
# Fit the ensemble on the training split only; the held-out split is scored
# in the next cell. The bare fit() call makes the notebook show its return value.
bag = BaggingClassifier(estimators)
bag.fit(Xtr, ytr)

<__main__.BaggingClassifier at 0x7f98123820f0>

In [269]:
# Majority-vote predictions for the held-out split and their accuracy.
# Note that this rebinds `yhat`, which earlier held the random-forest output.
yhat = bag.predict(Xte)
print('Accuracy on the held out set:%f '%(accuracy_score(yte, yhat)))

Accuracy on the held out set:0.636866 


**Full Training**

In [270]:
# Refit the ensemble on all labelled rows and predict the unlabelled rows.
# The estimator objects are the same ones fitted on the training split
# earlier; fit() refits them in place.
bag = BaggingClassifier(estimators)
bag.fit(X, y)
predictions = bag.predict(Xtest)

In [None]:
# Alternative model: the random forest restricted to the CMIM-selected
# features. Its output is stored under its own name so that running the
# notebook top to bottom does not overwrite the bagged `predictions` that the
# next cell writes to the submission file.
est.fit(X[selected_features], y)
rf_predictions = est.predict(Xtest[selected_features])

In [271]:
# Translate the 0/1 predictions back to party names (1 -> 'Democrat',
# anything else -> 'Republican') and write a timestamped submission file.
prediction_labels = [
    'Democrat' if label == 1 else 'Republican'
    for label in predictions
]
sub['Predictions'] = prediction_labels
timestamp = str(datetime.now()).replace(' ', '_')
sub.to_csv('../submissions/bag_xgbs_without_cmim%s.csv'%(timestamp), index=False)