In [34]:
import pandas as pd
import numpy as np

from collections import defaultdict

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# set random seed
# np.random.seed(1294)

In [3]:
train = pd.read_csv('../data/train.csv', index_col='ID')
test = pd.read_csv('../data/test.csv', index_col='ID')

In [4]:
# get features with zero standard deviation
def get_constant_features(df):
    columns = df.columns
    return [col for col in columns if df[col].std() == 0.0]

constant_features = get_constant_features(train)

# get features which are identical to other features
def get_identical_features(df):
    columns = df.columns
    identical_feat = []
    
    for i in range(len(columns)):
        for j in range(i+1, len(columns)):
            if (df[columns[i]] == df[columns[j]]).all():
                identical_feat.append(columns[i])
    
    return identical_feat

identical_feat = get_identical_features(train)

In [11]:
def get_features_to_remove(constant_features, identical_features):
    features_to_remove = []
    
    for feat in constant_features:
        features_to_remove.append(feat)
    
    for feat in identical_features:
        features_to_remove.append(feat)
    
    return features_to_remove

remove_features = get_features_to_remove(constant_features, identical_feat)
remove_features.append('delta_imp_aport_var13_1y3')
remove_features.append('delta_num_aport_var13_1y3')
remove_features.append('TARGET')

In [12]:
reduced_features = train.columns.drop(remove_features)
X = train[reduced_features]
y = train.TARGET

test = test[reduced_features]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=123)

In [14]:
print 'X_train shape %s and X_test shape %s ' %(X_train.shape, X_test.shape) 

X_train shape (38010, 304) and X_test shape (38010, 304) 


In [35]:
## model specification
model = XGBClassifier(seed=4242)

In [40]:
parameters = {
        'n_estimators': [100, 250, 500],
        'learning_rate': [0.05, 0.1, 0.3],
        'max_depth': [3, 5],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.7, 0.8],
    }

clf = GridSearchCV(model, parameters, scoring='roc_auc', n_jobs=1, cv=3)

In [41]:
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 250, 500], 'subsample': [0.8, 0.9], 'learning_rate': [0.05, 0.1, 0.3], 'colsample_bytree': [0.7, 0.8], 'max_depth': [3, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [42]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print score
for param_name in sorted(best_parameters.keys()):
    print "%s: %r" % (param_name, best_parameters[param_name])

0.839090694067
colsample_bytree: 0.7
learning_rate: 0.1
max_depth: 3
n_estimators: 100
subsample: 0.8


In [48]:
# train different models
model1 = XGBClassifier(n_estimators=100, colsample_bytree=0.7, subsample=0.8, seed=4242, learning_rate=0.1, max_depth=3)
model2 = XGBClassifier(n_estimators=100, colsample_bytree=0.65, subsample=0.75, min_child_weight=2, seed=1234, learning_rate=0.1, max_depth=2)

model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.65,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=2, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1234, silent=True, subsample=0.75)

In [49]:
predictions1 = model1.predict_proba(X_test)[:, 1]
predictions2 = model2.predict_proba(X_test)[:, 1]

print 'Out of sample prediction AUC score ', roc_auc_score(y_test, predictions1)
print 'Out of sample prediction AUC score ', roc_auc_score(y_test, predictions2)

Out of sample prediction AUC score  0.835365548697
Out of sample prediction AUC score  0.832326664678


## Rank Ensembling

In [18]:
def transform_for_ranked(preds, index):
    ranks = []

    for i, pred in enumerate(preds):
        ranks.append((index[i], pred))

    return ranks

def ranked_averaging(predictions):
    all_ranks = defaultdict(list)

    for i, preds in enumerate(predictions):
        individual_ranks = []

        for e, pred in enumerate(preds):
            individual_ranks.append( (float(pred[1]), e, pred[0]) )

        for rank, item in enumerate( sorted(individual_ranks) ) :
            all_ranks[(item[1], item[2])].append(rank)

    average_ranks = []

    for k in sorted(all_ranks):
        average_ranks.append((sum(all_ranks[k])/len(all_ranks[k]),k))

    ranked_ranks = []

    for rank, k in enumerate(sorted(average_ranks)):
        ranked_ranks.append((k[1][0],k[1][1],(rank * 1.)/(len(average_ranks)-1)))
    return sorted(ranked_ranks)

In [50]:
## average predictions of linear and gbm model to see how it performs

transformed_gbm_predictions_1 = transform_for_ranked(predictions1, X_test.index.values)
transformed_gbm_predictions_2 = transform_for_ranked(predictions2, X_test.index.values)

prediction_ranks = ranked_averaging([transformed_gbm_predictions_1, transformed_gbm_predictions_2])
ensemble_ranks = [k3 for k1, k2, k3 in prediction_ranks]

In [51]:
print 'AUC score after ensembling ranks %f ' %(roc_auc_score(y_test, ensemble_ranks))

AUC score after ensembling ranks 0.834479 


In [277]:
pd.DataFrame({'rf': rfPredictions, 'gbm': gbmPredictions}).corr()

Unnamed: 0,gbm,rf
gbm,1.0,0.798724
rf,0.798724,1.0


## Stacking

In [62]:
## Creates blending set for training and test and list of classifiers

def get_blending_sets(X_train, y_train, X_test, n_folds=5,):
    n_trees = 100
    n_folds = n_folds

    # Our level 0 classifiers
    clfs = [
        RandomForestClassifier(n_estimators = n_trees, criterion = 'gini', n_jobs=-1),
        ExtraTreesClassifier(n_estimators = n_trees * 2, criterion = 'gini', n_jobs=-1),
        RandomForestClassifier(n_estimators = n_trees, criterion = 'entropy', n_jobs=-1),
        ExtraTreesClassifier(n_estimators = n_trees * 2, criterion = 'entropy', n_jobs=-1),
        XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, colsample_bytree=0.7, subsample=0.8)
    ]

    # Ready for cross validation
    skf = list(StratifiedKFold(y_train, n_folds))

    # Pre-allocate the data
    blend_train = np.zeros((X_train.shape[0], len(clfs))) # Number of training data x Number of classifiers
    blend_test = np.zeros((X_test.shape[0], len(clfs))) # Number of testing data x Number of classifiers

    print 'X_test.shape = %s' % (str(X_test.shape))
    print 'blend_train.shape = %s' % (str(blend_train.shape))
    print 'blend_test.shape = %s' % (str(blend_test.shape))
    
    return blend_train, blend_test, clfs, skf

In [63]:
## Takes in classifier, training set, labels, and test set
## on which we predict stuff on

def stacking(clfs, X_train, y_train, X_test, blend_train, blend_test, skf, y_test=None):
    
    # For each classifier, we train the number of fold times (=len(skf))
    for j, clf in enumerate(clfs):
        print 'Training classifier [%s]' % (j)
        blend_test_j = np.zeros((X_test.shape[0], len(skf))) # Number of testing data x Number of folds , we will take the mean of the predictions later
        for i, (train_index, cv_index) in enumerate(skf):
            print 'Fold [%s]' % (i)

            # This is the training and validation set
            X_dev = X_train.iloc[train_index]
            Y_dev = y_train.iloc[train_index]
            X_cv = X_train.iloc[cv_index]
            Y_cv = y_train.iloc[cv_index]

            clf.fit(X_dev, Y_dev)

            # This output will be the basis for our blended classifier to train against,
            # which is also the output of our classifiers
            blend_train[cv_index, j] = clf.predict_proba(X_cv)[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]

        # Take the mean of the predictions of the cross validation set
        blend_test[:, j] = blend_test_j.mean(1)

    print 'y_train.shape = %s' % (y_train.shape)

    # Start blending!
    bclf = LogisticRegression()
    bclf.fit(blend_train, y_train)

    # Predict now
    Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
    
    if y_test:
        score = roc_auc_score(y_test, Y_test_predict)
        print 'roc_auc_score = %s' % (score)
    
    return Y_test_predict

In [64]:
blend_train, blend_test, clfs, skf = get_blending_sets(X_train, y_train, X_test)

X_test.shape = (38010, 304)
blend_train.shape = (38010, 5)
blend_test.shape = (38010, 5)


In [65]:
predictionsStacking = stacking(clfs, X_train, y_train, X_test, blend_train, blend_test, skf)

Training classifier [0]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [1]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [2]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [3]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [4]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
y_train.shape = 38010


In [66]:
print 'ROC AUC for stacking %f ' %(roc_auc_score(y_test, predictionsStacking))

ROC AUC for stacking 0.836646 


## Train on full dataset

In [70]:
blend_train, blend_test, clfs = get_blending_sets(X, y, test)

X_test.shape = (75818, 307)
blend_train.shape = (76020, 5)
blend_test.shape = (75818, 5)


In [71]:
predictions = stacking(clfs, X, y, test, blend_train, blend_test)

Training classifier [0]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [1]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [2]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [3]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [4]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
y_train.shape = 76020


## Submission

In [74]:
submission = pd.read_csv('../data/sample_submission.csv')
submission['TARGET'] = predictions
submission.to_csv('../submissions/stacking.csv', index=False)