In [11]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.cross_validation import StratifiedKFold
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# set random seed
np.random.seed(1294)

In [3]:
train = pd.read_csv('../data/train.csv', index_col='ID')
test = pd.read_csv('../data/test.csv', index_col='ID')

In [4]:
# get features with zero standard deviation
def get_constant_features(df):
    columns = df.columns
    return [col for col in columns if df[col].std() == 0.0]

constant_features = get_constant_features(train)

# get features which are identical to other features
def get_identical_features(df):
    columns = df.columns
    identical_feat = []
    
    for i in range(len(columns)):
        for j in range(i+1, len(columns)):
            if (df[columns[i]] == df[columns[j]]).all():
                identical_feat.append(columns[i])
    
    return identical_feat

identical_feat = get_identical_features(train)

In [6]:
def get_features_to_remove(constant_features, identical_features):
    features_to_remove = []
    
    for feat in constant_features:
        features_to_remove.append(feat)
    
    for feat in identical_features:
        features_to_remove.append(feat)
    
    return features_to_remove

remove_features = get_features_to_remove(constant_features, identical_feat)
remove_features.append('TARGET')

In [7]:
reduced_features = train.columns.drop(remove_features)
X = train[reduced_features]
y = train.TARGET

test = test[reduced_features]

In [8]:
# replace -999999.000000 with 2 ( most common value )
X = X.replace(-999999.000000, 2)
test = test.replace(-999999.000000, 2)

## Feature Engineering

In [9]:
# number of zeros in each row as feature
X.loc[:, 'num_zeros'] = (X == 0).astype(int).sum(axis=1)
test.loc[:, 'num_zeros'] = (test == 0).astype(int).sum(axis=1)

## Train and Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=44)

## Stacking

In [44]:
n_trees = 200
n_folds = 5
early_stopping_rounds = 50

# Our level 0 classifiers
clfs = [
    RandomForestClassifier(n_estimators = n_trees, criterion = 'gini'),
    ExtraTreesClassifier(n_estimators = n_trees * 2, criterion = 'gini'),
    LogisticRegression(C=1.)
#     XGBClassifier(learning_rate=0.02, seed=1234, n_estimators=n_trees, max_depth=5, colsample_bytree=0.9, subsample=0.9)
]

# Ready for cross validation
skf = list(StratifiedKFold(y_train, n_folds))

# Pre-allocate the data
blend_train = np.zeros((X_train.shape[0], len(clfs))) # Number of training data x Number of classifiers
blend_test = np.zeros((X_test.shape[0], len(clfs))) # Number of testing data x Number of classifiers

print 'X_test.shape = %s' % (str(X_test.shape))
print 'blend_train.shape = %s' % (str(blend_train.shape))
print 'blend_test.shape = %s' % (str(blend_test.shape))

X_test.shape = (38010, 307)
blend_train.shape = (38010, 3)
blend_test.shape = (38010, 3)


In [None]:
# For each classifier, we train the number of fold times (=len(skf))
for j, clf in enumerate(clfs):
    print 'Training classifier [%s]' % (j)
    blend_test_j = np.zeros((X_test.shape[0], len(skf))) # Number of testing data x Number of folds , we will take the mean of the predictions later
    for i, (train_index, cv_index) in enumerate(skf):
        print 'Fold [%s]' % (i)

        # This is the training and validation set
        X_dev = X_train.iloc[train_index]
        Y_dev = y_train.iloc[train_index]
        X_cv = X_train.iloc[cv_index]
        Y_cv = y_train.iloc[cv_index]
    
        if j == 3:
            clf.fit(X_dev, Y_dev, early_stopping_rounds=early_stopping_rounds, eval_metric='auc',\
                    eval_set=[(X_cv, Y_cv)])
        else:
            clf.fit(X_dev, Y_dev)

        # This output will be the basis for our blended classifier to train against,
        # which is also the output of our classifiers
        if j == 3:
            blend_train[cv_index, j] = clf.predict_proba(X_cv, ntree_limit=clf.best_iteration)[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test, ntree_limit=clf.best_iteration)[:, 1]
        else:
            blend_train[cv_index, j] = clf.predict_proba(X_cv)[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]

    # Take the mean of the predictions of the cross validation set
    blend_test[:, j] = blend_test_j.mean(1)

print 'y_train.shape = %s' % (y_train.shape)

# Start blending!
bclf = LogisticRegression()
bclf.fit(blend_train, y_train)

# Predict now
Y_test_predict = bclf.predict_proba(blend_test)[:, 1]
score = roc_auc_score(y_test, Y_test_predict)
print 'roc_auc_score = %s' % (score)

Training classifier [0]
Fold [0]
Fold [1]
Fold [2]
Fold [3]
Fold [4]
Training classifier [1]

## Correlation among predictions from different classifiers

In [42]:
## Blend Test contains mean prediction for the three classifiers across different folds
pd.DataFrame(blend_test).corr()

Unnamed: 0,0,1
0,1.0,0.873545
1,0.873545,1.0


In [34]:
blend_tes

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [44]:
scaler = StandardScaler()
X_train = scaler.fit_transform(np.log(1 + X_train - X_train.min()))
X_test = scaler.transform(np.log(1 + X_test - X_test.min()))

clf = XGBClassifier(learning_rate=0.02, seed=1234, \
                    n_estimators=200, max_depth=5, colsample_bytree=0.8, subsample=0.9)

pipeline = Pipeline([('clf', clf)])

In [45]:
# fit on the training examples
pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1234, silent=True, subsample=0.9))])

In [46]:
# predictions on the training and test set
print 'ROC AUC score on training examples : %.3f ' %(roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1]))
print 'ROC AUC score on unseen examples : %.3f ' %(roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]))

ROC AUC score on training examples : 0.874 
ROC AUC score on unseen examples : 0.760 


## Train on full dataset

In [73]:
pipeline.fit(X, y)

Pipeline(steps=[('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=224, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1234, silent=True, subsample=0.9))])

## Prediction

In [86]:
predictions = pipeline.predict_proba(test)[:, 1]

## Submission

In [88]:
submission = pd.read_csv('../data/sample_submission.csv')
submission['TARGET'] = predictions
submission.to_csv('../submissions/reduced_features.csv', index=False)