In [31]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

In [2]:
# fix NumPy's global random state so NumPy-driven randomness is repeatable
SEED = 1294
np.random.seed(SEED)

In [3]:
# both CSVs are read with their 'ID' column as the row index
train_path = '../data/train.csv'
test_path = '../data/test.csv'

train = pd.read_csv(train_path, index_col='ID')
test = pd.read_csv(test_path, index_col='ID')

In [52]:
# candidate features: every training column except the label
target_column = 'TARGET'
features = train.columns.drop(target_column)

# reduce the feature set by removing features with constant values
def remove_constant_features(features, train_df=None, test_df=None, max_dominance=0.99):
    """Return the features that vary enough in both the train and test frames.

    A feature is kept only when all of the following hold:

    * it has more than one distinct value in the training frame,
    * it has more than one distinct value in the test frame,
    * its most frequent value in the test frame covers strictly less than
      ``max_dominance`` of the test rows.

    Parameters
    ----------
    features : iterable of str
        Candidate column names; each one is looked up in both frames.
    train_df, test_df : pandas.DataFrame, optional
        Frames to inspect. When omitted, the notebook-level ``train`` and
        ``test`` frames are used (the original behaviour).
    max_dominance : float, optional
        Exclusive upper bound on the share of test rows taken by a feature's
        most frequent value. Defaults to 0.99.

    Returns
    -------
    list
        The retained feature names, in the order they were supplied.
    """
    # fall back to the notebook globals so existing one-argument calls still work
    if train_df is None:
        train_df = train
    if test_df is None:
        test_df = test

    n_test_rows = float(len(test_df))
    kept = []
    for feature in features:
        train_column = train_df[feature]
        test_column = test_df[feature]

        # constant in either frame -> carries no signal
        if len(train_column.unique()) <= 1 or len(test_column.unique()) <= 1:
            continue

        # share of test rows occupied by the single most common value
        top_value_share = test_column.value_counts().max() / n_test_rows
        if top_value_share < max_dominance:
            kept.append(feature)

    return kept

In [53]:
reduced_features = remove_constant_features(features)

# Take an explicit copy: later cells add columns to X, and assigning into a
# slice of `train` raises SettingWithCopyWarning and may not behave as intended.
X = train[reduced_features].copy()
y = train.TARGET

In [55]:
# number of zeros in each row as feature
# Count over the same reduced feature set in both frames so the feature means
# the same thing at fit and predict time; restricting to `reduced_features`
# also keeps the cell idempotent (derived columns are never counted).
X.loc[:, 'num_zeros'] = (X[reduced_features] == 0).sum(axis=1)
test.loc[:, 'num_zeros'] = (test[reduced_features] == 0).sum(axis=1)

In [56]:
# binary flag: 1 when a row has more than 340 zeros, else 0
# NOTE(review): 340 is hard-coded; the column name suggests it is a median of
# num_zeros, but the cell that computed it is not in this notebook - confirm.
zero_count_threshold = 340
X.loc[:, 'num_zeros_greater_than_median'] = (X.num_zeros > zero_count_threshold).astype(np.int64)
test.loc[:, 'num_zeros_greater_than_median'] = (test.num_zeros > zero_count_threshold).astype(np.int64)

## Train and Test Split

In [57]:
# hold out half of the labelled rows, stratified on the target, with a fixed split seed
split_options = dict(test_size=0.5, stratify=y, random_state=44)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [70]:
# The classifier is the only pipeline step; preprocessing steps
# (e.g. StandardScaler, SelectKBest) can be inserted ahead of 'clf'.
xgb_params = dict(
    learning_rate=0.02,
    seed=1234,
    n_estimators=224,
    max_depth=5,
    colsample_bytree=0.8,
    subsample=0.9,
)
clf = XGBClassifier(**xgb_params)

steps = [('clf', clf)]
pipeline = Pipeline(steps)

In [71]:
# train the pipeline on the training half of the split only
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=224, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1234, silent=True, subsample=0.9))])

In [72]:
# ROC AUC on the training half and on the held-out half, scored from the
# predicted probability of the positive class (column 1 of predict_proba).
train_auc = roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1])
holdout_auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

# single-argument parenthesised print emits the same text on Python 2 and 3
print('ROC AUC score on training examples : %.3f ' % train_auc)
print('ROC AUC score on unseen examples : %.3f ' % holdout_auc)

ROC AUC score on training examples : 0.878 
ROC AUC score on unseen examples : 0.836 


## Train on full dataset

In [None]:
# refit the same pipeline on every labelled row
pipeline.fit(X=X, y=y)

## Prediction

In [18]:
# `test` still carries every original column, while the pipeline was fitted on
# X (reduced features plus the derived ones). Select the fitted columns, in
# the fitted order, before scoring.
predictions = pipeline.predict_proba(test[X.columns])[:, 1]

## Submission

In [19]:
# fill the sample submission's TARGET column with the predicted probabilities
# (assigned positionally) and write it out without the row index
sample_path = '../data/sample_submission.csv'
output_path = '../submissions/reduced_features.csv'

submission = pd.read_csv(sample_path)
submission['TARGET'] = predictions
submission.to_csv(output_path, index=False)