In [29]:
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.pipeline import Pipeline

In [7]:
# set random seed
np.random.seed(1294)

In [2]:
train = pd.read_csv('../data/train.csv', index_col='ID')
test = pd.read_csv('../data/test.csv', index_col='ID')

In [4]:
features = train.columns.drop('TARGET')

X = train[features]
y = train.TARGET

In [6]:
# number of zeros in each row as feature
X['num_zeros'] = (X == 0).astype(int).sum(axis=1)
test['num_zeros'] = (test == 0).astype(int).sum(axis=1)

In [15]:
X['num_zeros_greater_than_median'] = X.num_zeros.map(lambda x: int(x > 340))
test['num_zeros_greater_than_median'] = test.num_zeros.map(lambda x: int(x > 340))

## Train and Test Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=44)

In [50]:
scaler = StandardScaler()
pca = PCA(n_components=100, whiten=True)
clf = ExtraTreesClassifier(n_estimators=1000, max_depth=10)

pipeline = Pipeline([('scaler', scaler), ('pca', pca), ('clf', clf)])

In [51]:
# fit on the training examples
pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=100, whiten=True)), ('clf', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [52]:
# predictions on the training and test set
print 'ROC AUC score on training examples : %.3f ' %(roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1]))
print 'ROC AUC score on unseen examples : %.3f ' %(roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]))

ROC AUC score on training examples : 0.835 
ROC AUC score on unseen examples : 0.777 


## Train on full dataset

In [53]:
pipeline.fit(X, y)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=100, whiten=True)), ('clf', ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=10, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

## Prediction

In [55]:
predictions = pipeline.predict_proba(test)[:, 1]

## Submission

In [56]:
submission = pd.read_csv('../data/sample_submission.csv')
submission['TARGET'] = predictions
submission.to_csv('../submissions/extra_trees_pipeline', index=False)