In [1]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM

In [2]:
# load the data
df_train = pd.read_csv('train.csv/train.csv')
df_test = pd.read_csv('test.csv/test.csv')

In [3]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

In [4]:
df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

In [27]:
df_train['TARGET'].values

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
#prepare data for model
y_train = df_train['TARGET'].values
X_train = df_train.drop(['ID','TARGET'], axis=1).values

In [9]:
id_test = df_test['ID']
X_test = df_test.drop(['ID'], axis=1).values

In [10]:
# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, 
                        learning_rate=0.03, nthread=4, subsample=0.95, 
                        colsample_bytree=0.85, seed=4242)

In [11]:
X_fit, X_eval, y_fit, y_eval= train_test_split(X_train, y_train, test_size=0.3)

In [12]:
# fitting
clf.fit(X_train, y_train, early_stopping_rounds=20, 
        eval_metric="auc", eval_set=[(X_eval, y_eval)])

Will train until validation_0 error hasn't decreased in 20 rounds.
[0]	validation_0-auc:0.817908
[1]	validation_0-auc:0.819647
[2]	validation_0-auc:0.819267
[3]	validation_0-auc:0.820721
[4]	validation_0-auc:0.808922
[5]	validation_0-auc:0.812752
[6]	validation_0-auc:0.813505
[7]	validation_0-auc:0.814464
[8]	validation_0-auc:0.814879
[9]	validation_0-auc:0.814983
[10]	validation_0-auc:0.809959
[11]	validation_0-auc:0.809572
[12]	validation_0-auc:0.810201
[13]	validation_0-auc:0.811047
[14]	validation_0-auc:0.809826
[15]	validation_0-auc:0.811432
[16]	validation_0-auc:0.807852
[17]	validation_0-auc:0.808929
[18]	validation_0-auc:0.810947
[19]	validation_0-auc:0.809815
[20]	validation_0-auc:0.810504
[21]	validation_0-auc:0.810153
[22]	validation_0-auc:0.810929
[23]	validation_0-auc:0.810595
Stopping. Best iteration:
[3]	validation_0-auc:0.820721



XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [13]:
print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

('Overall AUC:', 0.83922943918574733)


In [24]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

In [25]:
y_pred

array([ 0.26126292,  0.26583791,  0.24533668, ...,  0.24521795,
        0.27286133,  0.24499308], dtype=float32)

In [26]:
submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("submission5.csv", index=False)