In [31]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
import pickle

In [44]:

# load the data
df_train = pd.read_csv('train.csv/train.csv')
df_test = pd.read_csv('test.csv/test.csv')

#load the pickle where extra computer features are stored
my_features = pickle.load(open("my_features.pickle", "rb"))

In [45]:
feature_to_clean = ['ind_var2_0', 'ind_var2', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var28', 'saldo_var27', 'saldo_var41', 'saldo_var46', 'imp_amort_var18_hace3', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1', 'num_reemb_var13_hace3', 'num_reemb_var33_hace3', 'num_trasp_var17_out_hace3', 'num_trasp_var33_out_hace3', 'saldo_var2_ult1', 'saldo_medio_var13_medio_hace3',
                   'ind_var29_0', 'ind_var29', 'ind_var13_medio', 'ind_var18', 'ind_var26', 
                     'ind_var25', 'ind_var32', 'ind_var34', 'ind_var37', 'ind_var39', 'num_var29_0', 
                     'num_var29', 'num_var13_medio', 'num_var18', 'num_var26', 'num_var25', 'num_var32', 
                     'num_var34', 'num_var37', 'num_var39', 'saldo_var29', 'saldo_medio_var13_medio_ult1', 
                     'delta_num_reemb_var13_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 
                     'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 
                     'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3',
                   'delta_imp_reemb_var33_1y3',  'imp_reemb_var17_hace3', 'imp_reemb_var33_ult1', 'imp_trasp_var17_in_hace3', 'num_reemb_var17_hace3', 'num_reemb_var33_ult1', 'num_trasp_var17_in_hace3']



df_train.drop(feature_to_clean, axis=1, inplace=True)
df_test.drop(feature_to_clean, axis=1, inplace=True)

## Add Sum of 0's

In [46]:
df_train['sum_of_0'] = my_features['sum_of_0']

## Add Nationality Patch

In [50]:
def fix_nationality(nat):
    for value in nat:
        if value == -999999:
            return 2
        else:
            return value

df_train['var3'] = df_train[['var3']].apply(fix_nationality, axis=1)

0          2
1          2
2          2
3          2
4          2
5          2
6          2
7          2
8          2
9          2
10         2
11         2
12         2
13         2
14         2
15         2
16         2
17         2
18       229
19         2
20         2
21         2
22         2
23         2
24         2
25         2
26         2
27         2
28         2
29         2
        ... 
75990      2
75991      2
75992      2
75993      2
75994      2
75995      2
75996      2
75997      2
75998      2
75999      2
76000      2
76001      2
76002      2
76003      2
76004      2
76005      2
76006      2
76007      2
76008      2
76009      2
76010      2
76011      2
76012      4
76013      2
76014      2
76015      2
76016      2
76017      2
76018      2
76019      2
Name: var3, dtype: int64




In [51]:
#prepare data for model
y_train = df_train['TARGET'].values
X_train = df_train.drop(['ID','TARGET'], axis=1).values

In [52]:
id_test = df_test['ID']
X_test = df_test.drop(['ID'], axis=1).values

In [53]:
# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, 
                        learning_rate=0.03, nthread=4, subsample=0.95, 
                        colsample_bytree=0.85, seed=4242)

In [54]:
X_fit, X_eval, y_fit, y_eval= train_test_split(X_train, y_train, test_size=0.3)

In [55]:
# fitting
clf.fit(X_train, y_train, early_stopping_rounds=20, 
        eval_metric="auc", eval_set=[(X_eval, y_eval)])

Will train until validation_0 error hasn't decreased in 20 rounds.
[0]	validation_0-auc:0.768519
[1]	validation_0-auc:0.783029
[2]	validation_0-auc:0.809720
[3]	validation_0-auc:0.793603
[4]	validation_0-auc:0.795985
[5]	validation_0-auc:0.799391
[6]	validation_0-auc:0.806389
[7]	validation_0-auc:0.810384
[8]	validation_0-auc:0.804231
[9]	validation_0-auc:0.802876
[10]	validation_0-auc:0.804977
[11]	validation_0-auc:0.803552
[12]	validation_0-auc:0.805443
[13]	validation_0-auc:0.801921
[14]	validation_0-auc:0.804785
[15]	validation_0-auc:0.806669
[16]	validation_0-auc:0.806245
[17]	validation_0-auc:0.808654
[18]	validation_0-auc:0.809759
[19]	validation_0-auc:0.807287
[20]	validation_0-auc:0.808914
[21]	validation_0-auc:0.810080
[22]	validation_0-auc:0.811500
[23]	validation_0-auc:0.812543
[24]	validation_0-auc:0.813898
[25]	validation_0-auc:0.814883
[26]	validation_0-auc:0.816105
[27]	validation_0-auc:0.817544
[28]	validation_0-auc:0.819007
[29]	validation_0-auc:0.821311
[30]	validati

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [56]:
print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

('Overall AUC:', 0.88291825762611453)


In [57]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

In [58]:
y_pred

array([ 0.05047565,  0.06481396,  0.00199505, ...,  0.00441693,
        0.07138801,  0.00156516], dtype=float32)

In [59]:
submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("submission15.csv", index=False)

In [None]:
'''
ADDITIONAL FUNCTIONS FOR FEATURE ENGINEERING
==============================================================
import pickle
to_save = {
    "sum_of_0":df_train['sum_of_0']
}
pickle.dump(to_save, open('my_features.pickle','a'))
===============================================================
#NAME: ADD SUM OF 0'S
# select on the features, leaving out the 'TARGET' feature


features = [f for f in df_train.columns if f != 'TARGET' and f!='ID' ]

def calculate_zeros(cols):
    sum_array = [] 
    for value in cols:
        if value == 0:
            sum_array.append(value)
    return len(sum_array)

df_train['sum_of_0'] = df_train[features].apply(calculate_zeros, axis=1)

===============================================================

# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1, len(c)):
        if np.array_equal(v, df_train[c[j]].values):
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

================================================================

# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)


'''