In [155]:
from __future__ import division

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
import pickle

In [156]:
# load the data
df_train = pd.read_csv('train.csv/train.csv')
df_test = pd.read_csv('test.csv/test.csv')

#load the pickle where extra computer features are stored
my_features = pickle.load(open("sums.pickle", "rb"))

In [157]:
feature_to_clean = ['ind_var2_0', 'ind_var2', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var28', 'saldo_var27', 'saldo_var41', 'saldo_var46', 'imp_amort_var18_hace3', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1', 'num_reemb_var13_hace3', 'num_reemb_var33_hace3', 'num_trasp_var17_out_hace3', 'num_trasp_var33_out_hace3', 'saldo_var2_ult1', 'saldo_medio_var13_medio_hace3',
                   'ind_var29_0', 'ind_var29', 'ind_var13_medio', 'ind_var18', 'ind_var26', 
                     'ind_var25', 'ind_var32', 'ind_var34', 'ind_var37', 'ind_var39', 'num_var29_0', 
                     'num_var29', 'num_var13_medio', 'num_var18', 'num_var26', 'num_var25', 'num_var32', 
                     'num_var34', 'num_var37', 'num_var39', 'saldo_var29', 'saldo_medio_var13_medio_ult1', 
                     'delta_num_reemb_var13_1y3', 'delta_num_reemb_var17_1y3', 'delta_num_reemb_var33_1y3', 
                     'delta_num_trasp_var17_in_1y3', 'delta_num_trasp_var17_out_1y3', 
                     'delta_num_trasp_var33_in_1y3', 'delta_num_trasp_var33_out_1y3',
                   'delta_imp_reemb_var33_1y3',  'imp_reemb_var17_hace3', 'imp_reemb_var33_ult1', 'imp_trasp_var17_in_hace3', 'num_reemb_var17_hace3', 'num_reemb_var33_ult1', 'num_trasp_var17_in_hace3']



df_train.drop(feature_to_clean, axis=1, inplace=True)
df_test.drop(feature_to_clean, axis=1, inplace=True)

## Add binary features for each value
1. here I want to add additional columns for each value in each feature.
2. For now I will just try add add "1" for where "0" is found

In [158]:
features_extraction = ['num_var13_corto', 'num_var13_corto_0', 'num_meses_var12_ult3', 'num_meses_var13_corto_ult3',
                      'num_meses_var39_vig_ult3', 'num_meses_var5_ult3','num_var24_0','num_var12','var36',
                      'num_var5','num_var5_0','num_var12_0','num_var13','num_var13_0','num_var42','num_var4',
                      'num_var42_0','num_var30','num_var39_0','num_var41_0']


In [159]:
#add a binary new feature for each category in each of the features extraction

def add_binary_features(cols,cat):
    for value in cols:
        if value == cat:
            return 1
        else:
            return 0 

tracker = 1
for f in features_extraction:
    categories = df_train[f].unique()
    for cat in categories:
        df_train[f+"_"+str(cat)] = df_train[[f]].apply(add_binary_features, args=(cat,), axis=1)
    print "Done with " + str(tracker)
    tracker+=1


'\ndef add_binary_features(cols,cat):\n    for value in cols:\n        if value == cat:\n            return 1\n        else:\n            return 0 \n\ntracker = 1\nfor f in features_extraction:\n    categories = df_train[f].unique()\n    for cat in categories:\n        df_train[f+"_"+str(cat)] = df_train[[f]].apply(add_binary_features, args=(cat,), axis=1)\n    print "Done with " + str(tracker)\n    tracker+=1\n'

In [160]:
#df_train.drop(features_extraction, axis=1, inplace=True)
#df_test.drop(features_extraction, axis=1, inplace=True)

## Add Sum of 0's

In [161]:
features = [f for f in df_train.columns if f != 'TARGET' and f!='ID' ]
#df_train['sum_of_0'] = my_features['sum_of_0']
def calculate_zeros(cols):
    sum_array = [] 
    for value in cols:
        if value == 0:
            sum_array.append(value)
    return len(sum_array)

df_train['sum_of_0'] = df_train[features].apply(calculate_zeros, axis=1)
df_test['sum_of_0'] = df_test[features].apply(calculate_zeros, axis=1)

In [162]:
df_train['sum_of_0']

0        285
1        259
2        270
3        241
4        249
5        285
6        270
7        271
8        285
9        276
10       253
11       285
12       266
13       243
14       211
15       270
16       290
17       244
18       255
19       270
20       285
21       270
22       248
23       248
24       285
25       201
26       285
27       285
28       270
29       237
        ... 
75990    217
75991    269
75992    270
75993    285
75994    277
75995    271
75996    270
75997    270
75998    285
75999    235
76000    257
76001    270
76002    246
76003    236
76004    246
76005    248
76006    227
76007    285
76008    268
76009    238
76010    285
76011    222
76012    246
76013    240
76014    230
76015    285
76016    254
76017    271
76018    270
76019    285
Name: sum_of_0, dtype: int64

In [163]:
#prepare data for model
y_train = df_train['TARGET'].values
X_train = df_train.drop(['ID','TARGET'], axis=1).values

In [164]:
id_test = df_test['ID']
X_test = df_test.drop(['ID'], axis=1).values

In [165]:
# classifier
clf = xgb.XGBClassifier(missing=np.nan, max_depth=5, n_estimators=350, 
                        learning_rate=0.03, nthread=4, subsample=0.95, 
                        colsample_bytree=0.85, seed=4242)

In [166]:
X_fit, X_eval, y_fit, y_eval= train_test_split(X_train, y_train, test_size=0.3)

In [167]:
# fitting
clf.fit(X_train, y_train, early_stopping_rounds=20, 
        eval_metric="auc", eval_set=[(X_eval, y_eval)])

Will train until validation_0 error hasn't decreased in 20 rounds.
[0]	validation_0-auc:0.767471
[1]	validation_0-auc:0.783326
[2]	validation_0-auc:0.805103
[3]	validation_0-auc:0.789699
[4]	validation_0-auc:0.792127
[5]	validation_0-auc:0.795795
[6]	validation_0-auc:0.802650
[7]	validation_0-auc:0.807788
[8]	validation_0-auc:0.800707
[9]	validation_0-auc:0.798828
[10]	validation_0-auc:0.801777
[11]	validation_0-auc:0.799618
[12]	validation_0-auc:0.802402
[13]	validation_0-auc:0.800230
[14]	validation_0-auc:0.803008
[15]	validation_0-auc:0.804977
[16]	validation_0-auc:0.803877
[17]	validation_0-auc:0.806636
[18]	validation_0-auc:0.807967
[19]	validation_0-auc:0.805143
[20]	validation_0-auc:0.806271
[21]	validation_0-auc:0.807524
[22]	validation_0-auc:0.808583
[23]	validation_0-auc:0.809914
[24]	validation_0-auc:0.811353
[25]	validation_0-auc:0.812425
[26]	validation_0-auc:0.813593
[27]	validation_0-auc:0.815352
[28]	validation_0-auc:0.816571
[29]	validation_0-auc:0.818591
[30]	validati

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.85,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=350, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.95)

In [172]:
print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

('Overall AUC:', 0.88249734213757924)


In [173]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

In [174]:
submission = pd.DataFrame({"ID":id_test, "TARGET":y_pred})
submission.to_csv("submission89.csv", index=False)

In [171]:
'''
ADDITIONAL FUNCTIONS FOR FEATURE ENGINEERING
==============================================================
import pickle
to_save = {
    "sum_of_0":df_train['sum_of_0']
}
pickle.dump(to_save, open('my_features.pickle','a'))
===============================================================
#NAME: ADD SUM OF 0'S
# select on the features, leaving out the 'TARGET' feature


features = [f for f in df_train.columns if f != 'TARGET' and f!='ID' ]

def calculate_zeros(cols):
    sum_array = [] 
    for value in cols:
        if value == 0:
            sum_array.append(value)
    return len(sum_array)

df_train['sum_of_0'] = df_train[features].apply(calculate_zeros, axis=1)

===============================================================

# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1, len(c)):
        if np.array_equal(v, df_train[c[j]].values):
            remove.append(c[j])

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

================================================================

# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)
================================================================

def fix_nationality(nat):
    for value in nat:
        if value == -999999:
            return 2
        else:
            return value

df_train['var3'] = df_train[['var3']].apply(fix_nationality, axis=1)


'''

