# Trabalho Final v2.0
# Yuri Lopes e Maurício Borges

In [35]:
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import pandas as pd
import matplotlib.pyplot as plt

import copy


### Funções de pré-processamento

In [69]:
def process_base(train, test):
    train.loc[(train['var38']>117310.979) & (train['var38']<117310.98), 'var38'] = -999.0
    test.loc[(test['var38']>117310.979) & (test['var38']<117310.98), 'var38'] = -999.0

    train.loc[train['var3']==-999999, 'var3'] = -999.0
    test.loc[test['var3']==-999999, 'var3'] = -999.0

    for f in ['imp_op_var40_comer_ult1', 'imp_op_var40_efect_ult3', 'imp_op_var41_comer_ult3', 'imp_sal_var16_ult1']:
        train.loc[train[f]==0.0, f] = -999.0
        test.loc[test[f]==0.0, f] = -999.0

    return train, test

def drop_duplicated(train, test):
    # drop var6 variable (it is similar to var29)
    flist = [x for x in train.columns if not x in ['TARGET']]            
    train.drop([x for x in flist if 'var6' in x], axis=1, inplace=True)
    test.drop([x for x in flist if 'var6' in x], axis=1, inplace=True)

    # remove repeated columns with _0 in the name
    flist = [x for x in train.columns if not x in ['TARGET']]        
    flist_remove = []
    for i in range(len(flist)-1):
        v = train[flist[i]].values
        for j in range(i+1, len(flist)):
            if np.array_equal(v, train[flist[j]].values):
                if '_0' in flist[j]:
                    flist_remove.append(flist[j])
                elif  '_0' in flist[i]:
                    flist_remove.append(flist[i])
    train.drop(flist_remove, axis=1, inplace=True)
    test.drop(flist_remove, axis=1, inplace=True)

    flist_remove = ['saldo_medio_var13_medio_ult1', 'delta_imp_reemb_var13_1y3', 'delta_imp_reemb_var17_1y3', 'delta_imp_reemb_var33_1y3', 'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var17_out_1y3', 'delta_imp_trasp_var33_in_1y3', 'delta_imp_trasp_var33_out_1y3', "ind_var2_0","ind_var2","ind_var27_0","ind_var28_0","ind_var28","ind_var27", "ind_var41","ind_var46_0","ind_var46","num_var27_0","num_var28_0","num_var28","num_var27","num_var41","num_var46_0", "num_var46","saldo_var28","saldo_var27","saldo_var41","saldo_var46","imp_amort_var18_hace3","imp_amort_var34_hace3", "imp_reemb_var13_hace3","imp_reemb_var33_hace3","imp_trasp_var17_out_hace3","imp_trasp_var33_out_hace3", "num_var2_0_ult1","num_var2_ult1","num_reemb_var13_hace3","num_reemb_var33_hace3","num_trasp_var17_out_hace3", "num_trasp_var33_out_hace3","saldo_var2_ult1","saldo_medio_var13_medio_hace3","ind_var6_0","ind_var6", "ind_var13_medio_0","ind_var18_0","ind_var26_0","ind_var25_0","ind_var32_0","ind_var34_0","ind_var37_0", "ind_var40","num_var6_0","num_var6","num_var13_medio_0","num_var18_0","num_var26_0","num_var25_0","num_var32_0", "num_var34_0","num_var37_0","num_var40","saldo_var6","saldo_var13_medio","delta_imp_reemb_var13_1y3", "delta_imp_reemb_var17_1y3","delta_imp_reemb_var33_1y3","delta_imp_trasp_var17_in_1y3","delta_imp_trasp_var17_out_1y3", "delta_imp_trasp_var33_in_1y3","delta_imp_trasp_var33_out_1y3"]
    train.drop(flist_remove, axis=1, inplace=True)
    test.drop(flist_remove, axis=1, inplace=True)

    return train, test

## Pré-processamento

In [70]:
data_train = pd.read_csv("new_train.csv")
data_test = pd.read_csv("test.csv")

#TODO concatenar dataset de treino e teste para fazer essas verificações para remoção

#verificação de colunas constantes
list_remove = []
for col in data_train.columns:
    if data_train[col].std() == 0: #se o desvio padrão da coluna for zero, ela é constante
        list_remove.append(col)
        
#removo as colunas constantes dos datasets
data_train.drop(list_remove, axis=1, inplace=True)
data_test.drop(list_remove, axis=1, inplace=True)

#verificação de colunas duplicadas
list_remove = []
columns = data_train.columns
for i in range(len(columns) - 1):
    values = data_train[columns[i]].values
    for j in range(i+1, len(columns)):
        if np.array_equal(values, data_train[columns[j]].values):
            list_remove.append(columns[j])

data_train.drop(list_remove, axis=1, inplace=True)
data_test.drop(list_remove, axis=1, inplace=True)

data_train, data_test = process_base(data_train, data_test)
data_train, data_test = process_base(data_train, data_test)

data_test_original = copy.deepcopy(data_test)

In [71]:
y_train = data_train['TARGET'].values
x_train = data_train.drop(['TARGET'], axis=1).values
x_test = data_test.values

zero_train = (x_train == 0).astype(int).sum(axis=1)
zero_train = pd.DataFrame(zero_train, columns=["ZERO"])
x_train = pd.DataFrame(x_train)
x_train = pd.concat([x_train, zero_train], axis=1)

zero_test = (x_test == 0).astype(int).sum(axis=1)
zero_test = pd.DataFrame(zero_test, columns=["ZERO"])
x_test = pd.DataFrame(x_test)
x_test = pd.concat([x_test, zero_test], axis=1)

d_train = xgb.DMatrix(x_train, label=y_train)
watchlist  = [(d_train,'train')]

In [72]:
r_seed = 3002

In [73]:
#X_train, X_eval, Y_train, y_eval = train_test_split(x_train, y_train, test_size=0.25)#, random_state=r_seed)
#d_train = xgb.DMatrix(X_train, label=Y_train)
#d_test = xgb.DMatrix(X_eval, label=y_eval)
#watchlist  = [(d_test,'eval'), (d_train,'train')]

## Treinamento e Classificação

In [76]:
params = {}
params['objective'] = 'binary:logistic'
params['booster'] = 'gbtree'
params['eval_metric'] = 'auc'
params['eta'] = 0.0102048
params['max_depth'] = 5
params['subsample'] = 0.6815
params['colsample_bytree'] = 0.701
params['silent'] = 1
params['verbose_eval'] = False
params['min_child_weight'] = 3
params['nthread'] = 8
#params['seed'] = r_seed

clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds = 100)

d_test = xgb.DMatrix(x_test)
y_pred = clf.predict(d_test)

[0]	train-auc:0.809769
Will train until train-auc hasn't improved in 100 rounds.
[1]	train-auc:0.805405
[2]	train-auc:0.799823
[3]	train-auc:0.80164
[4]	train-auc:0.807208
[5]	train-auc:0.813538
[6]	train-auc:0.816372
[7]	train-auc:0.813892
[8]	train-auc:0.815809
[9]	train-auc:0.8184
[10]	train-auc:0.821506
[11]	train-auc:0.8256
[12]	train-auc:0.826818
[13]	train-auc:0.828146
[14]	train-auc:0.830115
[15]	train-auc:0.832521
[16]	train-auc:0.833737
[17]	train-auc:0.832025
[18]	train-auc:0.831524
[19]	train-auc:0.831648
[20]	train-auc:0.830935
[21]	train-auc:0.829502
[22]	train-auc:0.831469
[23]	train-auc:0.832343
[24]	train-auc:0.83034
[25]	train-auc:0.831748
[26]	train-auc:0.832667
[27]	train-auc:0.832585
[28]	train-auc:0.832041
[29]	train-auc:0.833168
[30]	train-auc:0.833841
[31]	train-auc:0.833165
[32]	train-auc:0.83391
[33]	train-auc:0.834433
[34]	train-auc:0.835604
[35]	train-auc:0.834428
[36]	train-auc:0.833883
[37]	train-auc:0.834665
[38]	train-auc:0.835545
[39]	train-auc:0.835072

[332]	train-auc:0.859696
[333]	train-auc:0.859747
[334]	train-auc:0.859803
[335]	train-auc:0.859834
[336]	train-auc:0.859938
[337]	train-auc:0.860003
[338]	train-auc:0.86007
[339]	train-auc:0.860149
[340]	train-auc:0.860203
[341]	train-auc:0.860291
[342]	train-auc:0.860406
[343]	train-auc:0.860439
[344]	train-auc:0.860543
[345]	train-auc:0.860603
[346]	train-auc:0.860648
[347]	train-auc:0.860685
[348]	train-auc:0.860752
[349]	train-auc:0.860817
[350]	train-auc:0.860887
[351]	train-auc:0.860968
[352]	train-auc:0.861031
[353]	train-auc:0.861085
[354]	train-auc:0.861168
[355]	train-auc:0.861255
[356]	train-auc:0.861321
[357]	train-auc:0.861367
[358]	train-auc:0.861416
[359]	train-auc:0.861528
[360]	train-auc:0.861619
[361]	train-auc:0.861664
[362]	train-auc:0.86171
[363]	train-auc:0.861826
[364]	train-auc:0.861894
[365]	train-auc:0.861969
[366]	train-auc:0.862052
[367]	train-auc:0.862121
[368]	train-auc:0.862148
[369]	train-auc:0.862199
[370]	train-auc:0.862262
[371]	train-auc:0.862336
[3

[662]	train-auc:0.873932
[663]	train-auc:0.873947
[664]	train-auc:0.873963
[665]	train-auc:0.874025
[666]	train-auc:0.874057
[667]	train-auc:0.874074
[668]	train-auc:0.874121
[669]	train-auc:0.874135
[670]	train-auc:0.874155
[671]	train-auc:0.874186
[672]	train-auc:0.874247
[673]	train-auc:0.874305
[674]	train-auc:0.874319
[675]	train-auc:0.874363
[676]	train-auc:0.874386
[677]	train-auc:0.874423
[678]	train-auc:0.874447
[679]	train-auc:0.87447
[680]	train-auc:0.874505
[681]	train-auc:0.874525
[682]	train-auc:0.874567
[683]	train-auc:0.874597
[684]	train-auc:0.87461
[685]	train-auc:0.874636
[686]	train-auc:0.874644
[687]	train-auc:0.87467
[688]	train-auc:0.874721
[689]	train-auc:0.874739
[690]	train-auc:0.874787
[691]	train-auc:0.874815
[692]	train-auc:0.874837
[693]	train-auc:0.874856
[694]	train-auc:0.8749
[695]	train-auc:0.87492
[696]	train-auc:0.874926
[697]	train-auc:0.874993
[698]	train-auc:0.875011
[699]	train-auc:0.875022
[700]	train-auc:0.875041
[701]	train-auc:0.87513
[702]	t

[992]	train-auc:0.883275
[993]	train-auc:0.8833
[994]	train-auc:0.883337
[995]	train-auc:0.883377
[996]	train-auc:0.883384
[997]	train-auc:0.883399
[998]	train-auc:0.883434
[999]	train-auc:0.883448


## Pós-processamento

In [77]:
data_test_original['prob'] = pd.Series(y_pred, index=data_test_original.index)

data_test_original.loc[data_test_original.var15 < 23, 'prob'] = 0
data_test_original.loc[data_test_original.saldo_medio_var5_hace2 > 160000, 'prob'] = 0
data_test_original.loc[data_test_original.saldo_var33 > 0, 'prob'] = 0
data_test_original.loc[data_test_original.var38 > 3988596, 'prob'] = 0
data_test_original.loc[data_test_original.var21 > 7500, 'prob'] = 0
data_test_original.loc[data_test_original.num_var30 > 9, 'prob'] = 0
data_test_original.loc[data_test_original.num_var13_0 > 6, 'prob'] = 0
data_test_original.loc[data_test_original.num_var33_0 > 9, 'prob'] = 0
data_test_original.loc[data_test_original.imp_ent_var16_ult1 > 51003, 'prob'] = 0
data_test_original.loc[data_test_original.imp_op_var39_comer_ult3 > 13184, 'prob'] = 0
data_test_original.loc[data_test_original.saldo_medio_var5_ult3 > 108251, 'prob'] = 0

#nv
data_test_original.loc[data_test_original.num_var33 + data_test_original.saldo_medio_var33_ult3 + data_test_original.saldo_medio_var44_hace2 + data_test_original.saldo_medio_var44_hace3 + data_test_original.saldo_medio_var33_ult1 + data_test_original.saldo_medio_var44_ult1 > 0, 'prob'] = 0

y_predicted = data_test_original['prob'].values
np.savetxt('sub_xgb_pos.txt', y_predicted, delimiter=',')