In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, Imputer, OneHotEncoder, LabelEncoder
from sklearn.cross_validation import train_test_split
import numpy as np
import seaborn as sns
import operator
import math
from data_science import *
import xgboost as xgb
from hyperopt import fmin, tpe, hp, fmin, STATUS_OK, Trials
from hyperopt.mongoexp import MongoTrials

%matplotlib inline

# Preprocessing

In [2]:
# Train and test data are loaded into a single dataframe so that both go through
# the same preprocessing (import_data comes from the utils/data_science.py module).
df_all = import_data('data/train.csv', 'data/test.csv', 'VARIABLE_CIBLE', na_values='(MISSING)')

# Two subsets, selected on SOURCE_CITED_AGE; a separate model is trained on each.
# .copy() turns each selection into an independent frame: column assignments made
# on a boolean-indexed selection are what raise pandas' SettingWithCopyWarning,
# so the explicit copy is intended to silence it without changing the data.
df_all0 = df_all[df_all.SOURCE_CITED_AGE == 'IMPUT'].copy()
df_all1 = df_all[df_all.SOURCE_CITED_AGE == 'CALC'].copy()

date_columns = ['PRIORITY_MONTH', 'FILING_MONTH', 'PUBLICATION_MONTH', 'BEGIN_MONTH']

# Preprocessing with the Preprocessings class of the utils/data_science.py module.
# A fresh instance is built for each subset.
preproc = Preprocessings(date_columns=date_columns)

df0 = preproc.datetime_processings(df_all0, format='%m/%Y')
df0 = preproc.cat_to_codes(df0)
# NOTE(review): the median is taken over every row of the subset, i.e. before
# re_split separates train from test rows -- confirm this is intended.
df0 = df0.fillna(df0.median())
# Drop the columns that are still entirely NaN after the median fill
df0 = df0.dropna(axis=1, how='all')


preproc = Preprocessings(date_columns=date_columns)

df1 = preproc.datetime_processings(df_all1, format='%m/%Y')
df1 = preproc.cat_to_codes(df1)
df1 = df1.fillna(df1.median())
df1 = df1.dropna(axis=1, how='all')

# Splitting the train and test data again (re_split is in utils/data_science.py)
df0_train, df0_eval = re_split(df0, 'VARIABLE_CIBLE')
df1_train, df1_eval = re_split(df1, 'VARIABLE_CIBLE')

Index([u'COUNTRY', u'FIRST_CLASSE', u'FISRT_APP_COUNTRY', u'FISRT_APP_TYPE',
       u'FISRT_INV_COUNTRY', u'FISRT_INV_TYPE', u'LANGUAGE_OF_FILLING',
       u'MAIN_IPC', u'SOURCE_BEGIN_MONTH', u'SOURCE_CITED_AGE',
       u'SOURCE_IDX_ORI', u'SOURCE_IDX_RAD', u'TECHNOLOGIE_FIELD',
       u'TECHNOLOGIE_SECTOR', u'VARIABLE_CIBLE', u'VOIE_DEPOT'],
      dtype='object')
Index([u'COUNTRY', u'FIRST_CLASSE', u'FISRT_APP_COUNTRY', u'FISRT_APP_TYPE',
       u'FISRT_INV_COUNTRY', u'FISRT_INV_TYPE', u'LANGUAGE_OF_FILLING',
       u'MAIN_IPC', u'SOURCE_BEGIN_MONTH', u'SOURCE_CITED_AGE',
       u'SOURCE_IDX_ORI', u'SOURCE_IDX_RAD', u'TECHNOLOGIE_FIELD',
       u'TECHNOLOGIE_SECTOR', u'VARIABLE_CIBLE', u'VOIE_DEPOT'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


# Training the model

## on df0

In [3]:
# Columns selected for df0_train after RFECV (the target is kept for the split below)
columns0 = ['APP_NB', 'APP_NB_PAYS', 'APP_NB_TYPE', 'COUNTRY', 'FIRST_CLASSE', 'FISRT_APP_COUNTRY',
            'FISRT_APP_TYPE', 'FISRT_INV_COUNTRY', 'FISRT_INV_TYPE', 'INV_NB', 'INV_NB_PAYS',
            'INV_NB_TYPE', 'LANGUAGE_OF_FILLING', 'MAIN_IPC', 'NB_BACKWARD', 'NB_BACKWARD_AUTRE',
            'NB_BACKWARD_NPL', 'NB_BACKWARD_PL', 'NB_BACKWARD_XY', 'NB_CLASSES', 'NB_FIELDS',
            'NB_ROOT_CLASSES', 'NB_SECTORS', 'TECHNOLOGIE_FIELD', 'TECHNOLOGIE_SECTOR', 'VOIE_DEPOT',
            'oecd_NB_BACKWARD_NPL', 'oecd_NB_BACKWARD_PL', 'oecd_NB_ROOT_CLASSES', 'pct_NB_IPC', 'pct_NB_IPC_LY',
            'filing-begin', 'pub-filing', 'pub_year', 'VARIABLE_CIBLE']

# Columns of df0_train that the selection leaves out
removed = [col for col in df0_train.columns if col not in columns0]
print(removed)

# Hold out 20% of the labelled rows to monitor the booster during training
df_train0, df_test0 = train_test_split(df0_train[columns0], test_size=0.2, random_state=42)

# From here on columns0 lists the features only
columns0.remove('VARIABLE_CIBLE')


test_size = len(df_test0)
train_size = len(df_train0)
# Uniform per-row weight, scaled so the total training weight equals test_size
weight = np.ones(train_size) * float(test_size) / float(train_size)

print(df_train0.columns)

# Total weight carried by each class of 'VARIABLE_CIBLE'; their ratio is used as
# scale_pos_weight when training. A boolean mask replaces the per-row Python loop.
labels0 = df_train0['VARIABLE_CIBLE'].values
sum_wpos = float(weight[labels0 == 1.0].sum())
sum_wneg = float(weight[labels0 == 0.0].sum())

# Construct the xgboost.DMatrix objects from numpy arrays. No `missing` value is
# passed, so xgboost's default missing-value handling applies.
dtrain0 = xgb.DMatrix(df_train0[columns0].values, label=labels0, weight=weight)
dtest0 = xgb.DMatrix(df_test0[columns0].values, label=df_test0['VARIABLE_CIBLE'].values)

['IDX_ORIGIN', 'IDX_RADIC', 'NB_BACKWARD_I', 'SOURCE_BEGIN_MONTH', 'SOURCE_CITED_AGE', 'SOURCE_IDX_ORI', 'SOURCE_IDX_RAD', 'cited_age_max', 'cited_age_mean', 'cited_age_median', 'cited_age_min', 'index_origin']
Index([u'APP_NB', u'APP_NB_PAYS', u'APP_NB_TYPE', u'COUNTRY', u'FIRST_CLASSE',
       u'FISRT_APP_COUNTRY', u'FISRT_APP_TYPE', u'FISRT_INV_COUNTRY',
       u'FISRT_INV_TYPE', u'INV_NB', u'INV_NB_PAYS', u'INV_NB_TYPE',
       u'LANGUAGE_OF_FILLING', u'MAIN_IPC', u'NB_BACKWARD',
       u'NB_BACKWARD_AUTRE', u'NB_BACKWARD_NPL', u'NB_BACKWARD_PL',
       u'NB_BACKWARD_XY', u'NB_CLASSES', u'NB_FIELDS', u'NB_ROOT_CLASSES',
       u'NB_SECTORS', u'TECHNOLOGIE_FIELD', u'TECHNOLOGIE_SECTOR',
       u'VOIE_DEPOT', u'oecd_NB_BACKWARD_NPL', u'oecd_NB_BACKWARD_PL',
       u'oecd_NB_ROOT_CLASSES', u'pct_NB_IPC', u'pct_NB_IPC_LY',
       u'filing-begin', u'pub-filing', u'pub_year', u'VARIABLE_CIBLE'],
      dtype='object')


In [4]:
# Parameters obtained with hyperopt.
# max_depth is written as an int: XGBoost declares it as an integer parameter and
# a float such as 12.0 may be rejected depending on the XGBoost release.
# scale_pos_weight reads sum_wneg / sum_wpos from the df0 data-preparation cell;
# the df1 cell reassigns both names, so run this cell right after the df0 one.
param = {'colsample_bytree': 0.55, 'min_child_weight': 2.0,
         'subsample': 0.9, 'eta': 0.025, 'max_depth': 12, 'gamma': 0.75, 'objective': 'binary:logistic',
         'nthread': 6, 'scale_pos_weight': sum_wneg/sum_wpos, 'eval_metric': 'auc', 'silent': 0}


plst = list(param.items())

# AUC is reported on both sets at every boosting round
evallist = [(dtrain0, 'train'), (dtest0, 'eval')]


num_round = 590
print('loading data end, start to boost trees')
# early_stopping_rounds is not passed, so all num_round rounds are run
bst0 = xgb.train(plst, dtrain0, num_round, evallist)
# Save the trained model
bst0.save_model('df0_xgboost_optimized.model')

[0]	train-auc:0.664510	eval-auc:0.616507
[1]	train-auc:0.718911	eval-auc:0.664660
[2]	train-auc:0.726324	eval-auc:0.671996
[3]	train-auc:0.731837	eval-auc:0.674549
[4]	train-auc:0.738772	eval-auc:0.679690
[5]	train-auc:0.739712	eval-auc:0.680703
[6]	train-auc:0.742091	eval-auc:0.683541
[7]	train-auc:0.745953	eval-auc:0.687262
[8]	train-auc:0.748822	eval-auc:0.689131
[9]	train-auc:0.749418	eval-auc:0.688463
[10]	train-auc:0.749909	eval-auc:0.688969
[11]	train-auc:0.752329	eval-auc:0.690325
[12]	train-auc:0.754652	eval-auc:0.691837
[13]	train-auc:0.755249	eval-auc:0.692265
[14]	train-auc:0.756030	eval-auc:0.692718
[15]	train-auc:0.757039	eval-auc:0.692957
[16]	train-auc:0.756477	eval-auc:0.692163
[17]	train-auc:0.757786	eval-auc:0.692609
[18]	train-auc:0.758530	eval-auc:0.692875
[19]	train-auc:0.759006	eval-auc:0.693461
[20]	train-auc:0.759093	eval-auc:0.693412
[21]	train-auc:0.760736	eval-auc:0.694223
[22]	train-auc:0.761443	eval-auc:0.694947
[23]	train-auc:0.761891	eval-auc:0.695098
[2

loading data end, start to boost trees


## on df1

In [5]:
# Columns selected for df1_train by RFECV (the target is kept for the split below)
columns1 = ['APP_NB', 'APP_NB_PAYS', 'APP_NB_TYPE', 'COUNTRY', 'FIRST_CLASSE', 'FISRT_APP_COUNTRY',
            'FISRT_APP_TYPE', 'FISRT_INV_COUNTRY', 'FISRT_INV_TYPE', 'IDX_ORIGIN', 'IDX_RADIC', 'INV_NB',
            'INV_NB_PAYS', 'INV_NB_TYPE', 'LANGUAGE_OF_FILLING', 'MAIN_IPC', 'NB_BACKWARD', 'NB_BACKWARD_AUTRE',
            'NB_BACKWARD_NPL', 'NB_BACKWARD_PL', 'NB_BACKWARD_XY', 'NB_CLASSES', 'NB_FIELDS', 'NB_ROOT_CLASSES',
            'NB_SECTORS', 'SOURCE_BEGIN_MONTH', 'SOURCE_IDX_RAD', 'TECHNOLOGIE_FIELD', 'TECHNOLOGIE_SECTOR',
            'cited_age_max', 'cited_age_mean', 'cited_age_median', 'cited_age_min', 'cited_age_std', 'cited_n',
            'oecd_NB_BACKWARD_NPL', 'oecd_NB_BACKWARD_PL', 'oecd_NB_ROOT_CLASSES', 'pct_NB_IPC', 'pct_NB_IPC_LY',
            'filing-begin', 'pub-filing', 'pub_year', 'VARIABLE_CIBLE']

# Hold out 20% of the labelled rows to monitor the booster during training
df_train1, df_test1 = train_test_split(df1_train[columns1], test_size=0.2, random_state=42)

# From here on columns1 lists the features only
columns1.remove('VARIABLE_CIBLE')

test_size = len(df_test1)
train_size = len(df_train1)
# Uniform per-row weight, scaled so the total training weight equals test_size
weight = np.ones(train_size) * float(test_size) / float(train_size)

# Total weight carried by each class of 'VARIABLE_CIBLE'; their ratio is used as
# scale_pos_weight when training. A boolean mask replaces the per-row Python loop.
labels1 = df_train1['VARIABLE_CIBLE'].values
sum_wpos = float(weight[labels1 == 1.0].sum())
sum_wneg = float(weight[labels1 == 0.0].sum())

# Construct the xgboost.DMatrix objects from numpy arrays. No `missing` value is
# passed, so xgboost's default missing-value handling applies.
dtrain1 = xgb.DMatrix(df_train1[columns1].values, label=labels1, weight=weight)
dtest1 = xgb.DMatrix(df_test1[columns1].values, label=df_test1['VARIABLE_CIBLE'].values)

In [6]:
# Parameters obtained with hyperopt.
# max_depth is written as an int: XGBoost declares it as an integer parameter and
# a float such as 9.0 may be rejected depending on the XGBoost release.
# scale_pos_weight reads sum_wneg / sum_wpos from the df1 data-preparation cell;
# the df0 cell assigns the same names, so run this cell right after the df1 one.
param = {'colsample_bytree': 0.5, 'min_child_weight': 4.0, 'subsample': 0.8,
         'eta': 0.025, 'max_depth': 9, 'gamma': 0.7000000000000001,
         'objective': 'binary:logistic',
         'nthread': 6, 'scale_pos_weight': sum_wneg/sum_wpos, 'eval_metric': 'auc', 'silent': 0}


plst = list(param.items())

# AUC is reported on both sets at every boosting round
evallist = [(dtrain1, 'train'), (dtest1, 'eval')]


num_round = 664
print('loading data end, start to boost trees')
# early_stopping_rounds is not passed, so all num_round rounds are run
bst1 = xgb.train(plst, dtrain1, num_round, evallist)
# Save the trained model
bst1.save_model('df1_xgboost_optimized.model')

[0]	train-auc:0.682331	eval-auc:0.650454
[1]	train-auc:0.702271	eval-auc:0.665190
[2]	train-auc:0.708856	eval-auc:0.672176
[3]	train-auc:0.710931	eval-auc:0.673586
[4]	train-auc:0.714125	eval-auc:0.676053
[5]	train-auc:0.715907	eval-auc:0.676413
[6]	train-auc:0.719519	eval-auc:0.677203
[7]	train-auc:0.722659	eval-auc:0.678032
[8]	train-auc:0.724661	eval-auc:0.679861
[9]	train-auc:0.725724	eval-auc:0.680706
[10]	train-auc:0.726044	eval-auc:0.680456
[11]	train-auc:0.728047	eval-auc:0.681770
[12]	train-auc:0.730716	eval-auc:0.683725
[13]	train-auc:0.732351	eval-auc:0.684556
[14]	train-auc:0.733652	eval-auc:0.685823
[15]	train-auc:0.733548	eval-auc:0.685572
[16]	train-auc:0.734400	eval-auc:0.686349
[17]	train-auc:0.734781	eval-auc:0.686108
[18]	train-auc:0.735684	eval-auc:0.686366
[19]	train-auc:0.736426	eval-auc:0.686411
[20]	train-auc:0.737105	eval-auc:0.686971
[21]	train-auc:0.737413	eval-auc:0.686790
[22]	train-auc:0.738875	eval-auc:0.687582
[23]	train-auc:0.739745	eval-auc:0.687882
[2

loading data end, start to boost trees


# Combining models

In [7]:
# Score each evaluation subset with the booster trained on the matching subset.
deval0 = xgb.DMatrix(df0_eval[columns0].values)
deval1 = xgb.DMatrix(df1_eval[columns1].values)
y0 = bst0.predict(deval0)
y1 = bst1.predict(deval1)

# Attach the raw prediction and its complement to each evaluation frame.
# NOTE(review): with the 'binary:logistic' objective, predict() is documented to
# return the probability of the positive label, so 'proba_c0' would hold
# P(label == 1) -- confirm the column naming against the label encoding.
for eval_frame, scores in ((df0_eval, y0), (df1_eval, y1)):
    eval_frame['proba_c0'] = scores
    eval_frame['proba_c1'] = 1 - scores

cols = ['proba_c0', 'proba_c1', 'index_origin']

# Stack both subsets into one result frame with a fresh 0..n-1 index
df_eval_res = pd.concat([df0_eval[cols], df1_eval[cols]], axis=0, ignore_index=True)

print(y0)
print(y1)

df_eval_res.head()

[ 0.58521754  0.32299241  0.74920309 ...,  0.5295555   0.55801564
  0.5213533 ]
[ 0.56575465  0.46858999  0.39499402 ...,  0.64521116  0.40201771
  0.56934792]


Unnamed: 0,proba_c0,proba_c1,index_origin
0,0.585218,0.414782,1
1,0.322992,0.677008,2
2,0.749203,0.250797,3
3,0.416809,0.583192,4
4,0.147908,0.852092,7


In [8]:
# Put the rows back in the order given by 'index_origin'.
# sort_values(by=...) replaces the deprecated DataFrame.sort_index(by=...),
# which emits a warning and is absent from later pandas releases.
df_eval_res = df_eval_res.sort_values(by='index_origin', ascending=True)

df_eval_res.head()

  if __name__ == '__main__':


Unnamed: 0,proba_c0,proba_c1,index_origin
84092,0.565755,0.434245,0
0,0.585218,0.414782,1
1,0.322992,0.677008,2
2,0.749203,0.250797,3
3,0.416809,0.583192,4


In [9]:
# Write the 'proba_c1' column to a text file, one value per line, in the
# current row order of df_eval_res.
predictions = df_eval_res['proba_c1'].values
np.savetxt('y_pred_xgboost_optimized.txt', predictions, fmt='%s')