In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler, Imputer, OneHotEncoder, LabelEncoder
from sklearn.cross_validation import train_test_split
import numpy as np
import seaborn as sns
import operator
import math
from data_science import *
import xgboost as xgb
from hyperopt import fmin, tpe, hp, fmin, STATUS_OK, Trials
from hyperopt.mongoexp import MongoTrials

%matplotlib inline

In [2]:
# Train and test data are loaded into a single dataframe so that both go
# through the same preprocessing (import_data lives in utils/data_science.py).
df_all = import_data('data/train.csv', 'data/test.csv', 'VARIABLE_CIBLE', na_values='(MISSING)')

# Two subsets, selected on the value of SOURCE_CITED_AGE.
# .copy() makes each subset an independent frame: the preprocessing below
# assigns columns on the frame it receives, which raises pandas'
# SettingWithCopyWarning when that frame is a slice of df_all.
df_all0 = df_all[df_all.SOURCE_CITED_AGE == 'IMPUT'].copy()
df_all1 = df_all[df_all.SOURCE_CITED_AGE == 'CALC'].copy()

date_columns = ['PRIORITY_MONTH', 'FILING_MONTH', 'PUBLICATION_MONTH', 'BEGIN_MONTH']

# --- subset 0 ('IMPUT') ---
# Preprocessings is the helper class from utils/data_science.py; a fresh
# instance is created for each subset.
preproc = Preprocessings(date_columns=date_columns)

df0 = preproc.datetime_processings(df_all0, format='%m/%Y')
df0 = preproc.cat_to_codes(df0)
# Fill remaining gaps with the per-column median, then drop the columns that
# are still entirely NaN (their median is NaN, so fillna leaves them empty).
df0 = df0.fillna(df0.median())
df0 = df0.dropna(axis=1, how='all')


# --- subset 1 ('CALC') --- same pipeline with its own Preprocessings instance
preproc = Preprocessings(date_columns=date_columns)

df1 = preproc.datetime_processings(df_all1, format='%m/%Y')
df1 = preproc.cat_to_codes(df1)
df1 = df1.fillna(df1.median())
df1 = df1.dropna(axis=1, how='all')

# Separate the train rows from the test rows again (re_split is in utils/data_science.py)
df0_train, df0_eval = re_split(df0, 'VARIABLE_CIBLE')
df1_train, df1_eval = re_split(df1, 'VARIABLE_CIBLE')

Index([u'COUNTRY', u'FIRST_CLASSE', u'FISRT_APP_COUNTRY', u'FISRT_APP_TYPE',
       u'FISRT_INV_COUNTRY', u'FISRT_INV_TYPE', u'LANGUAGE_OF_FILLING',
       u'MAIN_IPC', u'SOURCE_BEGIN_MONTH', u'SOURCE_CITED_AGE',
       u'SOURCE_IDX_ORI', u'SOURCE_IDX_RAD', u'TECHNOLOGIE_FIELD',
       u'TECHNOLOGIE_SECTOR', u'VARIABLE_CIBLE', u'VOIE_DEPOT'],
      dtype='object')
Index([u'COUNTRY', u'FIRST_CLASSE', u'FISRT_APP_COUNTRY', u'FISRT_APP_TYPE',
       u'FISRT_INV_COUNTRY', u'FISRT_INV_TYPE', u'LANGUAGE_OF_FILLING',
       u'MAIN_IPC', u'SOURCE_BEGIN_MONTH', u'SOURCE_CITED_AGE',
       u'SOURCE_IDX_ORI', u'SOURCE_IDX_RAD', u'TECHNOLOGIE_FIELD',
       u'TECHNOLOGIE_SECTOR', u'VARIABLE_CIBLE', u'VOIE_DEPOT'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


# hyperopt

In [10]:


# Smoke test of the MongoDB-backed trials store: minimise sin(x) for x in [-2, 2]
# over 10 evaluations and show the best x found.
trials = MongoTrials('mongo://localhost:1234/xgboost/jobs', exp_key='exp1')
best = fmin(
    fn=math.sin,
    space=hp.uniform('x', -2, 2),
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

print(best)



{u'x': -1.3109299800151657}


In [4]:
def score(params):
    """
    Objective function for hyperopt: the negated validation AUC of an xgboost
    model trained with ``params``.

    Reads the module-level ``X_train``, ``y_train``, ``weight``, ``X_test`` and
    ``y_test`` set up by the calling cell.

    Parameters
    ----------
    params : dict
        One point sampled from the search space. Must contain
        ``'n_estimators'``, which is used as the number of boosting rounds and
        is not forwarded to xgboost. The dict itself is left unmodified.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``; the AUC is negated because
        hyperopt minimises the loss.
    """
    print("Training with params : ")
    print(params)
    # Work on a copy so the dict handed over by hyperopt is not mutated.
    booster_params = dict(params)
    num_round = int(booster_params.pop('n_estimators'))
    # hp.quniform samples floats (e.g. 10.0) while xgboost expects an integer
    # tree depth, so cast it explicitly.
    if 'max_depth' in booster_params:
        booster_params['max_depth'] = int(booster_params['max_depth'])
    dtrain = xgb.DMatrix(X_train, label=y_train, weight=weight)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    model = xgb.train(booster_params, dtrain, num_round)
    predictions = model.predict(dvalid)
    loss = -roc_auc_score(y_test, predictions)
    print("\tScore {0}\n\n".format(loss))
    return {'loss': loss, 'status': STATUS_OK}


def optimize(trials, max_evals=250):
    """
    Run a TPE search over the xgboost hyper-parameter space, minimising
    ``score``.

    Reads the module-level ``sum_wneg`` and ``sum_wpos`` to set
    ``scale_pos_weight``.

    Parameters
    ----------
    trials : hyperopt Trials-like object
        Passed to ``fmin``; stores the history of the search.
    max_evals : int, optional
        Number of points to evaluate (default 250).

    Returns
    -------
    dict
        The best point found, as returned by ``fmin``. It is also printed.
    """
    space = {
        # number of boosting rounds; score() takes it out before training
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth': hp.quniform('max_depth', 1, 13, 1),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        # fixed (not searched) settings
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # float() keeps this a true division even if both sums are integers
        'scale_pos_weight': float(sum_wneg) / sum_wpos,
        'nthread': 6,
        'silent': 0,
    }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

    print(best)
    return best


## Optimization on df0

In [5]:
# Columns kept for df0_train after RFECV (the target 'VARIABLE_CIBLE' is listed last)
columns0 = ['APP_NB', 'APP_NB_PAYS', 'APP_NB_TYPE', 'COUNTRY', 'FIRST_CLASSE', 'FISRT_APP_COUNTRY',
           'FISRT_APP_TYPE', 'FISRT_INV_COUNTRY', 'FISRT_INV_TYPE', 'INV_NB', 'INV_NB_PAYS',
           'INV_NB_TYPE', 'LANGUAGE_OF_FILLING', 'MAIN_IPC', 'NB_BACKWARD', 'NB_BACKWARD_AUTRE',
           'NB_BACKWARD_NPL', 'NB_BACKWARD_PL', 'NB_BACKWARD_XY', 'NB_CLASSES', 'NB_FIELDS',
           'NB_ROOT_CLASSES', 'NB_SECTORS', 'TECHNOLOGIE_FIELD', 'TECHNOLOGIE_SECTOR', 'VOIE_DEPOT',
           'oecd_NB_BACKWARD_NPL', 'oecd_NB_BACKWARD_PL', 'oecd_NB_ROOT_CLASSES', 'pct_NB_IPC', 'pct_NB_IPC_LY',
           'filing-begin', 'pub-filing', 'pub_year', 'VARIABLE_CIBLE']

# Show which columns of df0_train are left out by the selection
removed = [col for col in df0_train.columns if col not in columns0]
print(removed)

# 80/20 hold-out split with a fixed seed
df_train0, df_test0 = train_test_split(df0_train[columns0], test_size=0.2, random_state=42)

# From here on columns0 lists the features only
columns0.remove('VARIABLE_CIBLE')


test_size = len(df_test0)
train_size = len(df_train0)
# Uniform per-row training weight, scaled by test_size / train_size
weight = np.ones(train_size) * float(test_size) / float(train_size)

print(df_train0.columns)

# Total weight of the positive and of the negative class in the training fold
# (optimize() uses their ratio as scale_pos_weight). Boolean masks replace the
# former per-row Python loop over .iloc[i].
is_positive = (df_train0['VARIABLE_CIBLE'] == 1.0).values
is_negative = (df_train0['VARIABLE_CIBLE'] == 0.0).values
sum_wpos = weight[is_positive].sum()
sum_wneg = weight[is_negative].sum()

['IDX_ORIGIN', 'IDX_RADIC', 'NB_BACKWARD_I', 'SOURCE_BEGIN_MONTH', 'SOURCE_CITED_AGE', 'SOURCE_IDX_ORI', 'SOURCE_IDX_RAD', 'cited_age_max', 'cited_age_mean', 'cited_age_median', 'cited_age_min', 'index_origin']
Index([u'APP_NB', u'APP_NB_PAYS', u'APP_NB_TYPE', u'COUNTRY', u'FIRST_CLASSE',
       u'FISRT_APP_COUNTRY', u'FISRT_APP_TYPE', u'FISRT_INV_COUNTRY',
       u'FISRT_INV_TYPE', u'INV_NB', u'INV_NB_PAYS', u'INV_NB_TYPE',
       u'LANGUAGE_OF_FILLING', u'MAIN_IPC', u'NB_BACKWARD',
       u'NB_BACKWARD_AUTRE', u'NB_BACKWARD_NPL', u'NB_BACKWARD_PL',
       u'NB_BACKWARD_XY', u'NB_CLASSES', u'NB_FIELDS', u'NB_ROOT_CLASSES',
       u'NB_SECTORS', u'TECHNOLOGIE_FIELD', u'TECHNOLOGIE_SECTOR',
       u'VOIE_DEPOT', u'oecd_NB_BACKWARD_NPL', u'oecd_NB_BACKWARD_PL',
       u'oecd_NB_ROOT_CLASSES', u'pct_NB_IPC', u'pct_NB_IPC_LY',
       u'filing-begin', u'pub-filing', u'pub_year', u'VARIABLE_CIBLE'],
      dtype='object')


In [14]:
# History of the hyper-parameter search on df0 is collected in this Trials object
trials = Trials()

# Hold-out fold, then training fold, as plain numpy arrays.
# These module-level names are the ones score() reads.
y_test = df_test0['VARIABLE_CIBLE'].values
X_test = df_test0[columns0].values

y_train = df_train0['VARIABLE_CIBLE'].values
X_train = df_train0[columns0].values

optimize(trials)

Training with params : 
{'colsample_bytree': 0.65, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.3265408761400996, 'nthread': 6, 'min_child_weight': 1.0, 'n_estimators': 731.0, 'subsample': 0.8500000000000001, 'eta': 0.1, 'objective': 'binary:logistic', 'max_depth': 10.0, 'gamma': 0.9500000000000001}
	Score -0.705213727025


Training with params : 
{'colsample_bytree': 0.7000000000000001, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.3265408761400996, 'nthread': 6, 'min_child_weight': 3.0, 'n_estimators': 590.0, 'subsample': 0.75, 'eta': 0.5, 'objective': 'binary:logistic', 'max_depth': 3.0, 'gamma': 0.8}
	Score -0.696043266474


Training with params : 
{'colsample_bytree': 0.9500000000000001, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.3265408761400996, 'nthread': 6, 'min_child_weight': 2.0, 'n_estimators': 290.0, 'subsample': 0.75, 'eta': 0.375, 'objective': 'binary:logistic', 'max_depth': 13.0, 'gamma': 0.55}
	Score -0.669622802133


Training with 

Training with params : 
{'colsample_bytree': 0.55, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.3265408761400996, 'nthread': 6, 'min_child_weight': 2.0, 'n_estimators': 590.0, 'subsample': 0.9, 'eta': 0.025, 'objective': 'binary:logistic', 'max_depth': 12.0, 'gamma': 0.75}
	Score -0.712056925029
    
Training with params : 
{'colsample_bytree': 0.55, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.3265408761400996, 'nthread': 6, 'min_child_weight': 1.0, 'n_estimators': 831.0, 'subsample': 0.9, 'eta': 0.025, 'objective': 'binary:logistic', 'max_depth': 11.0, 'gamma': 0.7000000000000001}
	Score -0.712033955399
    
Training with params : 
{'colsample_bytree': 0.55, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.3265408761400996, 'nthread': 6, 'min_child_weight': 1.0, 'n_estimators': 672.0, 'subsample': 0.9, 'eta': 0.025, 'objective': 'binary:logistic', 'max_depth': 12.0, 'gamma': 0.6000000000000001}
	Score -0.711943463338
    
    
Best results :
{'colsample_bytree': 0.55, 'min_child_weight': 2.0, 'n_estimators': 590.0, 'subsample': 0.9, 'eta': 0.025, 'max_depth': 12.0, 'gamma': 0.75}

## Optimization on df1

In [3]:
# Columns kept for df1_train after RFECV (the target 'VARIABLE_CIBLE' is listed last)
columns1 = ['APP_NB', 'APP_NB_PAYS', 'APP_NB_TYPE', 'COUNTRY', 'FIRST_CLASSE', 'FISRT_APP_COUNTRY',
            'FISRT_APP_TYPE', 'FISRT_INV_COUNTRY', 'FISRT_INV_TYPE', 'IDX_ORIGIN', 'IDX_RADIC', 'INV_NB',
            'INV_NB_PAYS', 'INV_NB_TYPE', 'LANGUAGE_OF_FILLING', 'MAIN_IPC', 'NB_BACKWARD', 'NB_BACKWARD_AUTRE',
            'NB_BACKWARD_NPL', 'NB_BACKWARD_PL', 'NB_BACKWARD_XY', 'NB_CLASSES', 'NB_FIELDS', 'NB_ROOT_CLASSES',
            'NB_SECTORS', 'SOURCE_BEGIN_MONTH', 'SOURCE_IDX_RAD', 'TECHNOLOGIE_FIELD', 'TECHNOLOGIE_SECTOR',
            'cited_age_max', 'cited_age_mean', 'cited_age_median', 'cited_age_min', 'cited_age_std', 'cited_n',
            'oecd_NB_BACKWARD_NPL', 'oecd_NB_BACKWARD_PL', 'oecd_NB_ROOT_CLASSES', 'pct_NB_IPC', 'pct_NB_IPC_LY',
            'filing-begin', 'pub-filing', 'pub_year', 'VARIABLE_CIBLE']

# 80/20 hold-out split with a fixed seed
df_train1, df_test1 = train_test_split(df1_train[columns1], test_size=0.2, random_state=42)

# From here on columns1 lists the features only
columns1.remove('VARIABLE_CIBLE')

test_size = len(df_test1)
train_size = len(df_train1)
# Uniform per-row training weight, scaled by test_size / train_size
weight = np.ones(train_size) * float(test_size) / float(train_size)

# Total weight of the positive and of the negative class in the training fold
# (optimize() uses their ratio as scale_pos_weight). Boolean masks replace the
# former per-row Python loop over .iloc[i].
is_positive = (df_train1['VARIABLE_CIBLE'] == 1.0).values
is_negative = (df_train1['VARIABLE_CIBLE'] == 0.0).values
sum_wpos = weight[is_positive].sum()
sum_wneg = weight[is_negative].sum()

In [None]:
# History of the hyper-parameter search on df1 is collected in this Trials object
trials = Trials()

# Hold-out fold, then training fold, as plain numpy arrays.
# These module-level names are the ones score() reads.
y_test = df_test1['VARIABLE_CIBLE'].values
X_test = df_test1[columns1].values

y_train = df_train1['VARIABLE_CIBLE'].values
X_train = df_train1[columns1].values

optimize(trials)

Training with params : 
{'colsample_bytree': 0.9, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.5966246815738532, 'nthread': 6, 'min_child_weight': 3.0, 'n_estimators': 702.0, 'subsample': 0.8500000000000001, 'eta': 0.1, 'objective': 'binary:logistic', 'max_depth': 6.0, 'gamma': 0.9}
	Score -0.690912980965


Training with params : 
{'colsample_bytree': 0.9, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.5966246815738532, 'nthread': 6, 'min_child_weight': 6.0, 'n_estimators': 427.0, 'subsample': 0.5, 'eta': 0.275, 'objective': 'binary:logistic', 'max_depth': 12.0, 'gamma': 0.65}
	Score -0.639884770707


Training with params : 
{'colsample_bytree': 0.65, 'silent': 0, 'eval_metric': 'auc', 'scale_pos_weight': 1.5966246815738532, 'nthread': 6, 'min_child_weight': 5.0, 'n_estimators': 802.0, 'subsample': 0.55, 'eta': 0.15000000000000002, 'objective': 'binary:logistic', 'max_depth': 2.0, 'gamma': 0.5}
	Score -0.690685349984


Training with params : 
{'colsample_bytree': 