# Main_surrender_modeling.py
import numpy as np
import seaborn as sns
import pandas as pd
import pickle5 as pickle
import os.path, joblib, time
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
# import functions
from functions.sub_surrender_models import Naive_Classifier, ANN_bagging, Logit_model, hpsearch_boost_ann, resample_and_shuffle
from functions.sub_utils import load_ANN_boost_object
from functions.sub_statistical_evaluation import model_evaluation, pq_plot, display_evaluation_curves, evaluate_surrender_rate
from functions.sub_surrender_profiles import get_model_input
from functions.sub_sklearn_hyperopt import map_and_clean_hparams, hyperopt_get_best_boost_ann
from HPSearch_sklearn import poly_degree_max
from global_vars import path_plots, path_tables
#---------------------
import warnings

# Globally silence warnings; introduced to suppress the eps/PostScript warnings about transparency.
warnings.filterwarnings("ignore")
#---------------------
# Use the fully qualified option name. The bare pattern 'precision' is resolved by
# partial matching and is ambiguous in newer pandas versions (1.4+), where it also
# matches 'styler.format.precision' and raises an OptionError.
pd.set_option('display.precision', 4)
# Global seaborn configuration for all figures created by this script.
# Order matters: sns.set() installs the defaults first, the two calls below then override style and context.
sns.set()
sns.set_style('ticks')
sns.set_context('paper')
# Epoch budget forwarded as N_epochs to the ANN fit calls in run_main (bagging and boosting);
# kept at the beginning of the script for better visibility.
N_epochs = 2000 # at the beginning of script for better visibility
# Validation share forwarded as val_share to the ANN fit calls in run_main.
# NOTE(review): the original remark says this value is meant to be imported from another script
# so that it is used consistently; here it is hard-coded -- confirm it matches the other scripts.
val_share = 0.3 # will be imported from other script to make sure val_share is used consistently
def run_main(surrender_profile: int, bool_load_models: bool, bool_update_plots = True, resampling = 'None'):
    '''
    Run resp. load all models for one surrender profile -> create plots and statistics.

    Models handled: naive baseline, bagged polynomial logistic regression, random forest,
    XGBoost, bagged ANN and boosted ANN. Data, hyperparameter-search results and stored
    models are read from the folder 'profile_<surrender_profile>' next to this file.

    Parameters
    ----------
    surrender_profile : int
        Index of the surrender profile; must be smaller than 4.
    bool_load_models : bool
        If True, stored LR, RF and XGBoost models are loaded when their files exist;
        otherwise they are re-trained and saved.
        NOTE(review): the two ANN ensembles are governed by the hard-coded flags
        bool_load_ann / bool_load_ann_boost below and ignore this argument -- confirm
        that this is intended.
    bool_update_plots : bool, default True
        If True, the evaluation stage is executed (mean-surrender-rate plot, pq-plot,
        ROC/PR curves, statistics table) and the results incl. training times are saved.
    resampling : str, default 'None'
        One of 'None', 'SMOTE', 'undersampling'. For the latter two, all models are
        stored in a dedicated sub-folder and output files receive a '_<resampling>' tag.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If surrender_profile or resampling is invalid, or (profile 3 only) if the
        'Time' feature of X_train is not within [-1, 1].
    '''

    assert surrender_profile <4, 'surrender_profile must be smaller than 4'
    assert resampling in ['None', 'SMOTE', 'undersampling'], 'unknown resampling type'
    print(f'Applying {resampling} resampling!')

    # training boolean-values (hard-coded switches)
    bool_load_ann = True # If true: stored bagged ANN will be loaded (if available)
    bool_load_ann_boost = True # If true: stored ANN-Boost model will be loaded (if available)
    bool_load_LR = True
    bool_save_results = True

    # general bagging & boosting hyperparams
    N_bagging = 5 # for ANN
    N_bagging_logit = 10 # for LogisticRegression

    # ANN config
    tf_strategy = tf.distribute.MirroredStrategy()
    early_stopping = EarlyStopping(monitor= 'val_loss', mode = 'min', patience= 25, restore_best_weights= True)

    # path variables: data, hyperparameter trials and models all live below the profile folder
    cwd = os.path.dirname(os.path.realpath(__file__))
    path_profile = os.path.join(cwd, r'profile_{}'.format(surrender_profile))
    path_hparams = path_profile
    path_data = path_profile
    path_save_models = os.path.join(path_profile, r'models')
    path_save_models_boosting = os.path.join(path_save_models, r'Boosting')
    path_save_models_bagging = os.path.join(path_save_models, r'Bagging')

    # adjust path if resampling is applied: bagging and boosting models then share one folder
    resampling_subdir = {'SMOTE': r'SMOTE', 'undersampling': r'Undersampling'}
    if resampling in resampling_subdir:
        path_save_models = os.path.join(path_save_models, resampling_subdir[resampling])
        path_save_models_bagging, path_save_models_boosting = path_save_models, path_save_models

    # import Training and Test Data
    X_train = pd.read_csv(os.path.join(path_data,'X_train.csv'), index_col= 0)
    X_test = pd.read_csv(os.path.join(path_data,'X_test.csv'), index_col= 0)
    y_train = pd.read_csv(os.path.join(path_data, 'y_train.csv'), index_col= 0).values.flatten()
    y_test = pd.read_csv(os.path.join(path_data,'y_test.csv'), index_col= 0 ).values.flatten()
    X_train_raw = pd.read_csv(os.path.join(path_data,'X_train_raw.csv'), index_col= 0)
    X_test_raw = pd.read_csv(os.path.join(path_data,'X_test_raw.csv'), index_col= 0)

    if surrender_profile == 3:
        # manually check whether time-feature has been scaled
        # Note: although this represents non-stationary noise, its scale might prevent iterative-solvers from converging
        assert np.min(X_train['Time'])>=-1 and np.max(X_train['Time'])<=1, 'X_train["Time"] has not been scaled properly'

    # restrict data to relevant features -> assume proper exploratory data analysis
    features_profile_lst = list(get_model_input(surrender_profile))
    X_train, X_test = X_train[features_profile_lst], X_test[features_profile_lst]

    # record times, e.g. for mean lapse ratio later on
    # Note: We keep 'Time' feature in the data set X_raw, as it does not make a difference; raw-data are not used for modeling
    times_train, times_test = X_train_raw['Time'], X_test_raw['Time']

    # Load Scaling range used in 'Lapse_data_preparation' for later visualization
    with open(os.path.join(cwd,'dict_range_scale_{}.pkl'.format(surrender_profile)), 'rb') as f:
        dict_range_scale= pickle.load(f)

    # Load beta0 of latent surrender model for later visualization
    with open(os.path.join(path_data, 'beta0.pkl'), 'rb') as f:
        beta0 = pickle.load(f)
    print('Implied beta0: ', beta0)

    # make sure the model folders exist, such that a (long) training run cannot fail when saving
    for path_dir in {path_save_models_bagging, path_save_models_boosting}:
        os.makedirs(path_dir, exist_ok=True)

    # training times measured during this call; key present <=> model has been (re-)trained resp. timed
    new_times = {}

    # Baseline Model - Constant surrender probability
    Baseline = Naive_Classifier(rate=sum(y_train)/len(y_train))
    print('Naive Classifier (Baseline) constructed.')

    ###------------------------- Polynomial LR ##-------------------------
    print('\n Constructing bagged LR estimator ...')
    path_LR = os.path.join(path_save_models_bagging, r'LR.pkl')
    if os.path.exists(path_LR) and bool_load_models and bool_load_LR:
        with open(path_LR, 'rb') as file:
            LR_poly_bag = pickle.load(file)
        print('\t LR estimator (bag) loaded.')
    else:
        trials = joblib.load(os.path.join(path_hparams, r'hyperopt_logit.pkl'))
        params = map_and_clean_hparams(trials, 'logit')
        time_LR = time.time()
        LR_poly_bag = Logit_model(params = params, poly_degrees = [poly_degree_max]*X_train.shape[1], N_bag = N_bagging_logit, resampler= resampling).fit(X_train, y_train)
        time_LR = time.time()-time_LR
        new_times['LR'] = time_LR
        print('Training time of Logit Bagging: ', time_LR)
        print('__________________________________________________________________________', '\n')
        with open(path_LR, 'wb') as file:
            pickle.dump(LR_poly_bag, file)

    ###------------------------- Random Forest ##-------------------------
    print('\n Constructing Random Forest estimator ...')
    path_RF = os.path.join(path_save_models_bagging, r'RF.pkl')
    if os.path.exists(path_RF) and bool_load_models:
        with open(path_RF, 'rb') as file:
            RF = pickle.load(file)
        print('\t ... RF model loaded')
    else:
        trials = joblib.load(os.path.join(path_hparams, r'hyperopt_rf.pkl'))
        params = map_and_clean_hparams(trials, 'rf')
        time_RF = time.time()
        RF = RandomForestClassifier(**params)
        if resampling != 'None':
            X_res, y_res = resample_and_shuffle(X_train, y_train, resample_type= resampling)
        else:
            X_res, y_res = X_train, y_train
        RF.fit(X_res, y_res)
        time_RF = time.time()-time_RF
        new_times['RF'] = time_RF
        print('Training time RF: ', time_RF)
        with open(path_RF, 'wb') as file:
            pickle.dump(RF, file)
        print('__________________________________________________________________________', '\n')

    ###------------------------- XGBoost classifier ##-------------------------
    print('\n Constructing XGB estimator ...')
    path_xgb = os.path.join(path_save_models_boosting, r'xgb.model')
    # hyperparams are required in both cases: to configure the estimator before loading as well as for training
    trials = joblib.load(os.path.join(path_hparams, r'hyperopt_xgboost.pkl'))
    params = map_and_clean_hparams(trials, 'xgboost')
    if os.path.exists(path_xgb) and bool_load_models:
        xgb = XGBClassifier(**params)
        xgb.load_model(path_xgb)
        print('\t ... XGB loaded')
    else:
        time_xgb = time.time()
        xgb = XGBClassifier(**params)
        if resampling != 'None':
            X_res, y_res = resample_and_shuffle(X_train, y_train, resample_type= resampling)
        else:
            X_res, y_res = X_train, y_train
        xgb.fit(X_res, y_res)
        time_xgb = time.time()-time_xgb
        new_times['XGB'] = time_xgb
        print('Training time of XGB: ', time_xgb)
        xgb.save_model(path_xgb)
        print('__________________________________________________________________________', '\n')

    ##------------------------- Neural Network - binary crossentropy ##-------------------------
    print('\n Constructing bagged NN (bc) estimator ...')
    trials = joblib.load(os.path.join(path_hparams, r'hyperopt_ann.pkl'))
    params = map_and_clean_hparams(trials, 'ann', surrender_profile= surrender_profile)
    print('\t hyperparams: ', params)
    NN_bc_bag = ANN_bagging(N_models = N_bagging, hparams=params, tf_dist_strat= tf_strategy, resampler = resampling)

    # existence of the first ensemble member is used as indicator for a stored ensemble
    if os.path.exists(os.path.join(path_save_models_bagging, r'NN_bc_bag_0.h5')) and bool_load_ann:
        for i in range(N_bagging):
            # load existing config
            NN_bc_bag.model[i] = load_model(os.path.join(path_save_models_bagging,r'NN_bc_bag_{}.h5'.format(i)), compile=False)
            NN_bc_bag.model[i].compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['acc'] )
        # important: link ensemble-object to new parametrization
        NN_bc_bag.re_init_ensemble()
        print('\t NN (bc) loaded!')
    else:
        time_NN_bag = time.time()
        # note: resampling-scheme implicitly set at initialization
        NN_bc_bag.fit(X_train = X_train, y_train=y_train, callbacks = [early_stopping], val_share = val_share, N_epochs = N_epochs)
        time_NN_bag=time.time()-time_NN_bag
        new_times['NN_bag'] = time_NN_bag
        print('Training time of NN_bag: ', time_NN_bag)
        # Save NN-model
        for i in range(N_bagging):
            NN_bc_bag.model[i].save(os.path.join(path_save_models_bagging,r'NN_bc_bag_{}.h5'.format(i)))

    # Perform boosting of ANN
    print('\n Constructing boosted NN (bc) estimator ...')
    path_NN_boost = os.path.join(path_save_models_boosting, r'NN_bc_boost.pkl')
    if os.path.exists(path_NN_boost) and bool_load_ann_boost:
        with tf_strategy.scope():
            NN_bc_boost= load_ANN_boost_object(path=path_NN_boost)
        print('\t ... NN boosting object loaded!')
    else:
        if resampling == 'None':
            # extract values from HPSearch
            try:
                with tf_strategy.scope():
                    NN_bc_boost, time_NN_boost = hyperopt_get_best_boost_ann(path_hparams)
                    NN_bc_boost.restore_learners() # helper-function which allows ANN-object to be pickled and then to be restored
                    print('\t ... loaded best ANN_boost from the hypersearch within tf-distribution.')
            except Exception:
                # best-effort fallback: retry outside of the tf-distribution scope
                # (KeyboardInterrupt/SystemExit are deliberately not caught)
                NN_bc_boost, time_NN_boost = hyperopt_get_best_boost_ann(path_hparams)
                NN_bc_boost.restore_learners() # helper-function which allows ANN-object to be pickled and then to be restored
                print('\t ... loaded best ANN_boost from the hypersearch.')
            NN_bc_boost.save_object(path=path_NN_boost)
        else:
            # actually train model, based on non-resampling HParams (computational necessity and otherwise HPSearch very seed dependent)
            trials = joblib.load(os.path.join(path_hparams, r'hyperopt_boost_ann.pkl'))
            params = map_and_clean_hparams(trials, 'boost_ann', surrender_profile= surrender_profile)
            print('\t hyperparams: ', params)
            # init model, incl. resampling-scheme
            NN_bc_boost = hpsearch_boost_ann(resampler = resampling, tf_dist_strat= tf_strategy, **params)
            tic = time.time()
            NN_bc_boost.fit(x=X_train, y = y_train, N_epochs=N_epochs, N_batch=params['batch_size'], val_share=val_share, callbacks=[early_stopping], correction_freq=params['n_boosting']+1)
            time_NN_boost = time.time()-tic
            NN_bc_boost.save_object(path=path_NN_boost)
        new_times['NN_boost'] = time_NN_boost

    if bool_update_plots:
        # suffix of all output files, indicating the resampling scheme
        if resampling != 'None':
            tag = '_' + resampling
        else:
            tag = ''

        # models for evaluation; the baseline is only part of the msr-plot and the statistics table
        models_all = [Baseline, LR_poly_bag, RF, xgb, NN_bc_bag, NN_bc_boost]
        names_all = ['Baseline', 'Logist. Regr.', 'Random Forest', 'XGBoost', 'NN - bagging','NN - boosting']

        t_start_eval = time.time()
        print('\n Plotting mean surrender rate incl. CIs:')
        # pd.concat replaces Series.append/DataFrame.append, which are no longer available in pandas >= 2.0
        evaluate_surrender_rate(times = pd.concat([times_train, times_test]), X=pd.concat([X_train, X_test]), y=np.concatenate((y_train,y_test)), data_split = len(np.unique(times_train)),
                                model_lst = list(models_all),
                                model_names_lst = list(names_all),
                                path = os.path.join(path_plots, r'{}_msr_boost{}'.format(surrender_profile, tag)))
        print('Time msr calculation: ', time.time()-t_start_eval)

        t_start_eval = time.time()
        print('\n', 'pq-plot of predicted vs. actual lapse probability:')
        pq_plot(x_scal=X_test, x_raw = X_test_raw,
                model_lst = models_all[1:],
                model_names_lst = names_all[1:],
                beta0=beta0, profile = surrender_profile, path= os.path.join(path_plots, r'{}_pq_boost{}'.format(surrender_profile, tag)))
        print('Time pq calculation: ', time.time()-t_start_eval)

        t_start_eval = time.time()
        print('\n','ROC and RP curve for vanilla estimators')
        display_evaluation_curves(x=X_test, y=y_test,
                                  predictors_lst= models_all[1:],
                                  predictors_name_lst= names_all[1:],
                                  curve_type= 'both', figsize= (8,3),
                                  path= os.path.join(path_plots, r'{}_roc_boost{}'.format(surrender_profile,tag)))
        print('Time ROC calculation: ', time.time()-t_start_eval)

        t_start_eval = time.time()
        print('Computing statistics for Boosted model')
        eval_boost = model_evaluation(X_train=X_train, X_test=X_test, X_train_raw= X_train_raw, X_test_raw = X_test_raw,
                                      y_train=y_train, y_test=y_test,
                                      model_lst = list(models_all),
                                      model_names_lst = list(names_all),
                                      beta0_true= beta0, dict_range_scale=dict_range_scale,
                                      profile = surrender_profile)
        print(eval_boost)
        print('Time df-stats calculation: ', time.time()-t_start_eval)

        if bool_save_results:
            with open(os.path.join(path_tables,r'{}_stats_boost{}.tex'.format(surrender_profile, tag)),'w') as f:
                f.write(eval_boost.to_latex())

            # Create, Update and/or save training times
            # NOTE(review): training times are only persisted when bool_update_plots (and bool_save_results)
            # are set -- confirm that this nesting is intended.
            path_times = os.path.join(path_tables, r'{}_training_times{}.pkl'.format(surrender_profile, tag))
            try:
                # if file exists and is not empty
                with open(path_times, 'rb') as f:
                    dict_times = pickle.load(f)
                print('Training times could be loaded!')
            except Exception:
                # best effort: missing, empty or unreadable file -> start from scratch
                dict_times = {'LR': None, 'RF': None, 'XGB': None, 'NN_bag': None, 'NN_boost': None}
                print('Training times had to be re-initialized!')

            # overwrite only those entries for which a new training time has been recorded in this call
            for key, label in (('RF', 'RF'), ('XGB', 'XGBoost'), ('LR', 'LR'), ('NN_boost', 'NN-boost'), ('NN_bag', 'NN-bag')):
                if key in new_times:
                    dict_times[key] = new_times[key]
                    print('Training time of {} updated!'.format(label))

            with open(path_times, 'wb') as f:
                pickle.dump(dict_times, f, pickle.HIGHEST_PROTOCOL)
if __name__ == '__main__':
    # run configuration: re-use stored models where available, skip the plot/evaluation stage
    loadExistingModels = True
    updatePlots = False

    # every (resampling scheme, surrender profile) combination; the resampling scheme varies slowest
    run_grid = [(scheme, profile)
                for scheme in ('None', 'SMOTE', 'undersampling')
                for profile in range(4)]

    for res_type, i in run_grid:
        run_main(surrender_profile=i, bool_load_models=loadExistingModels, bool_update_plots=updatePlots, resampling= res_type)