In [None]:
import pickle
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from .utils import  *
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): `plt` is not imported explicitly in this cell; presumably it is
# provided by the wildcard import from `.utils` — TODO confirm.
plt.style.use('fivethirtyeight')

## Part 2: Supervised Learning Model


## Load and clean the train and test datasets

In [None]:
# Read the semicolon-separated mailout train and test CSV files.
mailout_train = pd.read_csv('data/mail_train.csv', sep=';')
mailout_test = pd.read_csv('data/mail_test.csv', sep=';')

# Per-column fraction of missing values for each split.
train_missing_fraction = mailout_train.isnull().sum() / len(mailout_train)
test_missing_fraction = mailout_test.isnull().sum() / len(mailout_test)

# Author's observation: most of the features have less than 30% missing
# values, similar to the azdias and customers datasets.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases.
sns.distplot(train_missing_fraction)
sns.distplot(test_missing_fraction)

In [None]:
# Attribute metadata workbook (first row skipped); passed to `unknown_unify`.
attributes = pd.read_excel('data/attributes.xlsx', engine='openpyxl', skiprows=1)

# Train split: `unknown_unify` followed by `feature_transform`.
# NOTE(review): both helpers come from `.utils`; presumably the first
# normalises "unknown" value codes using `attributes` — TODO confirm.
mailout_train = unknown_unify(mailout_train, attributes)
X = feature_transform(mailout_train)

# Test (submission) split: same two steps, in the same order.
mailout_test = unknown_unify(mailout_test, attributes)
X_sub = feature_transform(mailout_test)

# Separate the target from the training features; `pop` returns the column
# and removes it from X in place.
y = X.pop('RESPONSE')

print(X.shape, X_sub.shape)

In [None]:
# Class balance of the target: absolute counts as bars, relative shares printed.
response_counts = y.value_counts()
sns.barplot(x=response_counts.index, y=response_counts)
plt.xlabel('RESPONSE')
plt.ylabel('Count')
print(response_counts / y.count())


### Test the initial models for overall performance based on the competition metric (ROC AUC)

In [None]:
# Candidate classifiers with default hyperparameters; every one of them
# supports probabilistic prediction.
model_list = {
    'LR': LogisticRegression(),
    'RF': RandomForestClassifier(),
    'LGBM': LGBMClassifier(),
    'XGB': XGBClassifier(),
}

# Evaluate each candidate inside the same impute -> scale -> classify pipeline.
# NOTE(review): `plot_roc` comes from `.utils`; presumably it draws each ROC
# curve onto the current figure — TODO confirm.
fig = plt.figure(figsize=(10, 10))
for model_name, classifier in model_list.items():
    pipeline = Pipeline([
        ('impute', SimpleImputer(strategy='constant', fill_value=-1)),
        ('scale', StandardScaler()),
        ('clf', classifier),
    ])
    plot_roc(pipeline, X, y, model_name)

plt.show()

### Results:

With default parameters, LGBMClassifier clearly gives the best results. However, after some parameter tuning and submission to the competition, XGB actually achieved higher scores, so I will continue with Bayesian optimization of XGB.


In [None]:
# Baseline XGBoost pipeline: fill missing values with the constant -1, then
# fit an XGBClassifier with default hyperparameters.
xgb_steps = [
    ('impute', SimpleImputer(strategy='constant', fill_value=-1)),
    ('clf', XGBClassifier()),
]
pipeline = Pipeline(xgb_steps)

In [None]:
# Pipeline to be tuned (alias of the pipeline built in the previous cell).
xgbc_ml_pipe_bayes = pipeline

# Search space for the 'clf' step; the `clf__` prefix routes each entry to the
# classifier inside the pipeline.
bayes_search_space = dict(
    clf__booster=Categorical(['gbtree', 'dart']),
    clf__learning_rate=Real(0.01, 0.3),
    clf__gamma=Integer(0, 100),
    clf__min_child_weight=Integer(0, 10),
    clf__reg_lambda=Integer(1, 100),
    clf__reg_alpha=Integer(0, 100),
    clf__tree_method=Categorical(['auto', 'hist']),
    clf__max_depth=Integer(2, 7),
)

# Defining function to display scores:
def show_score(optim_result, target_score=0.81):
    '''
    Print the best ROC_AUC found so far during Bayesian optimization and
    request an early stop once it reaches ``target_score``.

    The score is read from ``optim_result`` instead of from the global search
    object: ``best_score_`` may not be set yet while the search is still
    running, whereas ``optim_result.fun`` is always available to the callback.

    Parameters
    ----------
    optim_result : object with a ``fun`` attribute
        Optimization result passed to the callback after each step. ``fun``
        holds the lowest objective value seen so far, which for BayesSearchCV
        is the negated cross-validation score.
    target_score : float, default 0.81
        ROC_AUC value at which the search should stop early.

    Returns
    -------
    bool
        True to stop the search (target reached), False to continue.
    '''
    # The optimizer minimises the negated score, so flip the sign back.
    score = -optim_result.fun
    print('Best ROC_AUC Score:{}'.format(score))

    # Early stop:
    if score >= target_score:
        print('At least {} ROC_AUC score achieved!'.format(target_score))
        return True

    return False

# Bayesian hyperparameter search (not a grid search) over the ML pipeline.
# An integer `cv` with a classifier gives stratified 5-fold splits; only
# 10 iterations are run to keep the example fast.
xgbc_bayes_clf = BayesSearchCV(
    xgbc_ml_pipe_bayes,
    bayes_search_space,
    scoring='roc_auc',
    cv=5,
    verbose=2,
    n_iter=10,
    random_state=42,  # fixed seed so the sampled candidates are reproducible
)

# Training model; `show_score` is called after every optimization step and can
# stop the search early by returning True.
xgbc_bayes_clf.fit(X, y, callback=show_score)

In [None]:
# NOTE(review): disabled cell — commented-out copy of the baseline XGB
# pipeline (imputer + default XGBClassifier) that is already built earlier in
# the notebook.
# pipeline = Pipeline([('impute', SimpleImputer(strategy= 'constant', fill_value = -1)),
#                      ('clf', XGBClassifier())
#                     ])

#### Bayesian Optimization of XGB hyperparameters.

Each candidate is evaluated with 5-fold StratifiedKFold cross-validation. For the sake of example, and to keep the notebook as clean as possible, only 10 iterations are run here.

In [None]:
# NOTE(review): this whole cell is commented out and does not run. The keys of
# its search space match the keys of the `parameters` dict used in the next
# code cell, so presumably this is the search that produced those values —
# TODO confirm. If re-enabled, note that a bare `%time` line times an empty
# statement; `%%time` at the top of the cell is likely what was intended.
#
# ITERATIONS = 10
# SEED = 42
#
# bayes_cv_tuner_xg = BayesSearchCV(
#     estimator = pipeline,
#     search_spaces = {
#         'clf__learning_rate': (0.001, 0.9, 'log-uniform'),
#         'clf__max_depth': (2, 10),
#         'clf__min_child_weight': (1, 10),
#         'clf__gamma': (0.0, 1.0, 'uniform'),
#         'clf__subsample': (0.5, 1.0, 'uniform'),
#         'clf__colsample_bytree': (0.5, 1.0, 'uniform'),
#         'clf__reg_alpha': (1e-9, 1.0, 'log-uniform'),
#         'clf__n_estimators': (50, 500),
#         'clf__scale_pos_weight': (1,90)
#
#     },
#     scoring = 'roc_auc',
#     cv = StratifiedKFold(
#         n_splits=5,
#         shuffle=True,
#         random_state= SEED
#     ),
#     n_jobs = -1,
#     n_iter = ITERATIONS,
#     verbose = 0,
#     refit = True,
#     random_state = np.random.RandomState(50)
# )
#
# def status_print(optim_result):
#     """Status callback during Bayesian hyperparameter search"""
#
#     # Get all the models tested so far in DataFrame format
#     all_models = pd.DataFrame(bayes_cv_tuner_xg.cv_results_)
#
#     # Get current parameters and the best parameters
#     best_params = pd.Series(bayes_cv_tuner_xg.best_params_)
#
#     print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
#         len(all_models),
#         np.round(bayes_cv_tuner_xg.best_score_, 4),
#         bayes_cv_tuner_xg.best_params_
#     ))
#
#     # Save all model results
#     clf_name = bayes_cv_tuner_xg.estimator.__class__.__name__
#     all_models.to_csv(clf_name+"_cv_results.csv")
#
# %time
# result_xg = bayes_cv_tuner_xg.fit(X, y, callback=status_print)

In [None]:
# Optimized algorithm: fixed XGBoost hyperparameters.
# NOTE(review): presumably the best values found by an earlier Bayesian
# search run — TODO confirm.
parameters = {
    'colsample_bytree': 0.6888348010130712,
    'gamma': 0.43702345115978325,
    'learning_rate': 0.0030871248366675184,
    'max_depth': 3,
    'min_child_weight': 5,
    'n_estimators': 376,
    'reg_alpha': 0.9682055105813826,
    'scale_pos_weight': 86,
    'subsample': 0.9162583886152913,
}

# Same imputation step as before, now feeding the tuned classifier.
pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value=-1)),
    ('clf', XGBClassifier(**parameters)),
])

# Fresh figure for the tuned model's ROC curve. The stray bare `fig`
# expression was removed: it displayed nothing and made this cell depend on an
# earlier cell having been run.
fig1 = plt.figure(figsize=(10, 10))
plot_roc(pipeline, X, y, 'XGB')


In [None]:
# Saving X (training features); the context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'X.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(X, pickle_file)

In [None]:
# Saving X_sub (test features); the context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'X_sub.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(X_sub, pickle_file)

In [None]:
# Saving y (training target); the context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'y.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(y, pickle_file)

In [None]:
# Saving mailout_test; the context manager guarantees the file is flushed and
# closed even if pickling fails.
filename = 'mailout_test.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(mailout_test, pickle_file)

In [None]:
# Saving parameters (XGBoost hyperparameter dict); the context manager
# guarantees the file is flushed and closed even if pickling fails.
filename = 'parameters.pkl'
with open(filename, 'wb') as pickle_file:
    pickle.dump(parameters, pickle_file)


