In [None]:
# Importing libraries:

from .utils import *
import joblib


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as Pipeline_imb

from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

from lightgbm import LGBMClassifier

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# magic word for producing visualizations in notebook
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



In [None]:
# Read the two semicolon-separated demographic tables from the data folder:
azdias, customers = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('../data/azdias.csv', '../data/customers.csv')
)











In [None]:
# Creating a list of variables that reach the threshold for nan values:
nan_threshold = 0.35 # 35%

# Nan proportion, using general population dataframe (more observations):
var_nan_prop = azdias.isnull().mean()

# List of column names whose nan proportion is at or above the threshold.
# A boolean mask is used instead of var_nan_prop[i]: var_nan_prop is indexed by
# column name, and integer keys on such a Series rely on a deprecated positional
# fallback in pandas.
nan_list = var_nan_prop[var_nan_prop >= nan_threshold].index.tolist()

print('{} columns with more than {}% of nan values.'.format(len(nan_list), nan_threshold*100))

In [None]:
# Helper that removes the listed high-nan columns from a dataframe:
def eliminate_nan_columns(df, nan_cols = nan_list):
    '''
    Drop the columns named in nan_cols from df.

    The drop is performed in place, so the dataframe handed in by the caller
    is itself modified; the same object is returned as well.

    Inputs:
    df: dataframe to be modified;
    nan_cols: list of column names to be deleted. The default is the nan_list
              object that exists at the moment this definition is executed.

    Output:
    df: the same dataframe, now without the nan_cols columns.
    '''
    # axis = 'columns' makes the labels refer to column names:
    df.drop(labels = nan_cols, axis = 'columns', inplace = True)

    return df

## Part 2: Supervised Learning Model<a name="part2"></a>

Now that you've found which parts of the population are more likely to be customers of the mail-order company, it's time to build a prediction model. Each of the rows in the "MAILOUT" data files represents an individual that was targeted for a mailout campaign. Ideally, we should be able to use the demographic information from each individual to decide whether or not it will be worth it to include that person in the campaign.

The "MAILOUT" data has been split into two approximately equal parts, each with almost 43 000 data rows. In this part, you can verify your model with the "TRAIN" partition, which includes a column, "RESPONSE", that states whether or not a person became a customer of the company following the campaign. In the next part, you'll need to create predictions on the "TEST" partition, where the "RESPONSE" column has been withheld.

In [None]:
# Reading the labelled mailout data (semicolon-separated CSV):
mailout_train = pd.read_csv(filepath_or_buffer = '../data/mail_train.csv', sep = ';')

# Alternative kept for reference: restoring a previously stored copy.
#mailout_train = joblib.load('train')

# Preview of the first five rows:
mailout_train.head(n = 5)

In [None]:
# Counting how many rows fall in each RESPONSE class:
mailout_train['RESPONSE'].value_counts()

### 2.1 Data Transformation<a name="datatrans"></a>

For the supervised learning task, the strategy will be less conservative, eliminating as few features as possible. Because of that, the process of selecting columns will be simplified, increasing the threshold percentage for *nan* values.

In [None]:
# Joining nan and unknown values on the train set.
# NOTE(review): join_nan_with_unknown is not defined in this notebook; presumably
# it is provided by the `.utils` star import - confirm.
mailout_train = join_nan_with_unknown(mailout_train)

In [None]:
# Creating a list of variables that reach the (higher) threshold for nan values:
new_nan_threshold = 0.5 # 50%

# Nan proportion, computed on the mailout train dataframe:
var_nan_prop = mailout_train.isnull().mean()

# List of column names whose nan proportion is at or above the threshold.
# A boolean mask is used instead of var_nan_prop[i]: var_nan_prop is indexed by
# column name, and integer keys on such a Series rely on a deprecated positional
# fallback in pandas.
new_nan_list = var_nan_prop[var_nan_prop >= new_nan_threshold].index.tolist()

print('{} columns with more than {}% of nan values.'.format(len(new_nan_list), new_nan_threshold*100))

In [None]:
# Extending a copy of dtypes_dict with the dtype ('cat' or 'num') of extra columns.
# The pairs are applied in the listed order through plain item assignment.
new_dtypes_dict = dtypes_dict.copy()

for extra_feat_name, extra_feat_kind in (
    ('AGER_TYP', 'cat'),
    ('D19_GESAMT_ANZ_24', 'num'),
    ('D19_GESAMT_DATUM', 'num'),
    ('D19_GESAMT_OFFLINE_DATUM', 'num'),
    ('D19_GESAMT_ONLINE_DATUM', 'num'),
    ('D19_KONSUMTYP', 'cat'),
    ('D19_KONSUMTYP_MAX', 'num'),
    ('D19_SONSTIGE', 'num'),
    ('D19_SOZIALES', 'num'),
    ('D19_VERSAND_DATUM', 'num'),
    ('D19_VERSAND_OFFLINE_DATUM', 'num'),
    ('D19_VOLLSORTIMENT', 'num'),
    ('EXTSEL992', 'num'),
    ('GEBURTSJAHR', 'num'),
):
    new_dtypes_dict[extra_feat_name] = extra_feat_kind

In [None]:
# Keeping the target column aside before it is dropped from the features:
y = mailout_train['RESPONSE']

In [None]:
def supervised_data_transformation(df, test_set):
    '''
    Run the data preparation steps that come before the nan treatment.

    Input:
    df: original dataframe;
    test_set: boolean. When True, nan and unknown values are joined first and
              only the identifier/date columns are dropped. When False, that
              join is skipped and the RESPONSE column is dropped as well.

    Output:
    df: transformed dataframe.
    '''
    # Columns discarded for both sets:
    drop_cols = ['LNR', 'EINGEFUEGT_AM', 'EINGEZOGENAM_HH_JAHR']

    if not test_set:
        # The train set also carries the target column:
        drop_cols.append('RESPONSE')
    else:
        # Joining nan and unknown values:
        df = join_nan_with_unknown(df)

    # Removing the columns in new_nan_list, then the discarded columns:
    without_nan_cols = eliminate_nan_columns(df, nan_cols = new_nan_list)
    trimmed = without_nan_cols.drop(columns = drop_cols)

    # Casting dtypes, followed by both feature engineering steps:
    typed = change_dtypes(trimmed, dtypes = new_dtypes_dict)
    engineered = feature_engineer(typed)

    return transform_cameo_deu(engineered)

In [None]:
# Applying data transformations on train set. With test_set = False the RESPONSE
# column is dropped from the features; it was stored in `y` beforehand:
mailout_train = supervised_data_transformation(mailout_train, test_set = False)

Next step, different pipelines will be built in order to treat *nan* values differently, according to column dtype.

In [None]:
# Creating dtypes list:
numerical_cols_list = list()
binary_cols_list = list()
categorical_cols_list = list()

# Appending columns to lists:
for col in mailout_train.columns:
    try:
        dtype = new_dtypes_dict[col]
    except KeyError:
        # Only a missing key triggers the fallback dictionary; any other error
        # (including a keyboard interrupt) is no longer swallowed.
        dtype = new_feat_dtypes_dict[col]

    # Anything that is neither 'num' nor 'cat' is treated as binary:
    if dtype == 'num':
        numerical_cols_list.append(col)
    elif dtype == 'cat':
        categorical_cols_list.append(col)
    else:
        binary_cols_list.append(col)

In [None]:
# Column-wise preprocessing: nan values are imputed according to the column dtype.

# Numerical features: median imputation followed by StandardScaler.
num_features = numerical_cols_list
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler()),
])

# Categorical features: most frequent value imputation, then one-hot encoding.
cat_features = categorical_cols_list
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('cat_ohe', OneHotEncoder(handle_unknown = 'ignore')),
])

# Binary features: only the columns holding at least one nan are selected here,
# and they receive the same imputation + one-hot encoding treatment.
# NOTE(review): binary columns without nan are not listed in any transformer
# below, so with ColumnTransformer's default `remainder` they would not reach
# the model - confirm this is intended.
bin_features = list(
    mailout_train[binary_cols_list].columns[mailout_train[binary_cols_list].isnull().any()]
)
bin_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('bin_ohe', OneHotEncoder(handle_unknown = 'ignore')),
])

# Bundling the three transformations, each applied to its own column list:
preproc = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
    ('bin', bin_transformer, bin_features),
])

### 2.2 Analyzing Learning Curves<a name="learningcurve"></a>

To better understand which algorithms would be a better choice, the learning curves related to a few algorithms are going to be drawn:

In [None]:
# Helper used to draw the learning curve of a given model:
def plot_learning_curves(X, y, model, steps):
    '''
    Compute and plot the learning curve (ROC AUC) of the given model.

    The average train and validation scores obtained with the largest
    training size are also printed.

    Input:
    X: predictive features;
    y: target feature;
    model: instantiated object of the algorithm to be trained;
    steps: integer, number of training sizes evenly spaced from 10% to 100%.
    '''
    # Fractions of the training data to be evaluated:
    fractions = np.linspace(0.1, 1.0, steps)

    # sklearn learning_curve; the absolute sizes it returns first are not needed:
    _, scores_train, scores_valid = learning_curve(model, X, y,
                                                   train_sizes = fractions,
                                                   scoring = 'roc_auc')

    # Averaging over the second axis: one value per training size.
    mean_train = scores_train.mean(axis = 1)
    mean_valid = scores_valid.mean(axis = 1)

    # Scores for the largest training size:
    print("ROC_AUC train score: {:.2f}".format(mean_train[-1]))
    print("ROC_AUC valid. score: {:.2f}".format(mean_valid[-1]))

    # Both curves share one axis, with the training size shown as a percentage:
    fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (15, 5))
    percentages = fractions * 100

    train_ax = sns.lineplot(x = percentages, y = mean_train,
                            marker = 'o', color = 'tomato',
                            label = 'Train', ax = ax)
    train_ax.set(xlabel = 'Training Set Percentage',
                 ylabel = 'Model Score',
                 title = 'Learning Curve')

    sns.lineplot(x = percentages, y = mean_valid,
                 marker = 'o', color = 'springgreen',
                 label = 'Validation', ax = ax)

    sns.despine(left=True, top = True)

    fig.show()

In [None]:
# XGBClassifier

# Print classifier's name:
print('XGBClassifier')

# Defining machine learning pipeline (classifier seeded for reproducibility):
ml_pipe = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', XGBClassifier(random_state = 301))
])

# Creating a randomized version of the train dataframe. The fixed random_state
# makes the shuffle, and therefore the curve below, repeatable across runs:
df = mailout_train.copy()
df['RESPONSE'] = list(y.values)
rand_df = df.sample(frac = 1, random_state = 301)

# Creating X_train and y_train objects:
rand_y = rand_df['RESPONSE']
rand_X = rand_df.drop(['RESPONSE'], axis = 1)

# Plotting learning curve:
plot_learning_curves(rand_X, rand_y, ml_pipe, 10)

Clearly, the learning curves are not converging: the average score on the training set stands in high values, while validation scores are poor.

It means that the **XGBClassifier** is overfitting, and the model is not actually learning or generalizing, explaining the low scores in the validation set.

In [None]:
# GradientBoosting

# Print classifier's name:
print('GradientBoostingClassifier')

# Defining machine learning pipeline (classifier seeded for reproducibility):
ml_pipe = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', GradientBoostingClassifier(random_state = 301))
])

# Creating a randomized version of the train dataframe. The fixed random_state
# makes the shuffle, and therefore the curve below, repeatable across runs:
df = mailout_train.copy()
df['RESPONSE'] = list(y.values)
rand_df = df.sample(frac = 1, random_state = 301)

# Creating X_train and y_train objects:
rand_y = rand_df['RESPONSE']
rand_X = rand_df.drop(['RESPONSE'], axis = 1)

# Plotting learning curve:
plot_learning_curves(rand_X, rand_y, ml_pipe, 10)

On the other hand, **GradientBoostingClassifier** represents a better option, since the learning curves are converging and the validation score consistently improves as the algorithm receives more data.

In [None]:
# AdaBoostClassifier

# Print classifier's name:
print('AdaBoostClassifier')

# Defining machine learning pipeline (classifier seeded for reproducibility):
ml_pipe = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', AdaBoostClassifier(random_state = 301))
])

# Creating a randomized version of the train dataframe. The fixed random_state
# makes the shuffle, and therefore the curve below, repeatable across runs:
df = mailout_train.copy()
df['RESPONSE'] = list(y.values)
rand_df = df.sample(frac = 1, random_state = 301)

# Creating X_train and y_train objects:
rand_y = rand_df['RESPONSE']
rand_X = rand_df.drop(['RESPONSE'], axis = 1)

# Plotting learning curve:
plot_learning_curves(rand_X, rand_y, ml_pipe, 10)

**AdaBoostClassifier** shows a pattern similar to that of the GradientBoostingClassifier. However, its validation score was not as good.

Considering the models that didn't overfit, **GradientBoostingClassifier** seems a better option:
* learning curves seem to keep converging, showing perspective of improvements;
* validation score achieved higher values, indicating that it performs better on unseen data.

### 2.3 Training Classifier<a name="training"></a>

Now that the learning curve was observed for different algorithms, and the *GradientBoostingClassifier* was chosen as a better option, a few steps will be followed:
* defining data pipeline;
* setting different parameters for model tuning;
* *GridSearchCV* to optimize parameters' combination.

Since the data is highly unbalanced, the evaluation metric will be the `roc_auc` score.

#### 2.3.1 Training on Unbalanced Data<a name="t1"></a>

In this first attempt, the unbalance seen in the classes will not be treated.

In [None]:
# Defining machine learning pipeline:
gbc_ml_pipe = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', GradientBoostingClassifier(learning_rate = 0.1,
                                       n_estimators = 150,
                                       random_state = 301))
])

# Setting parameters to be tested.
# 'sqrt' replaces the former 'auto' option: for GradientBoostingClassifier
# 'auto' meant sqrt(n_features), and it was deprecated and later removed from
# scikit-learn, so 'sqrt' keeps the same search while running on current versions.
params = {'clf__min_samples_split': [2, 4],
          'clf__max_depth': [3, 5],
          'clf__max_features': [None, 'sqrt']
}

# Grid search + ML pipeline:
gbc_clf = GridSearchCV(gbc_ml_pipe, param_grid = params, scoring = 'roc_auc', verbose = 2)

# Training model:
gbc_clf.fit(mailout_train, y)

In [None]:
# Best parameter combination found by the search, then its best score
# (one per line):
print(gbc_clf.best_params_,
      'Best ROC_AUC score: {:.2f}'.format(gbc_clf.best_score_),
      sep = '\n')

In [None]:
# Saving model. The `with` block guarantees the file handle is flushed and
# closed, which the former bare open() call never did:
filename = 'gbc_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(gbc_clf, model_file)

#### 2.3.2 Training on Balanced Data<a name="t2"></a>

In this second attempt, the `SMOTE` technique will be included in the machine learning pipeline.

The purpose is to see if the `roc_auc` score increases, once the unbalance is treated.

In [None]:
# Machine learning pipeline (imblearn flavour) with a SMOTE step placed between
# the preprocessing and the classifier:
gbc_smote_ml_pipe = Pipeline_imb([
    ('preprocessing', preproc),
    ('smote', SMOTE(random_state = 501)),
    ('clf', GradientBoostingClassifier(random_state = 501,
                                       n_estimators = 150,
                                       learning_rate = 0.1)),
])

# Grid to be explored: SMOTE sampling strategy and classifier settings.
params_smote = {
    'smote__sampling_strategy': [0.3, 0.5, 1.0],
    'clf__min_samples_split': [2, 4],
    'clf__max_depth': [3],
    'clf__max_features': [None],
}

# Grid search wrapped around the pipeline, scored by ROC AUC:
gbc_smote_clf = GridSearchCV(estimator = gbc_smote_ml_pipe,
                             param_grid = params_smote,
                             scoring = 'roc_auc',
                             verbose = 2)

# Fitting on the train set:
gbc_smote_clf.fit(mailout_train, y)

In [None]:
# Best parameter combination found by the search, then its best score
# (one per line):
print(gbc_smote_clf.best_params_,
      'Best ROC_AUC score: {:.2f}'.format(gbc_smote_clf.best_score_),
      sep = '\n')


In [None]:
# Saving model. The SMOTE search object is the one written to this file
# (previously gbc_clf, the model from the first attempt, was dumped here by
# mistake). The `with` block guarantees the file handle is closed:
filename = 'gbc_smote_clf.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(gbc_smote_clf, model_file)

This strategy to deal with the class unbalance didn't result in a better score. Because of that, in the next attempts, unbalance will not be treated.

#### 2.3.3 Using Information Level and PCA Transformation<a name="t3"></a>

In this third attempt, an approach similar to the one used during the cluster analysis will be followed. In this case, data will be treated differently considering not only the columns' dtypes, but also the information level the columns belong to.

As an example, `person` information level will be split into:
* numerical features;
* categorical features;
* binary features.

In the most generic levels of information, like `microcell` and `macrocell`, dimensionality reduction will be applied: `PCA` for numerical features, and `TruncatedSVD` for sparse matrices (categorical columns after the one-hot encoding process).

This way, when applied, the dimensionality reduction will result in components representing one single level of information.

In [None]:
# Updating info_level dictionary.
# Every column list is copied individually: a plain info_level.copy() is shallow,
# so the lists would still be shared with info_level and the additions below
# would also modify the original (piling up duplicates on each re-run of this cell).
new_info_level = {level: list(level_cols) for level, level_cols in info_level.items()}

# Adding columns:
# Person:
new_info_level['person'].extend(['AGER_TYP', 'GEBURTSJAHR'])

# Household:
new_info_level['household'].extend([
    'D19_GESAMT_ANZ_24',
    'D19_GESAMT_DATUM',
    'D19_GESAMT_OFFLINE_DATUM',
    'D19_GESAMT_ONLINE_DATUM',
    'D19_KONSUMTYP',
    'D19_KONSUMTYP_MAX',
    'D19_SONSTIGE',
    'D19_SOZIALES',
    'D19_VERSAND_DATUM',
    'D19_VERSAND_OFFLINE_DATUM',
    'D19_VOLLSORTIMENT',
])

# Macrocell:
new_info_level['macrocell'].append('EXTSEL992')

In [None]:
# Dividing Person features into numerical, categorical and binary:
pers_num_features = list()

pers_cat_features = list()

pers_bin_features = list()

# Adding columns to lists:
for pers_col in new_info_level['person']:
    try:
        dtype = new_dtypes_dict[pers_col]
    except KeyError:
        # Only a missing key triggers the fallback dictionary; any other error
        # (including a keyboard interrupt) is no longer swallowed.
        dtype = new_feat_dtypes_dict[pers_col]

    # Anything that is neither 'num' nor 'cat' is treated as binary:
    if dtype == 'num':
        pers_num_features.append(pers_col)
    elif dtype == 'cat':
        pers_cat_features.append(pers_col)
    else:
        pers_bin_features.append(pers_col)

In [None]:
# Dividing Household features into numerical, categorical and binary:
hh_num_features = list()

hh_cat_features = list()

hh_bin_features = list()

# Adding columns to lists:
for hh_col in new_info_level['household']:
    try:
        dtype = new_dtypes_dict[hh_col]
    except KeyError:
        # Only a missing key triggers the fallback dictionary; any other error
        # (including a keyboard interrupt) is no longer swallowed.
        dtype = new_feat_dtypes_dict[hh_col]

    # Anything that is neither 'num' nor 'cat' is treated as binary:
    if dtype == 'num':
        hh_num_features.append(hh_col)
    elif dtype == 'cat':
        hh_cat_features.append(hh_col)
    else:
        hh_bin_features.append(hh_col)

In [None]:
# Dividing Microcell features into numerical, categorical and binary:
mic_num_features = list()

mic_cat_features = list()

mic_bin_features = list()

# Adding columns to lists:
for mic_col in new_info_level['microcell']:
    try:
        dtype = new_dtypes_dict[mic_col]
    except KeyError:
        # Only a missing key triggers the fallback dictionary; any other error
        # (including a keyboard interrupt) is no longer swallowed.
        dtype = new_feat_dtypes_dict[mic_col]

    # Anything that is neither 'num' nor 'cat' is treated as binary:
    if dtype == 'num':
        mic_num_features.append(mic_col)
    elif dtype == 'cat':
        mic_cat_features.append(mic_col)
    else:
        mic_bin_features.append(mic_col)

In [None]:
# Dividing Macrocell features into numerical, categorical and binary:
mac_num_features = list()

mac_cat_features = list()

mac_bin_features = list()

# Adding columns to lists:
for mac_col in new_info_level['macrocell']:
    try:
        dtype = new_dtypes_dict[mac_col]
    except KeyError:
        # Only a missing key triggers the fallback dictionary; any other error
        # (including a keyboard interrupt) is no longer swallowed.
        dtype = new_feat_dtypes_dict[mac_col]

    # Anything that is neither 'num' nor 'cat' is treated as binary:
    if dtype == 'num':
        mac_num_features.append(mac_col)
    elif dtype == 'cat':
        mac_cat_features.append(mac_col)
    else:
        mac_bin_features.append(mac_col)

In [None]:
# Dividing Community features into numerical, categorical and binary:
com_num_features = list()

com_cat_features = list()

com_bin_features = list()

# Adding columns to lists:
for com_col in new_info_level['community']:
    try:
        dtype = new_dtypes_dict[com_col]
    except KeyError:
        # Only a missing key triggers the fallback dictionary; any other error
        # (including a keyboard interrupt) is no longer swallowed.
        dtype = new_feat_dtypes_dict[com_col]

    # Anything that is neither 'num' nor 'cat' is treated as binary:
    if dtype == 'num':
        com_num_features.append(com_col)
    elif dtype == 'cat':
        com_cat_features.append(com_col)
    else:
        com_bin_features.append(com_col)

In [None]:
# Transformers for the PERSON information level (nan treatment per dtype):

# pers_num_features: median imputation, then StandardScaler.
pers_num_transformer = Pipeline([
    ('pers_num_imputer', SimpleImputer(strategy = 'median')),
    ('pers_num_scaler', StandardScaler()),
])

# pers_cat_features: most frequent value imputation, one-hot encoding and a
# TruncatedSVD reduction to 9 components.
pers_cat_transformer = Pipeline([
    ('pers_cat_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('pers_cat_ohe', OneHotEncoder(handle_unknown = 'ignore')),
    ('pers_cat_pca', TruncatedSVD(n_components = 9, random_state = 701)),
])

# pers_bin_features: most frequent value imputation, then one-hot encoding.
pers_bin_transformer = Pipeline([
    ('pers_bin_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('pers_bin_ohe', OneHotEncoder(handle_unknown = 'ignore')),
])

In [None]:
# Transformers for the HOUSEHOLD information level:

# hh_num_features: median imputation, then StandardScaler.
hh_num_transformer = Pipeline([
    ('hh_num_imputer', SimpleImputer(strategy = 'median')),
    ('hh_num_scaler', StandardScaler()),
])

# hh_cat_features: most frequent value imputation, then one-hot encoding.
# NOTE(review): no transformer is defined here for hh_bin_features - confirm
# that list is expected to be empty or intentionally left out.
hh_cat_transformer = Pipeline([
    ('hh_cat_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('hh_cat_ohe', OneHotEncoder(handle_unknown = 'ignore')),
])

In [None]:
# Transformers for the MICROCELL information level (all with dimensionality reduction):

# mic_num_features: median imputation, StandardScaler and PCA with 6 components.
mic_num_transformer = Pipeline([
    ('mic_num_imputer', SimpleImputer(strategy = 'median')),
    ('mic_num_scaler', StandardScaler()),
    ('mic_num_pca', PCA(n_components = 6, random_state = 702)),
])

# mic_cat_features: most frequent value imputation, one-hot encoding and
# TruncatedSVD with 2 components.
mic_cat_transformer = Pipeline([
    ('mic_cat_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('mic_cat_ohe', OneHotEncoder(handle_unknown = 'ignore')),
    ('mic_cat_pca', TruncatedSVD(n_components = 2, random_state = 703)),
])

# mic_bin_features: same treatment as the categorical ones, with its own seed.
mic_bin_transformer = Pipeline([
    ('mic_bin_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('mic_bin_ohe', OneHotEncoder(handle_unknown = 'ignore')),
    ('mic_bin_pca', TruncatedSVD(n_components = 2, random_state = 704)),
])

In [None]:
# Transformers for the MACROCELL information level:

# mac_num_features: median imputation, StandardScaler and PCA with 8 components.
mac_num_transformer = Pipeline([
    ('mac_num_imputer', SimpleImputer(strategy = 'median')),
    ('mac_num_scaler', StandardScaler()),
    ('mac_num_pca', PCA(n_components = 8, random_state = 705)),
])

# mac_cat_features: most frequent value imputation, one-hot encoding and
# TruncatedSVD with 2 components.
mac_cat_transformer = Pipeline([
    ('mac_cat_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('mac_cat_ohe', OneHotEncoder(handle_unknown = 'ignore')),
    ('mac_cat_pca', TruncatedSVD(n_components = 2, random_state = 706)),
])

# mac_bin_features: most frequent value imputation, then one-hot encoding
# (no dimensionality reduction).
mac_bin_transformer = Pipeline([
    ('mac_bin_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('mac_bin_ohe', OneHotEncoder(handle_unknown = 'ignore')),
])

In [None]:
# Transformer for the COMMUNITY information level:

# com_num_features: median imputation, then StandardScaler.
# NOTE(review): only numerical community features get a transformer here -
# confirm com_cat_features / com_bin_features are expected to be empty.
com_num_transformer = Pipeline([
    ('com_num_imputer', SimpleImputer(strategy = 'median')),
    ('com_num_scaler', StandardScaler()),
])

In [None]:
# Bundling every information-level transformer with the columns it handles
# (entries are (name, transformer, columns) triples):
preproc_pca = ColumnTransformer([
    # Person:
    ('pers_num', pers_num_transformer, pers_num_features),
    ('pers_cat', pers_cat_transformer, pers_cat_features),
    ('pers_bin', pers_bin_transformer, pers_bin_features),
    # Household:
    ('hh_num', hh_num_transformer, hh_num_features),
    ('hh_cat', hh_cat_transformer, hh_cat_features),
    # Microcell:
    ('mic_num', mic_num_transformer, mic_num_features),
    ('mic_cat', mic_cat_transformer, mic_cat_features),
    ('mic_bin', mic_bin_transformer, mic_bin_features),
    # Macrocell:
    ('mac_num', mac_num_transformer, mac_num_features),
    ('mac_cat', mac_cat_transformer, mac_cat_features),
    ('mac_bin', mac_bin_transformer, mac_bin_features),
    # Community:
    ('com_num', com_num_transformer, com_num_features),
])

In [None]:
# Flattening the feature lists used by preproc_pca into one list, keeping the
# same group order:
selected_columns = [
    feature
    for feature_group in (
        pers_num_features, pers_cat_features, pers_bin_features,
        hh_num_features, hh_cat_features,
        mic_num_features, mic_cat_features, mic_bin_features,
        mac_num_features, mac_cat_features, mac_bin_features,
        com_num_features,
    )
    for feature in feature_group
]

In [None]:
# Machine learning pipeline built on the information-level preprocessing:
gbc_pca_ml_pipe = Pipeline([
    ('preprocessing', preproc_pca),
    ('clf', GradientBoostingClassifier(random_state = 701,
                                       n_estimators = 150,
                                       learning_rate = 0.1)),
])

# Grid to be explored:
params_pca = {
    'clf__min_samples_split': [2, 4],
    'clf__max_depth': [3, 5],
    'clf__max_features': [None],
}

# Grid search wrapped around the pipeline, scored by ROC AUC:
gbc_pca_clf = GridSearchCV(estimator = gbc_pca_ml_pipe,
                           param_grid = params_pca,
                           scoring = 'roc_auc',
                           verbose = 2)

# Fitting on the selected columns only:
gbc_pca_clf.fit(mailout_train[selected_columns], y)

In [None]:
# Best parameter combination found by the search, then its best score
# (one per line):
print(gbc_pca_clf.best_params_,
      'Best ROC_AUC score: {:.2f}'.format(gbc_pca_clf.best_score_),
      sep = '\n')

In [None]:
# Saving model. The `with` block guarantees the file handle is flushed and
# closed, which the former bare open() call never did:
filename = 'gbc_pca_clf.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(gbc_pca_clf, model_file)

This approach resulted in a better score compared to the second approach, but still not as good as the first strategy.

#### 2.3.4 Using PCA Transformation<a name="t4"></a>

The fourth approach is a variation of the first one, but this time the dimensionality reduction will be applied to the data. Different from the third approach, information level will not be considered.

This way, the `PCA` algorithm will be applied within the machine learning pipeline, and the components will represent the whole data.

In [None]:
# Machine learning pipeline with a single PCA step applied to the whole
# preprocessed data, right before the classifier:
gbc_pca2_pipe = Pipeline([
    ('preprocessing', preproc),
    ('pca', PCA(random_state = 901)),
    ('clf', GradientBoostingClassifier(random_state = 301,
                                       n_estimators = 150,
                                       learning_rate = 0.1)),
])

# Grid to be explored: number of PCA components and classifier settings.
params_pca2 = {
    'pca__n_components': [70, 100, 150],
    'clf__min_samples_split': [2, 4],
    'clf__max_depth': [3],
    'clf__max_features': [None],
}

# Grid search wrapped around the pipeline, scored by ROC AUC:
gbc_pca2_clf = GridSearchCV(estimator = gbc_pca2_pipe,
                            param_grid = params_pca2,
                            scoring = 'roc_auc',
                            verbose = 2)

# Fitting on the train set:
gbc_pca2_clf.fit(mailout_train, y)

In [None]:
# Best parameter combination found by the search, then its best score
# (one per line):
print(gbc_pca2_clf.best_params_,
      'Best ROC_AUC score: {:.2f}'.format(gbc_pca2_clf.best_score_),
      sep = '\n')


In [None]:
# Saving model. The `with` block guarantees the file handle is flushed and
# closed, which the former bare open() call never did:
filename = 'gbc_pca2_clf.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(gbc_pca2_clf, model_file)

Considering the `roc_auc` metric, this strategy is the worst so far.

#### 2.3.5 XGBoost Classifier and Bayesian Optimization<a name="t5"></a>

This time, not only the algorithm will be changed, but also the parameter tuning approach will be changed.

Instead of the *Gradient Boosting Classifier*, the `XGBoost Classifier` will be trained on the data. The parameter tuning will be performed by the `BayesSearchCV` algorithm. Instead of simply testing all the parameter combinations, this algorithm tests different parameters, given a range of possible values.

Once it shows improvement, the algorithm 'explores' more deeply the areas that resulted in better performance.

In [None]:
# Defining machine learning pipeline:
xgbc_ml_pipe_bayes = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', XGBClassifier(random_state = 301))
])

# Setting parameters to be tested (ranges / choices explored by the optimizer):
bayes_search_space = {'clf__booster': Categorical(['gbtree', 'dart']),
                      'clf__learning_rate': Real(0.01, 0.3),
                      'clf__gamma': Integer(0, 100),
                      'clf__min_child_weight': Integer(0, 10),
                      'clf__reg_lambda': Integer(1, 100),
                      'clf__reg_alpha': Integer(0, 100),
                      'clf__tree_method': Categorical(['auto', 'hist']),
                      'clf__max_depth': Integer(2, 7)
}

# Defining function to display scores:
def show_score(optim_result):
    '''
    Callback that shows the best score so far during Bayesian Optimization.

    Input:
    optim_result: optimization result handed over by the search (not used;
                  the score is read from the xgbc_bayes_clf search object).

    Output:
    True once the best ROC_AUC score reaches 0.81, which asks the search to
    stop early; None otherwise.
    '''
    # Best score so far, read from the search object defined below:
    score = xgbc_bayes_clf.best_score_
    print('Best ROC_AUC Score:{}'.format(score))

    # Early stop:
    if score >= 0.81:
        print('At least 0.81 ROC_AUC score achieved!')

        return True

# Bayesian search + ML pipeline. random_state fixes the sequence of sampled
# parameter combinations, so the reported best score can be reproduced like the
# other (seeded) attempts in this notebook:
xgbc_bayes_clf = BayesSearchCV(xgbc_ml_pipe_bayes, bayes_search_space, scoring = 'roc_auc', cv = 5,
                               verbose = 2, random_state = 301)

# Training model:
xgbc_bayes_clf.fit(mailout_train, y, callback = show_score)

In [None]:
# Best parameter combination found by the search, then its best score
# (one per line):
print(xgbc_bayes_clf.best_params_,
      'Best ROC_AUC score: {:.2f}'.format(xgbc_bayes_clf.best_score_),
      sep = '\n')

In [None]:
# Saving model. The `with` block guarantees the file handle is flushed and
# closed, which the former bare open() call never did:
filename = 'xgbc_bayes_clf.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(xgbc_bayes_clf, model_file)

This strategy resulted in a score similar to the first one.

Since the first one was the best model so far, this new Bayesian Optimization approach will be applied to a few more algorithms.

In [None]:
# Saving model:
#filename = 'xgbc_bayes_model.pkl'
#pickle.dump(xgbc_bayes_clf, open(filename, 'wb'))

#### 2.3.6 LightGBM and Bayesian Optimization<a name="t6"></a>

`LightGBM Classifier` is similar to the *XGBoost Classifier*, but is considered faster. Besides that, it splits its trees leaf-wise, rather than depth or level-wise like most of the other similar algorithms.

In [None]:
# Defining machine learning pipeline:
lgbm_ml_pipe_bayes = Pipeline(steps = [
    ('preprocessing', preproc),
    ('clf', LGBMClassifier(random_state = 301))
])

# Setting parameters to be tested (ranges / choices explored by the optimizer):
lgbm_bayes_search_space = {'clf__boosting_type': Categorical(['gbdt', 'dart', 'goss']),
                           'clf__num_leaves': Integer(5, 100),
                           'clf__max_depth': Integer(2, 200),
                           'clf__learning_rate': Real(0.01, 0.5),
                           'clf__n_estimators': Integer(100, 500),
                           'clf__min_child_samples': Integer(10, 80),
                           'clf__reg_alpha': Integer(0, 100),
                           'clf__reg_lambda': Integer(0, 100)
}

# Defining function to display scores:
def lgbm_show_score(optim_result):
    '''
    Callback that shows the best score so far during Bayesian Optimization.

    Input:
    optim_result: optimization result handed over by the search (not used;
                  the score is read from the lgbm_bayes_clf search object).

    Output:
    True once the best ROC_AUC score reaches 0.81, which asks the search to
    stop early; None otherwise.
    '''
    # Best score so far, read from the search object defined below:
    score = lgbm_bayes_clf.best_score_
    print('\nBest ROC_AUC Score: {}.\n'.format(score))

    # Early stop:
    if score >= 0.81:
        print('At least 0.81 ROC_AUC score achieved!')

        return True

# Bayesian search + ML pipeline. random_state fixes the sequence of sampled
# parameter combinations, so the reported best score can be reproduced like the
# other (seeded) attempts in this notebook:
lgbm_bayes_clf = BayesSearchCV(lgbm_ml_pipe_bayes, lgbm_bayes_search_space, scoring = 'roc_auc', cv = 5,
                               verbose = 2, random_state = 301)

# Training model:
lgbm_bayes_clf.fit(mailout_train, y, callback = lgbm_show_score)

In [None]:
# Best parameter combination found by the search, then its best score
# (one per line):
print(lgbm_bayes_clf.best_params_,
      'Best ROC_AUC score: {:.2f}'.format(lgbm_bayes_clf.best_score_),
      sep = '\n')

In [None]:
# Saving model. The `with` block guarantees the file handle is flushed and
# closed, which the former bare open() call never did:
filename = 'lgbm_bayes_clf.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(lgbm_bayes_clf, model_file)

The *LightGBM Classifier* resulted in a slightly lower score in comparison to the last model, but their performances are comparable.

