In [6]:
from sklearn.pipeline import Pipeline
import pandas as pd 
import numpy as np
import os
import sys
sys.path.append('../src')

# Source code
from transforms import *
import experiment

# Global variables
MODELS_FOLDER = os.path.join('..', 'models')
PIPELINE_NAME = 'pipeline.pkl'
DATA_FOLDER = os.path.join('..', 'data')
DATA_FOLDER_RAW = os.path.join(DATA_FOLDER, 'raw')
DATA_NAME_RAW = 'winequality_90.csv'
SEED = 93849823
ACTION = 'load' # set "load" to just load the results or "train" to train them all again

# Warnings off:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Set the random state seed for reproducibility
np.random.seed(SEED)

import joblib

# Load dataset


In [7]:
data = pd.read_csv(os.path.join(DATA_FOLDER_RAW, DATA_NAME_RAW))
data.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,Red,8.2,0.635,0.1,2.1,0.073,25.0,60.0,0.99638,3.29,0.75,10.9,6
1,White,5.7,0.1,0.27,1.3,0.047,21.0,100.0,0.9928,3.27,0.46,9.5,5
2,White,6.9,0.28,0.24,2.1,0.034,49.0,121.0,0.98882,2.98,0.43,13.2,7
3,White,5.8,0.36,0.38,0.9,0.037,3.0,75.0,0.9904,3.28,0.34,11.4,4
4,White,7.4,0.2,0.36,1.2,0.038,44.0,111.0,0.9926,3.36,0.34,9.9,6


We can build the pipeline from scratch by using the classes in `transforms.py`. However, we have already done that, and we encourage you to use `pipeline.pkl`, which was built with the `sklearn` `Pipeline` class.

In [8]:
pipeline = joblib.load(os.path.join(MODELS_FOLDER, PIPELINE_NAME))
pipeline.steps

[('cleaner', DataCleaning()),
 ('remover', RemoveFeatures(features='type')),
 ('scaler', FeatureScaling(type='std')),
 ('droper', <transforms.DropNaN at 0x7fe6fae6d278>)]

This is how to use the pipeline

In [15]:
labels = GetLables().fit_transform(data)
labels

array([3, 2, 4, ..., 4, 3, 3])

In [16]:
features = data.drop('quality', axis=1)
features.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,Red,8.2,0.635,0.1,2.1,0.073,25.0,60.0,0.99638,3.29,0.75,10.9
1,White,5.7,0.1,0.27,1.3,0.047,21.0,100.0,0.9928,3.27,0.46,9.5
2,White,6.9,0.28,0.24,2.1,0.034,49.0,121.0,0.98882,2.98,0.43,13.2
3,White,5.8,0.36,0.38,0.9,0.037,3.0,75.0,0.9904,3.28,0.34,11.4
4,White,7.4,0.2,0.36,1.2,0.038,44.0,111.0,0.9926,3.36,0.34,9.9


In [29]:
features_transformed = pipeline.fit_transform(features)
features_transformed.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.753317,1.794928,-1.511482,-0.701777,0.474572,-0.311825,-0.986257,-0.092347,0.440301,1.454997,0.34738
1,-1.168491,-1.453357,-0.336296,-0.869575,-0.258709,-0.536331,-0.27984,-0.092822,0.315747,-0.474754,-0.831347
2,-0.246023,-0.360476,-0.543682,-0.701777,-0.625349,1.035214,0.091029,-0.093351,-1.490295,-0.674383,2.283859
3,-1.091618,0.125249,0.424119,-0.953474,-0.54074,-1.546611,-0.721351,-0.093141,0.378024,-1.273271,0.768353
4,0.138338,-0.846201,0.285862,-0.89055,-0.512537,0.754581,-0.085575,-0.092849,0.876242,-1.273271,-0.494568


In [39]:
features_transformed.index.values

array([   0,    1,    2, ..., 5844, 5845, 5846])

In [44]:
labels = labels[features_transformed.index.values]
labels.shape

(5813,)

# Experiment 01:
- All classifiers


In [45]:
def do_experiment(results_name, folder, features, labels, n_classes=2, action='load', classifiers=None, kfold=10):
    """Train and evaluate a set of classifiers, or load previously saved results.

    Parameters
    ----------
    results_name : str
        File name of the serialized results object inside ``folder``.
    folder : str
        Folder the results are exported to (``action='train'``) or read
        from (``action='load'``).
    features, labels
        Forwarded unchanged to ``experiment.run_classifiers``. Ignored when
        ``action='load'``.
    n_classes : int
        Forwarded to ``experiment.results_clf``.
    action : {'load', 'train'}
        ``'train'`` runs the classifiers, exports and saves the results;
        ``'load'`` only reads the saved results object.
    classifiers : dict, optional
        Forwarded to ``experiment.run_classifiers``. Defaults to an empty dict.
    kfold : int
        Forwarded to ``experiment.run_classifiers`` as its last argument.

    Returns
    -------
    dict
        With keys ``'train'`` and ``'test'`` when training; when loading,
        whatever object ``joblib.load`` returns for the saved file.

    Raises
    ------
    ValueError
        If ``action`` is neither ``'train'`` nor ``'load'``.
    """
    print(action)
    if action not in ('train', 'load'):
        # Fail loudly instead of silently returning None.
        raise ValueError("action must be 'train' or 'load', got {!r}".format(action))

    name = os.path.join(folder, results_name)

    if action == 'load':
        return joblib.load('{}'.format(name))

    print(folder)
    # Avoid sharing one mutable default dict between calls.
    if classifiers is None:
        classifiers = {}

    # Train all clf
    ### Stratified cross-validation for model selection will be used.
    # Use the `kfold` argument of this function rather than a notebook global.
    clf_outputs = experiment.run_classifiers(features, labels, classifiers, kfold)

    ## Performance assessment
    results = {
        split: experiment.results_clf(n_classes, clf_outputs[split]['true'], clf_outputs[split]['pred'])
        for split in ('train', 'test')
    }

    ## Save results
    experiment.export_results(results['test'], 'test', foldertree=folder)
    experiment.export_results(results['train'], 'train', foldertree=folder)
    joblib.dump(results, name)

    return results

In [46]:
# Experiment 01 settings
results_obj_name = 'results-01-clf-all.pkl'
results_folder = os.path.join('Results-analyses-3', '01-all-clf')
kfolds = 10
n_classes = 2

# Classifier classes used in this experiment
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# No hyperparameter tuning for now: apart from the two MLPs below,
# every estimator is created with its default settings.
knn_clf = KNeighborsClassifier()
random_forest_clf = RandomForestClassifier()
naive_bayes_clf = GaussianNB()
gaussian_linear_clf = LinearDiscriminantAnalysis()
gaussian_quadratic_clf = QuadraticDiscriminantAnalysis()
perceptron_clf = Perceptron()
sgd_clf = SGDClassifier()

# Two MLPs that differ only in the solver
mlp_clf = MLPClassifier(
    solver='adam',
    learning_rate='adaptive',
    max_iter=1300,
    learning_rate_init=5e-04,
    tol=1e-4,
)
mlp_clf_2 = MLPClassifier(
    solver='lbfgs',
    learning_rate='adaptive',
    max_iter=1300,
    learning_rate_init=5e-04,
    tol=1e-4,
)

# Soft-voting ensemble of the two MLPs and the naive Bayes classifier
eclf = VotingClassifier(
    estimators=[
        ('mlp', mlp_clf),
        ('mlp-2', mlp_clf_2),
        ('naive-bayes', naive_bayes_clf),
    ],
    voting='soft',
    weights=[0.75, 0.75, 1.4],
)

# Name -> estimator mapping handed to do_experiment
classifiers = {
    'KNN': knn_clf,
    'RF': random_forest_clf,
    'Naive_bayes': naive_bayes_clf,
    'Gaussian_linear': gaussian_linear_clf,
    'Gaussian_quadratic': gaussian_quadratic_clf,
    'Perceptron': perceptron_clf,
    'SGDClassifier': sgd_clf,
    'MLP': mlp_clf,
    'Ensemble': eclf,
}

In [None]:
results_exp_01 = do_experiment(results_obj_name, results_folder, features=features_transformed.values, labels=labels.ravel(), 
                               classifiers=classifiers, n_classes=np.unique(labels).shape[0], action='train')

train
Results-analyses-3/01-all-clf




## The results from this part:

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def see_results(results):
    """Collect the average test metrics of every classifier into one table.

    Parameters
    ----------
    results : dict
        Results object with a ``'test'`` entry that maps each classifier name
        to a dict holding an ``'average'`` dict with the keys ``'acc'``,
        ``'recall'`` and ``'precision'``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier (in the order of ``results['test']``) and the
        columns ``'acc'``, ``'recall'`` and ``'precision'``.
    """
    metrics = ['acc', 'recall', 'precision']
    test_results = results['test']
    result_df = pd.DataFrame(columns=metrics, index=list(test_results))

    for clf, clf_results in test_results.items():
        for metric in metrics:
            # Single .loc[row, col] write: chained `df.loc[row][col] = ...`
            # may assign to a temporary copy and leave the frame untouched.
            result_df.loc[clf, metric] = clf_results['average'][metric]

    return result_df

In [None]:
result_df_clf_all = see_results(results_exp_01)
result_df_clf_all

In [None]:
# plt.subplots returns (figure, axes) in that order; draw on the explicit axes.
fig, ax = plt.subplots(figsize=(15,6))
sns.barplot(y='acc', x=result_df_clf_all.index, data=result_df_clf_all, ax=ax)
# Zoom the y-axis on the 0.967 baseline so small differences stay visible.
ax.set_ylim(0.967, 0.99)
ax.set_title('Average ACC (%)')
plt.show()

In [None]:
fig, axes = plt.subplots(2,1,figsize=(15,10))

# Set each title on its own axes: plt.title() targets the *current* axes,
# which would put both titles on the same panel.
sns.barplot(y='recall', x=result_df_clf_all.index, data=result_df_clf_all, ax = axes[0])
axes[0].set_ylim(0.967, 0.99)
axes[0].set_title('Average Recall (%)')

sns.barplot(y='precision', x=result_df_clf_all.index, data=result_df_clf_all, ax = axes[1])
axes[1].set_ylim(0.967, 0.99)
axes[1].set_title('Average precision (%)')

plt.show()

Setting the baseline at 0.967, we can exclude every model whose accuracy fell below it. Therefore, only the Random Forest (RF) and the MLP performed properly, with the RF outperforming the MLP by about 1% on average.

# Experiment 02 - Testing other models:
- Boosted ensembles built with the AdaBoost algorithm;
- No feature selection employed;

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Definitions
results_obj_name = 'results-02-clf-adaboost.pkl'
results_folder = os.path.join('Results-analyses-3', '02-Adaboost')
kfolds = 10
n_classes = 2

# CLF definitions
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)

bnn = AdaBoostClassifier(SGDClassifier(),
                         algorithm="SAMME",
                         n_estimators=50)

classifiers_adabost = {'BDT': bdt, 'BNN': bnn}

In [None]:
# `labels` is a NumPy array aligned with the rows of `features_transformed`,
# so pass the transformed features and flatten the labels with np.ravel
# (an ndarray has no `.values` attribute).
# NOTE(review): Experiment 01 passes n_classes=np.unique(labels).shape[0];
# confirm that the fixed n_classes used here is intended.
results_exp_02 = do_experiment(results_obj_name, results_folder, features=features_transformed.values, labels=np.ravel(labels), 
                               classifiers=classifiers_adabost, n_classes=n_classes, action='load')

In [None]:
result_df_bagged = see_results(results_exp_02)
result_df_bagged

In [None]:
# plt.subplots returns (figure, axes) in that order; draw on the explicit axes.
fig, ax = plt.subplots(figsize=(15,6))
sns.barplot(y='acc', x=result_df_bagged.index, data=result_df_bagged, ax=ax)
# Same y-range as the Experiment 01 accuracy plot, for direct comparison.
ax.set_ylim(0.967, 0.99)
ax.set_title('Average ACC (%)')
plt.show()

In [None]:
fig, axes = plt.subplots(2,1,figsize=(15,10))

# Set each title on its own axes: plt.title() targets the *current* axes,
# which would put both titles on the same panel.
sns.barplot(y='recall', x=result_df_bagged.index, data=result_df_bagged, ax = axes[0])
axes[0].set_ylim(0.967, 0.99)
axes[0].set_title('Average Recall (%)')

sns.barplot(y='precision', x=result_df_bagged.index, data=result_df_bagged, ax = axes[1])
axes[1].set_ylim(0.967, 0.99)
axes[1].set_title('Average precision (%)')

plt.show()

The boosted decision trees (BDT) performed almost as well as the RF, so it will not be necessary to use them from now on. The boosted linear classifiers (BNN) will not be used either.

# Experiment 03:
- The "best" classifiers up until now (RF and  MLP)
- Feature selection in the pipeline

In [None]:
# NOTE(review): `pipeline_02` and `info_data` are not defined in any of the
# cells visible above; this cell fails on a fresh kernel unless they are
# created elsewhere. TODO: define or load them before this point.
features_selected = pipeline_02.fit_transform(info_data)
features_selected.head()

Only 14 features:

In [None]:
# Definitions
results_obj_name = 'results-03-clf-feat_selected.pkl'
results_folder = os.path.join('Results-analyses-3', '03-Feature-selected')
kfolds = 10
n_classes = 2

# CLF definitions as  the same above in the Expr 01
classifiers_feat_selected = {'MLP': mlp_clf, 'RF': random_forest_clf}

In [None]:
# np.ravel flattens `labels` whether it is a NumPy array (as built earlier in
# this notebook) or a pandas object; `labels.values` fails on an ndarray.
results_exp_03 = do_experiment(results_obj_name, results_folder, features=features_selected.values, labels=np.ravel(labels), 
                               classifiers=classifiers_feat_selected, n_classes=n_classes, action='load')

In [None]:
result_df_feat_selected = see_results(results_exp_03)
result_df_feat_selected

In [None]:
results_comparison = pd.DataFrame([result_df_clf_all.loc['RF']['acc'], result_df_feat_selected.loc['RF']['acc']],
                                    index=['47 features', '14 features'], columns=['RF'])

results_comparison['MLP'] = [result_df_clf_all.loc['MLP']['acc'], result_df_feat_selected.loc['MLP']['acc']]
results_comparison

In [None]:
sns.lineplot(data=results_comparison)

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=20):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        Square matrix of integer counts, e.g. the array returned by
        sklearn.metrics.confusion_matrix.
    class_names: list
        Class names, in the same order as the rows/columns of the matrix.
    figsize: tuple
        (width, height) of the created figure. Defaults to (10,7).
    fontsize: int
        Font size of the tick labels on both axes. Defaults to 20.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the heatmap.

    Raises
    ------
    ValueError
        If the heatmap cannot be drawn with the integer format "d".
    """
    cm_frame = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    figure = plt.figure(figsize=figsize)

    try:
        cm_axes = sns.heatmap(cm_frame, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")

    # y tick labels stay horizontal, x tick labels are tilted by 45 degrees.
    for axis, angle in ((cm_axes.yaxis, 0), (cm_axes.xaxis, 45)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right', fontsize=fontsize)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure

In [None]:
fig = print_confusion_matrix(results_exp_03['test']['MLP']['confMat'][9], class_names=np.array(['Assinante', 'Cancelou']))
plt.title('MLP confusion matrix')

In [None]:
fig = print_confusion_matrix(results_exp_03['test']['RF']['confMat'][9], class_names=np.array(['Assinante', 'Cancelou']))
plt.title('RF confusion matrix')

There has been a slight improvement of about 0.2% by using only 14 features, so it does no harm to use only those 14 features. With this setup, the RF and the MLP seem to be the most suitable classifiers. We now want to test them on a larger part of the database and see how they perform. Let's train them on this whole chunk of the database and test them on the larger one, made of the files:
- `user-status-after_chunk_30.csv` 
- `weekly-infos-before_chunk_30.csv`

# Load the other part of the database

In [None]:
USER_DATA = 'user-status-after_chunk_30.csv'
INFO_DATA = 'weekly-infos-before_chunk_30.csv'

info_data_30 = pd.read_csv(os.path.join(DATA_FOLDER, INFO_DATA), index_col=0)
user_data_30 = pd.read_csv(os.path.join(DATA_FOLDER, USER_DATA), index_col=0)

In [None]:
info_data_30.shape

In [None]:
user_data_30.shape

It is about 1M entries and 140k users.

# Transform with the pipeline 02
**WARNING**: ONLY TRANSFORM. DO NOT CALL THE *FIT_TRANSFORM* METHOD HERE, SINCE REFITTING ON THIS DATA WOULD BIAS THE FEATURE SCALING.

In [None]:
features_30 = pipeline_02.transform(info_data_30)
features_30.head()

In [None]:
labels_30 = GetLables().transform(user_data_30, features_30)
labels_30['status'] = labels_30['status'].map({'assinante':0, 'cancelou': 1})

It is about 120k users.

# Train the classifiers on the 6% chunk of the data and test on the 30% chunk

In [None]:
# np.ravel flattens `labels` whether it is a NumPy array (as built earlier in
# this notebook) or a pandas object; `labels.values` fails on an ndarray.
y_train = np.ravel(labels)
mlp_clf.fit(X=features_selected.values, y=y_train)
random_forest_clf.fit(X=features_selected.values, y=y_train)

In [None]:
y_pred_mlp = mlp_clf.predict(features_30.values)
y_pred_rf = random_forest_clf.predict(features_30.values)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

target_names = ['Assinante', 'Cancelou']
print('RF:')
print(classification_report(labels_30.values.ravel(), y_pred_rf, target_names=target_names))

print('MLP:')
print(classification_report(labels_30.values.ravel(), y_pred_mlp, target_names=target_names))

In [None]:
print('RF confusion matrix:')
print(confusion_matrix(labels_30.values.ravel(), y_pred_rf))

In [None]:
print('MLP confusion matrix:')
print(confusion_matrix(labels_30.values.ravel(), y_pred_mlp))

That seems weird!
Despite behaving similarly in the cross-validation run on only 6% of the data, the Random Forest's performance starts to decay when it is evaluated on 30% of the data, especially on the class "**Assinante**". RF is extremely susceptible to overfitting, and that might be the reason. From now on we will employ the MLP as our main classifier for this problem.

When working with neural networks there is always the question: *how do we properly choose the best hyperparameters?*

# Hyperparameter tuning

In [None]:
# MLP:
mlp_clf = MLPClassifier(solver='adam', learning_rate='adaptive', max_iter=1300, learning_rate_init=5e-04, tol=1e-4)

In [None]:
param_dist_dict = {'MLP': {"hidden_layer_sizes": list(np.arange(2,1001))}
                  }

classifiers = {'MLP': mlp_clf}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

random_search = dict((k,[]) for k in classifiers.keys())

In [None]:
results_obj_name = 'results-04-mlp-random-search.pkl'
# NOTE(review): this is the same folder as Experiment 02 — confirm that the
# random-search results are really meant to be stored there.
results_folder = os.path.join('Results-analyses-3', '02-Adaboost')
# Single path used for both saving and loading, so a 'train' run can be
# re-read by a later 'load' run.
results_path = os.path.join(results_folder, results_obj_name)

if ACTION == 'train':
    for clf in param_dist_dict.keys():
        random_search[clf] = RandomizedSearchCV(classifiers[clf], param_dist_dict[clf], cv=10, n_iter=50, verbose=5, scoring='precision')
        # np.ravel works for both NumPy and pandas labels (`.values` fails on an ndarray).
        random_search[clf].fit(features_selected.values, y=np.ravel(labels))
        # Saved after every classifier so finished searches survive an interruption.
        joblib.dump(random_search, results_path)
elif os.path.exists(results_path):
    random_search = joblib.load(results_path)
else:
    # Fall back to the legacy file name this cell used to load from.
    random_search = joblib.load('random-search.pkl')

In [None]:
y_pred_best_mlp = random_search['MLP'].best_estimator_.predict(features_30.values)

In [None]:
print(classification_report(labels_30.values.ravel(), y_pred_best_mlp, target_names=target_names))

In [None]:
random_search['MLP'].best_estimator_

It seems it has improved a bit, with 487 neurons, however we are not convinced. We will keep using the default config of 100 neurons for proper evaluation.

In [None]:
accuracy_score(labels_30.values.ravel(), y_pred_best_mlp)

In [None]:
confusion_matrix(labels_30.values.ravel(), y_pred_best_mlp)

In [None]:
labels_30.shape

In [None]:
features_30.shape

# Summary:

- In respect to the data preprocessing etc.

A whole pipeline is built with `transforms.py`. Data cleaning, categorical preprocessing and numerical preprocessing are employed. Two pipelines are tested against the classifiers: (i) one with **Feature Selection** and (ii) one without it.

- In respect to the classifiers:

Different classifiers are evaluated on the first chunk of data, the one that contains 6%. The two most promising, the Random Forest and the Multi-layer Perceptron, are chosen to be trained on the whole 6% chunk and tested against the one with 30%. The RF performed worse on the "Assinante" class, indicating overfitting. We proceed with the experiments using the MLP, which is the one chosen to design the alpha version of the model.

- The proposed pipeline and model:

The feature map is processed and reduced to 14 features, and the MLPs with 100 and 487 neurons are the ones to be used in the next step of the analysis.