# Prediction of Wine Quality Classification using Machine Learning Algorithms.

# 1. Import all the required libraries and wine dataset

In [None]:
# Regular exploratory data analysis and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# to make the plots appear inside the notebook
%matplotlib inline 

# Models adopted for coursework from Scikit-Learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek
from sklearn.base import BaseEstimator
import datetime
import time

In [None]:
# Loading the white wine dataset
winedata = pd.read_csv("winequality-white.csv", sep = ';')
print ("wine shape", winedata.shape)

In [None]:
print("////// wine data describe ////////")
print(winedata.info())

# 2. Exploratory Data Analysis (EDA)

In [None]:
print("wine dataset shape - ")
print(winedata.shape)

In [None]:
winedata.describe()

In [None]:
#Check if any dataset has null feature or label value
winedata.isna().sum()
# No feature with null value found

In [None]:
print("- histogram plot of the attributes distribution -")
winedata.hist(figsize=(12,8),bins=20)
plt.show()

In [None]:
# Analysis of the correlation among all the attributes
corr_matrix = winedata.corr()

# Create the figure/axes explicitly and draw the heatmap onto them.
# The former `bottom, top = ax.get_ylim()` line was dead code: the limits
# were read but never used, so it has been dropped.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix,
            annot=True,
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu",
            ax=ax);

In [None]:
#Checking for count of available data for each class
winedata["quality"].value_counts()

In [None]:
#plotting the class imbalances
sns.countplot(x=winedata["quality"]);

# 3. Data Preparation

### A. Creating 2 classes for wine quality from wine quality score

Recategorizing the wine quality unbalanced ordinal class into an unbalanced binary class

1. Class 0 (Normal quality wine) : Quality score from 0 to 6
2. Class 1 (High quality wine) : Quality score from 7 to 10


In [None]:
def isQualitylevel(quality):
    """Map a wine quality score to a binary class label.

    Returns 1 (high quality) when the score is 7 or above, otherwise 0
    (normal quality).
    """
    return 1 if quality >= 7 else 0

winedata["quality"] = winedata["quality"].apply(isQualitylevel)

In [None]:
winedata["quality"].value_counts()

In [None]:
#plotting the class imbalances
sns.countplot(x=winedata["quality"]);

### B. Split data into X and y (features and target)

In [None]:
# Split data into X and y (features and target)
X = winedata.drop("quality", axis=1)
y = winedata["quality"]

### C. Split X and y into train and test sets

In [None]:
# Split data into train and test sets
np.random.seed(22)

X_pretrain, X_pretest, y_pretrain, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
y_pretrain.value_counts(), y_pretrain.value_counts().sum(), y_test.value_counts(), y_test.value_counts().sum()

### D. Feature Scaling - Standardization of the X_pretrain and X_pretest data

In [None]:
# Standardize the features: the scaler learns its statistics from the
# training split only and is then applied unchanged to the test split.
scaler = StandardScaler()

# Wrap the scaled arrays back into DataFrames so the column names are kept.
X_pretrainscaled = pd.DataFrame(scaler.fit_transform(X_pretrain),
                                columns=X_pretrain.columns)
X_pretestscaled = pd.DataFrame(scaler.transform(X_pretest),
                               columns=X_pretest.columns)


In [None]:
# Comparison of the data distribution before scaling and after scaling

# Features to plot, in the same order as the original one-call-per-feature code.
feature_names = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 4))

# Left panel: raw training features; right panel: standardized training features.
for axis, frame, heading in (
    (ax1, X_pretrain, 'Before Scaling-X_pretrain'),
    (ax2, X_pretrainscaled, 'After Scaling-X_pretrain'),
):
    axis.set_title(heading)
    for feature in feature_names:
        sns.kdeplot(frame[feature], ax=axis)

plt.show()

### E. Data imbalance correction - SMOTETomek (SMOTE + Tomek links)

In [None]:
# Imbalanced y_pretrain data (before resampling)
sns.countplot(x=y_pretrain);

In [None]:
# Resolving data imbalance on the training dataset using SMOTETomek (SMOTE oversampling + Tomek-link cleaning)
# TODO: the original note here said to remove this approach; it is still in use below

smote_tomek = SMOTETomek()
X_train, y_train = smote_tomek.fit_resample(X_pretrainscaled, y_pretrain)

In [None]:
# Balanced y_train data (after resampling)
sns.countplot(x=y_train);

In [None]:
X_train["chlorides"].value_counts().sum(), y_train.value_counts().sum()

In [None]:
# Renaming X_pretestscaled to X_test for naming uniformity
X_test = X_pretestscaled


# 4. Modeling with default chosen ML estimators

In [None]:
# Creating a dictionary for the machine learning models

models = {"Logistic Regression" : LogisticRegression(), 
          "KNN" : KNeighborsClassifier(),
          "Decision Tree" : DecisionTreeClassifier()}

# Create a function to fit and score models

def model_fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit every model on the training data and score it on the test data.

    models : dict mapping a display name to an estimator exposing
             fit(X, y) and score(X, y)
    X_train, y_train : data each estimator is fitted on
    X_test, y_test : data each estimator is scored on

    Returns a dict mapping each model name to the value returned by
    its score() call, in the same order as `models`.
    The estimators in `models` are fitted in place.
    """
    # Seed NumPy's global RNG before any fitting takes place
    np.random.seed(22)

    def _fit_then_score(estimator):
        # Fit first, then evaluate on the held-out data
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator)
            for label, estimator in models.items()}

In [None]:
model_scores = model_fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
model_scores

## Model Comparison - Initial Model Accuracy comparison

In [None]:
model_compare = pd.DataFrame(model_scores, index=["accuracy"])
model_compare.T.plot.bar();

# 5. Model Tuning with GridSearchCV

## K-Nearest Neighbor model

In [None]:
# Different KNeighborsClassifier hyperparameters
# n_neighbors covers 1, 2 and 3.
# NOTE(review): range(1, 4, 4) yields only the single value 1, so leaf_size
# is effectively fixed rather than searched - confirm this is intended.
knn_grid = {"n_neighbors": range(1, 4, 1),
              "weights": ["uniform", "distance"],
              "metric": ["euclidean", "manhattan", "minkowski"],
              "leaf_size": range(1, 4, 4)}

# Setup grid hyperparameter search for KNeighborsClassifier
# Each candidate is evaluated with 10 stratified folds repeated 3 times.
knn_g = KNeighborsClassifier()
# The splitter is given no random_state; presumably this global seed is what
# makes the repeated folds reproducible.
np.random.seed(22)
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3)
grid_knn = GridSearchCV(estimator=knn_g, param_grid=knn_grid, n_jobs=1, cv=cv, scoring="accuracy", error_score=0)
grid_knn_results = grid_knn.fit(X_train, y_train);

#tune for best model and capture time
# The timed window covers refitting with the best parameters and predicting
# on X_test; predict_proba runs after the timer has been read.
start = time.time()
# set_params is applied to knn_g itself, so knn_g (reused by the later
# cross-validation cell) carries the best parameters from here on.
knn_final_model = knn_g.set_params(**grid_knn_results.best_params_)
knn_final_model.fit(X_train, y_train)
knn_y_pred = knn_final_model.predict(X_test)
elapsed_time = (time.time() - start)
knn_y_proba = knn_final_model.predict_proba(X_test)
print("Time taken : ", elapsed_time)

## Logistic Regression model

In [None]:
# Different LogisticRegression hyperparameters
# The original call was np.logspace(0.1, 0.5, 1, 1.5). The fourth positional
# parameter of np.logspace is `endpoint`, not `base`, so 1.5 was silently
# treated as endpoint=True. The keywords below spell out what actually ran:
# a single C value of 10**0.1.
# NOTE(review): if a different base or more C values were intended, pass
# base=... and a larger num explicitly.
logreg_grid = {"C": np.logspace(0.1, 0.5, num=1, endpoint=True),
                  "solver": ["liblinear"],
                  "penalty": ["l1", "l2"],
                  "max_iter": [100, 1000, 2500, 5000]}

# Setup grid hyperparameter search for LogisticRegression
# Each candidate is evaluated with 10 stratified folds repeated 3 times.
logreg_g = LogisticRegression()
np.random.seed(22)
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3)
grid_logreg = GridSearchCV(estimator=logreg_g, param_grid=logreg_grid, n_jobs=1, cv=cv, scoring="accuracy", error_score=0)
grid_logreg_results = grid_logreg.fit(X_train, y_train);

#tune for best model and capture time
# The timed window covers refitting with the best parameters and predicting
# on X_test; predict_proba runs after the timer has been read.
start = time.time()
# set_params is applied to logreg_g itself, so logreg_g (reused by the later
# cross-validation cell) carries the best parameters from here on.
logreg_final_model = logreg_g.set_params(**grid_logreg_results.best_params_)
logreg_final_model.fit(X_train, y_train)
logreg_y_pred = logreg_final_model.predict(X_test)
elapsed_time = (time.time() - start)
logreg_y_proba = logreg_final_model.predict_proba(X_test)
print("Time taken : ", elapsed_time)

## Decision Tree model

In [None]:
# Different DecisionTreeClassifier hyperparameters
# max_leaf_nodes covers 2000, 2300, 2600; max_depth covers 500, 1000, 1500.
dectree_grid = {"max_leaf_nodes": range(2000, 2900, 300),
                  "criterion": ["gini", "entropy"],
                  "min_samples_split": [2, 4],
                  "max_depth": range(500, 2000, 500)}

# Setup grid hyperparameter search for DecisionTreeClassifier
# Each candidate is evaluated with 10 stratified folds repeated 3 times.
dectree_g = DecisionTreeClassifier()
np.random.seed(22)
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3)
grid_dectree = GridSearchCV(estimator=dectree_g, param_grid=dectree_grid, n_jobs=1, 
                            cv=cv, scoring="accuracy", error_score=0)
grid_dectree_results = grid_dectree.fit(X_train, y_train);

#tune for best model and capture time
# The timed window covers refitting with the best parameters and predicting
# on X_test; predict_proba runs after the timer has been read.
start = time.time()
# set_params is applied to dectree_g itself, so dectree_g (reused by the later
# cross-validation cell) carries the best parameters from here on.
dectree_final_model = dectree_g.set_params(**grid_dectree_results.best_params_)
dectree_final_model.fit(X_train, y_train)
dectree_y_pred = dectree_final_model.predict(X_test)
elapsed_time = (time.time() - start)
dectree_y_proba = dectree_final_model.predict_proba(X_test)
print("Time taken : ", elapsed_time)

# 6. Evaluation of tuned classification models

## A. Quick print of classification report, confusion matrix and best_params

In [None]:
# Text summary per tuned model: classification report, confusion matrix and
# the best hyperparameters found by the grid search.
# confusion_matrix expects (y_true, y_pred). The original calls passed the
# predictions first, which printed the transposed matrix; the order is fixed
# here so rows are the true classes and columns the predicted classes.
print("== K-Nearest Neighbor Evaluation ==")
print("Classification report====")
print(classification_report(y_test,knn_y_pred))
print("Confusion Matrix====")
print(confusion_matrix(y_test, knn_y_pred))
print("best_params====")
print(grid_knn_results.best_params_)
print("")

print("== Logistic Regression Evaluation ==")
print("Classification report====")
print(classification_report(y_test,logreg_y_pred))
print("Confusion Matrix====")
print(confusion_matrix(y_test, logreg_y_pred))
print("best_params====")
print(grid_logreg_results.best_params_)
print("")

print(" == Decision Tree Evaluation ==")
print("Classification report====")
print(classification_report(y_test,dectree_y_pred))
print("Confusion Matrix====")
print(confusion_matrix(y_test, dectree_y_pred))
print("")
print("best_params====")
print(grid_dectree_results.best_params_)
print("")

## B. ROC curve and AUC score

In [None]:
# Plotting ROC curves for the 3 models

plot = skplt.metrics.plot_roc(y_test, knn_y_proba)
plt.title("ROC Curves - K-Nearest Neighbors");

plot = skplt.metrics.plot_roc(y_test, logreg_y_proba)
plt.title("ROC Curves - Logistic Regression");

plot = skplt.metrics.plot_roc(y_test, dectree_y_proba)
plt.title("ROC Curves - Decision Tree");

## C. Confusion Matrix Plot

In [None]:
# Enlarge seaborn's fonts for the confusion-matrix figures
sns.set(font_scale=1.5)

def plot_conf_mat(y_pred, y_test, graph_title):
    """
    Plot a confusion matrix as an annotated Seaborn heatmap.

    The two label arguments are forwarded positionally to confusion_matrix,
    so the first one supplies the rows (drawn on the axis labelled
    "True wine class") and the second one the columns
    ("Predicted wine class").

    NOTE: the parameter names are the reverse of how the values are used.
    Pass the actual labels first (as `y_pred`) and the model predictions
    second (as `y_test`), as the calls in this notebook do. The names are
    kept unchanged for backward compatibility.

    y_pred : labels used for the matrix rows (the true classes)
    y_test : labels used for the matrix columns (the predicted classes)
    graph_title : title shown above the heatmap
    """
    # The figure handle is not needed; the old get_ylim() unpacking was dead
    # code (its result was never used) and has been removed.
    _, ax = plt.subplots(figsize=(5, 3))
    sns.heatmap(confusion_matrix(y_pred, y_test), annot=True, cmap="coolwarm_r",
                linewidths=0.5, fmt='g', ax=ax)
    ax.set_xlabel("Predicted wine class")
    ax.set_ylabel("True wine class")
    ax.set_title(graph_title)

In [None]:
# Arguments are passed as (true labels, model predictions); plot_conf_mat
# forwards them to confusion_matrix in that same order.
plot_conf_mat(y_test, knn_y_pred, "KNNeighbor confusion matrix")
plot_conf_mat(y_test, logreg_y_pred, "LogisticRegresssion Confusion matrix")
plot_conf_mat(y_test, dectree_y_pred, "DecisionTree Confusion Matrix")

## D. Cross Validation with each model's best hyperparameters

In [None]:
# Scaling and resampling the whole wine dataset for the cross-validation step.
# A dedicated scaler is fitted here so that `scaler`, which was fitted on the
# training split only, is not silently refitted on the full dataset.
cv_scaler = StandardScaler()
X_cross_val = pd.DataFrame(cv_scaler.fit_transform(X), columns=X.columns)

# NOTE(review): scaling and resampling happen before the data is split into
# folds, so validation folds can contain synthetic samples and share scaling
# statistics with the training folds; the cross-validated scores are likely
# optimistic. Consider resampling inside each fold instead - confirm.
X_cv, y_cv = smote_tomek.fit_resample(X_cross_val, y)

In [None]:

def cross_val_best_param(clf, title):
    """
    Cross-validate `clf` on the resampled dataset and report the mean metrics.

    Runs one 10-fold cross-validation on the notebook-level X_cv / y_cv,
    collecting accuracy, precision, recall and F1 in the same pass, then
    prints the title and the mean scores and draws them as a bar chart.

    clf : estimator to evaluate (cloned per fold by scikit-learn)
    title : text printed above the scores and used as the chart title

    Returns None.
    """
    # Local import: the notebook's import cell only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    # One cross_validate call replaces four separate cross_val_score runs:
    # every metric now comes from the same fitted models, and the estimator
    # is fitted 10 times instead of 40.
    scoring = {"Accuracy": "accuracy",
               "Precision": "precision",
               "Recall": "recall",
               "F1": "f1"}
    cv_results = cross_validate(clf, X_cv, y_cv, cv=10, scoring=scoring)

    # Mean of each metric across the folds, one column per metric
    cv_metrics = pd.DataFrame(
        {label: np.mean(cv_results["test_" + label]) for label in scoring},
        index=[0])

    # Visualizing cross-validated metrics
    print(title)
    print(cv_metrics)
    cv_metrics.T.plot.bar(title=title, legend=False)


In [None]:
# knn_g, dectree_g and logreg_g had set_params(**best_params_) applied in the
# tuning cells above, so these calls cross-validate the tuned configurations.
cross_val_best_param(knn_g, "K_Nearest neighbor model cross-validation")
print("=========================================================================")
cross_val_best_param(dectree_g, "Decision Tree model cross-validation")
print("=========================================================================")
cross_val_best_param(logreg_g, "Logistic Regression model cross-validation")
