# ML CLASSIFICATION - {"RED WINE QUALITY" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from pickle import dump, load

import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# which can also hide estimator diagnostics (e.g. solver convergence warnings) -
# confirm that a blanket filter is really wanted here.
warnings.filterwarnings('ignore')

from sklearn import set_config
# Ask scikit-learn to render estimators/pipelines as diagrams when a cell displays them.
set_config(display='diagram')

In [2]:
# Pandas display options: how many rows a truncated frame shows, the row count
# above which truncation starts, and the number of decimals printed for floats.
for opt_name, opt_value in {
    'display.min_rows': 5,
    'display.max_rows': 25,
    'display.precision': 4,
}.items():
    pd.set_option(opt_name, opt_value)

# Seaborn theme applied to every plot drawn in this notebook.
sb.set_theme(
    context='notebook',
    style='whitegrid',
    palette='pastel',
    font='times new roman',
    font_scale=1.25,
)

## 2. Importing Train Dataset

In [3]:
# Read the pickled training split and report its dimensions.
tr = pd.read_pickle('wine_quality_FE_final_train.pkl')
print(f'Shape of the train dataset : {tr.shape}')

# Last expression of the cell: preview the first five rows.
tr.head()

Shape of the train dataset : (1230, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.5,0.51,0.15,3.0,0.064,12.0,27.0,0.9929,3.33,0.59,12.8,1
1,10.1,0.31,0.44,2.3,0.08,22.0,46.0,0.9988,3.32,0.67,9.7,1
2,10.5,0.51,0.64,2.4,0.107,6.0,15.0,0.9973,3.09,0.66,11.8,1
3,7.6,0.645,0.03,1.9,0.086,14.0,57.0,0.9969,3.37,0.46,10.3,0
4,10.7,0.67,0.22,2.7,0.107,17.0,34.0,1.0004,3.28,0.975,9.9,1


In [4]:
# Target column first, then every remaining column as the feature matrix.
ytr = tr['quality']
Xtr = tr.drop(columns=['quality'])

## 3. Comparing Performance of Hyper Parameter Tuned Models

### 3.1 Dictionary of Models

In [5]:
# Candidate classifiers keyed by a short display name. The hyper-parameters are
# fixed here (presumably the result of an earlier tuning step - see section title).
# Every estimator that accepts a seed uses 46 so repeated runs are comparable.
mdl_dict = dict(
    Log_Reg=LogisticRegression(
        C=1.0, max_iter=50, penalty='l2', solver='lbfgs', random_state=46,
    ),
    KN_CLF=KNeighborsClassifier(
        algorithm='brute', metric='euclidean', n_neighbors=17, weights='distance',
    ),
    # NOTE(review): `degree` only affects the 'poly' kernel; with kernel='rbf' it has no effect.
    SV_CLF=SVC(C=0.1, degree=2, gamma='auto', kernel='rbf', random_state=46),
    DT_CLF=DecisionTreeClassifier(
        criterion='entropy', max_depth=5, min_impurity_decrease=0.0,
        min_samples_split=0.3, splitter='random', random_state=46,
    ),
    BAG_CLF=BaggingClassifier(
        bootstrap=True, estimator=DecisionTreeClassifier(), max_samples=0.5,
        n_estimators=200, oob_score=True, random_state=46,
    ),
    RF_CLF=RandomForestClassifier(
        bootstrap=True, criterion='entropy', max_depth=5, max_samples=0.5,
        n_estimators=200, oob_score=True, random_state=46,
    ),
    GB_CLF=GradientBoostingClassifier(
        learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.5,
        random_state=46,
    ),
    HGB_CLF=HistGradientBoostingClassifier(
        learning_rate=0.1, max_depth=5, max_iter=200, max_leaf_nodes=25,
        random_state=46,
    ),
    XGB_CLF=XGBClassifier(
        eta=0.1, gamma=0.01, max_depth=5, n_estimators=50, subsample=0.5,
        objective='binary:logistic', eval_metric='auc', seed=46,
    ),
)

# Echo each configured estimator so the settings are visible in the notebook output.
print(f'Models for Performance Comparison : \n')
for model_name, model in mdl_dict.items():
    print(f'{model_name} : \n {model} \n')

Models for Performance Comparison : 

Log_Reg : 
 LogisticRegression(max_iter=50, random_state=46) 

KN_CLF : 
 KNeighborsClassifier(algorithm='brute', metric='euclidean', n_neighbors=17,
                     weights='distance') 

SV_CLF : 
 SVC(C=0.1, degree=2, gamma='auto', random_state=46) 

DT_CLF : 
 DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=0.3,
                       random_state=46, splitter='random') 

BAG_CLF : 
 BaggingClassifier(estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=200, oob_score=True, random_state=46) 

RF_CLF : 
 RandomForestClassifier(criterion='entropy', max_depth=5, max_samples=0.5,
                       n_estimators=200, oob_score=True, random_state=46) 

GB_CLF : 
 GradientBoostingClassifier(learning_rate=0.01, max_depth=5, n_estimators=200,
                           random_state=46, subsample=0.5) 

HGB_CLF : 
 HistGradientBoostingClassifier(max_depth=5, max_iter=200, max_leaf_nodes=25

### 3.2 Calculating Model Performance

In [6]:
# Pre Processors -------------------------------------------------------------------------------------------------------------
# Both transformers are given the same first 11 columns.
# NOTE(review): ColumnTransformer runs its transformers side by side and concatenates
# their outputs, so the estimator receives 22 columns (un-standardized Yeo-Johnson
# values plus standardized raw values), not a transform-then-scale chain.
# Confirm this is intended before changing it.
ct_preproc = ColumnTransformer(
    transformers=[
        ('yj', PowerTransformer(method='yeo-johnson', standardize=False), slice(0, 11)),
        ('ss', StandardScaler(), slice(0, 11)),
    ],
    remainder='passthrough',
)


# Feature Selection -----------------------------------------------------------------------------------------------------------
# k='all' keeps every column, so this step currently acts as a pass-through.
skb = SelectKBest(score_func=mutual_info_classif, k='all')


# Function to calculate models performance using Pre-Processors, Feature Selection, and Estimators in the Pipeline -------------
def mdl_scores(mod_name, mod):
    """Cross-validate one estimator inside the shared preprocessing pipeline.

    The estimator is placed after the notebook-level ``ct_preproc`` and ``skb``
    steps and scored on the notebook-level ``Xtr`` / ``ytr`` with a seeded,
    shuffled 10-fold stratified split.

    Parameters
    ----------
    mod_name : label stored as the first element of the result.
    mod : estimator used as the final ``'mdl'`` step of the pipeline.

    Returns
    -------
    list
        ``[mod_name, mean_accuracy_percent]`` where the second element is the
        mean of the per-fold accuracies multiplied by 100.
    """
    pipe = Pipeline([
        ('ct_preproc', ct_preproc),
        ('skb', skb),
        ('mdl', mod),
    ])

    # Fixed seed so every model is evaluated on identical folds.
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
    fold_acc = cross_val_score(pipe, Xtr, ytr, cv=kfold, scoring='accuracy')

    return [mod_name, fold_acc.mean()*100]

### 3.3 Comparing Model Performance

In [7]:
# One [name, accuracy %] row per candidate model.
mdl_output = [mdl_scores(mdl_name, mdl) for mdl_name, mdl in mdl_dict.items()]

mdl_perf = pd.DataFrame(mdl_output, columns=['Model','Acc_Score'])

print(f'Comparsion of Model Performance : \n')
# Best-scoring model first.
mdl_perf.sort_values(by='Acc_Score', ascending=False)

Comparsion of Model Performance : 



Unnamed: 0,Model,Acc_Score
6,GB_CLF,77.2358
8,XGB_CLF,77.0732
5,RF_CLF,76.6667
4,BAG_CLF,76.5854
0,Log_Reg,76.2602
1,KN_CLF,76.1789
7,HGB_CLF,76.0163
2,SV_CLF,75.3659
3,DT_CLF,72.3577


## 4. Best Model With Hyper-Parameters Tuned

In [8]:
# Pre Processors -------------------------------------------------------------------------------------------------------------
# Same preprocessing definition as the one used for the model comparison.
# NOTE(review): ColumnTransformer applies 'yj' and 'ss' in parallel to the same 11
# columns and concatenates the results (22 columns), rather than chaining them -
# confirm this is intended.
ct_preproc = ColumnTransformer(
    transformers=[
        ('yj', PowerTransformer(method='yeo-johnson', standardize=False), slice(0, 11)),
        ('ss', StandardScaler(), slice(0, 11)),
    ],
    remainder='passthrough',
)


# Feature Selection -----------------------------------------------------------------------------------------------------------
# k='all' retains every column.
skb = SelectKBest(score_func=mutual_info_classif, k='all')


# ML Pipeline -----------------------------------------------------------------------------------------------------------------
# Final estimator: the gradient-boosting configuration that topped the comparison table.
steps = [
    ('ct_preproc', ct_preproc),
    ('skb', skb),
    ('mdl', GradientBoostingClassifier(
        learning_rate=0.01, max_depth=5, n_estimators=200, subsample=0.5, random_state=46,
    )),
]
pipe_best = Pipeline(steps)

# Fit on the full training split; as the last expression this also displays the fitted pipeline.
pipe_best.fit(Xtr, ytr)

### 4.1 Best Model Evaluation with cross_val_score using scoring='accuracy'

In [9]:
### Cross Val Score using 'accuracy' metrics

# Seeded, shuffled 10-fold stratified splitter.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')

# Mean accuracy across the folds.
cv_score = cross_val_score(
    estimator=pipe_best,
    X=Xtr,
    y=ytr,
    cv=kfold,
    scoring='accuracy',
).mean()
print(f'Cross Validation Score is : {round(cv_score*100,4)} %')

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Cross Validation Score is : 77.2358 %


### 4.2 Best Model Evaluation with cross_validate using scoring=['accuracy','f1','roc_auc']

In [10]:
### Cross Validate Score using ['accuracy', 'f1','roc_auc'] metrics

# Seeded, shuffled 10-fold stratified splitter.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')

scoring = ['accuracy', 'f1','roc_auc']
print(f'Scoring Metrics : \n {scoring} \n')

# cross_validate returns a dict of per-fold arrays (timings plus one entry per metric).
scores = cross_validate(pipe_best, Xtr, ytr, cv=kfold, scoring=scoring)
print(f'Keys in the Score : \n {sorted(scores)} \n')

# One column per key, one row per fold; then average each column over the folds.
res = pd.DataFrame(scores)
print(f'Result of Cross Validation : mean of score keys')
res.mean().rename_axis('Keys').reset_index(name='Mean Score')

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Scoring Metrics : 
 ['accuracy', 'f1', 'roc_auc'] 

Keys in the Score : 
 ['fit_time', 'score_time', 'test_accuracy', 'test_f1', 'test_roc_auc'] 

Result of Cross Validation : mean of score keys


Unnamed: 0,Keys,Mean Score
0,fit_time,2.5948
1,score_time,0.0283
2,test_accuracy,0.7724
3,test_f1,0.766
4,test_roc_auc,0.8386


## 5. Model Save/Export

In [11]:
# Persist the fitted pipeline. The context manager guarantees the file is flushed
# and closed even if pickling raises, instead of leaving the handle to the garbage collector.
with open('wine_quality_mdl_best.pkl', 'wb') as mdl_file:
    dump(pipe_best, mdl_file)
print('Model Object Saved Successfully \n')

# Persist the training feature frame alongside the model.
with open('wine_quality_X_best.pkl', 'wb') as feat_file:
    dump(Xtr, feat_file)
print('"X" Features Saved Successfully')

Model Object Saved Successfully 

"X" Features Saved Successfully


## 6. Simple Prediction System

### 6.1 Load Best Model

In [12]:
# Reload the pipeline saved in the previous section; the context manager closes the
# file handle as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code on load - only open files produced by this project.
with open('wine_quality_mdl_best.pkl', 'rb') as mdl_file:
    mdl_pipe = load(mdl_file)

### 6.2 Validation Dataset 

#### 6.2.1 Importing Dataset

In [13]:
# Read the pickled validation split and report its dimensions.
val = pd.read_pickle('wine_quality_FE_final_valid.pkl')
print(f'Shape of the validation dataset : {val.shape}')

# Separate the feature columns from the target column.
Xval = val.drop(columns='quality')
yval = val['quality']

# The preview must be the final expression of the cell to be rendered;
# placed mid-cell its result was silently discarded.
val.head(5)

Shape of the validation dataset : (100, 12)


#### 6.2.2 Overall Result on Validation Data

In [14]:
# Score the reloaded pipeline on the validation features.
yval_pred = mdl_pipe.predict(Xval)

# Compute each metric once, then print them.
val_acc = round(accuracy_score(yval, yval_pred)*100, 4)
val_cm = confusion_matrix(yval, yval_pred)
val_report = classification_report(yval, yval_pred)

print(f'Accuracy Score on Validation Data : {val_acc} % \n')

print(f'Confusion Matrix on Validation Data : \n {val_cm} \n')

print(f'Classification Report on Validation Data : \n\n {val_report}')

Accuracy Score on Validation Data : 75.0 % 

Confusion Matrix on Validation Data : 
 [[35 13]
 [12 40]] 

Classification Report on Validation Data : 

               precision    recall  f1-score   support

           0       0.74      0.73      0.74        48
           1       0.75      0.77      0.76        52

    accuracy                           0.75       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.75      0.75      0.75       100



#### 6.2.3 Predictions on Validation Samples

In [15]:
# Draw ten validation rows for the demo. The draw is seeded (46, the seed used
# throughout this notebook) so Restart & Run All reproduces the same table.
samp = Xval.sample(10, random_state=46).index.values.tolist()

# Put the features and the true label of the sampled rows side by side.
val_df = pd.concat([Xval.loc[samp], yval.loc[samp]], axis=1)
val_df.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
958,5.8,0.29,0.26,1.7,0.063,3.0,11.0,0.9915,3.39,0.54,13.5,1
208,8.2,1.0,0.09,2.3,0.065,7.0,37.0,0.9969,3.32,0.55,9.0,1
1336,5.6,0.31,0.78,13.9,0.074,23.0,92.0,0.9968,3.39,0.48,10.5,1
69,9.7,0.32,0.54,2.5,0.094,28.0,83.0,0.9984,3.28,0.82,9.6,0
727,10.6,0.5,0.45,2.6,0.119,34.0,68.0,0.9971,3.23,0.72,10.9,1
477,12.4,0.35,0.49,2.6,0.079,27.0,69.0,0.9994,3.12,0.75,10.4,1
962,9.1,0.6,0.0,1.9,0.058,5.0,10.0,0.9977,3.18,0.63,10.4,1
1063,6.9,0.41,0.33,2.2,0.081,22.0,36.0,0.9949,3.41,0.75,11.1,1
280,7.8,0.62,0.05,2.3,0.079,6.0,18.0,0.9973,3.29,0.63,9.3,0
754,7.1,0.47,0.0,2.2,0.067,7.0,14.0,0.9952,3.4,0.58,10.9,0


In [16]:
# Alternative: score a hand-entered wine by building a single-row 2-D array instead.
#inp_data = (13.7,0.415,0.68,2.9,0.085,17.0,43.0,1.0014,3.06,0.80,10.0)
#inp_data = np.array(inp_data).reshape(1,-1)

# Pick one row from the sampled validation frame; seeded so a re-run selects the same row.
idx = val_df.sample(random_state=46).index.values
# 2-D array with a single row, looked up by index label in the validation features.
inp_data = Xval.loc[idx].values

print(f'Validation Data with Index : {idx[0]} is Selected for Prediction: \n')

# List each feature name next to the selected row's value.
for fea_name, fea_val in zip(Xval.columns.tolist(), inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Validation Data with Index : 477 is Selected for Prediction: 

"fixed acidity" ---:--- 12.4
"volatile acidity" ---:--- 0.35
"citric acid" ---:--- 0.49
"residual sugar" ---:--- 2.6
"chlorides" ---:--- 0.079
"free sulfur dioxide" ---:--- 27.0
"total sulfur dioxide" ---:--- 69.0
"density" ---:--- 0.9994
"pH" ---:--- 3.12
"sulphates" ---:--- 0.75
"alcohol" ---:--- 10.4


In [17]:
print(f'------------- Prediction for Validation Data with Index : {idx[0]} --------------------\n')

# True label of the selected row, looked up by its index label.
print(f'Actual Wine Quality for the Selected Data : {yval[idx[0]]} \n')

pred = mdl_pipe.predict(inp_data)

# Class 1 is reported as high quality, anything else as low quality.
quality_label = 'High Quality Wine' if pred[0] == 1 else 'Low Quality Wine'
print(f'Predicted Wine Quality : {pred[0]} ({quality_label})')

------------- Prediction for Validation Data with Index : 477 --------------------

Actual Wine Quality for the Selected Data : 1 

Predicted Wine Quality : 1 (High Quality Wine)


### 6.3 Test Data

#### 6.3.1 Importing Dataset

In [18]:
# Read the pickled test split and report its dimensions.
te = pd.read_pickle('wine_quality_FE_final_test.pkl')
print(f'Shape of the test dataset : {te.shape}')

# Separate the feature columns from the target column.
Xte = te.drop(columns='quality')
yte = te['quality']

# The preview must be the final expression of the cell to be rendered;
# placed mid-cell its result was silently discarded.
te.head(5)

Shape of the test dataset : (100, 12)


#### 6.3.2 Overall Result on Test Data

In [19]:
# Score the reloaded pipeline on the test features.
yte_pred = mdl_pipe.predict(Xte)

# Compute each metric once, then print them.
te_acc = round(accuracy_score(yte, yte_pred)*100, 4)
te_cm = confusion_matrix(yte, yte_pred)
te_report = classification_report(yte, yte_pred)

print(f'Shape of the Test Data : {te.shape} \n')

print(f'Accuracy Score on Test Data : {te_acc} % \n')

print(f'Confusion Matrix on Test Data : \n {te_cm} \n')

print(f'Classification Report on Test Data : \n\n {te_report}')

Shape of the Test Data : (100, 12) 

Accuracy Score on Test Data : 76.0 % 

Confusion Matrix on Test Data : 
 [[34 14]
 [10 42]] 

Classification Report on Test Data : 

               precision    recall  f1-score   support

           0       0.77      0.71      0.74        48
           1       0.75      0.81      0.78        52

    accuracy                           0.76       100
   macro avg       0.76      0.76      0.76       100
weighted avg       0.76      0.76      0.76       100



#### 6.3.3 Predictions on Test Samples

In [20]:
# Draw ten test rows for the demo. The draw is seeded (46, the seed used
# throughout this notebook) so Restart & Run All reproduces the same table.
samp = Xte.sample(10, random_state=46).index.values.tolist()

# Put the features and the true label of the sampled rows side by side.
te_df = pd.concat([Xte.loc[samp], yte.loc[samp]], axis=1)
te_df.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
403,11.0,0.26,0.68,2.55,0.085,10.0,25.0,0.997,3.18,0.61,11.8,0
261,7.5,0.53,0.06,2.6,0.086,20.0,44.0,0.9965,3.38,0.59,10.7,1
930,9.2,0.31,0.36,2.2,0.079,11.0,31.0,0.9961,3.33,0.86,12.0,1
1066,7.1,0.72,0.0,1.8,0.123,6.0,14.0,0.9963,3.45,0.58,9.8,0
646,7.3,0.51,0.18,2.1,0.07,12.0,28.0,0.9977,3.52,0.73,9.5,1
1260,6.8,0.68,0.21,2.1,0.07,9.0,23.0,0.9955,3.38,0.6,10.3,0
1188,7.9,0.69,0.21,2.1,0.08,33.0,141.0,0.9962,3.25,0.51,9.9,0
449,12.5,0.6,0.49,4.3,0.1,5.0,14.0,1.001,3.25,0.74,11.9,1
894,9.3,0.5,0.36,1.8,0.084,6.0,17.0,0.997,3.27,0.77,10.8,1
1282,6.4,0.79,0.04,2.2,0.061,11.0,17.0,0.9959,3.53,0.65,10.4,1


In [21]:
# Alternative: score a hand-entered wine by building a single-row 2-D array instead.
#inp_data = (13.7,0.415,0.68,2.9,0.085,17.0,43.0,1.0014,3.06,0.80,10.0)
#inp_data = np.array(inp_data).reshape(1,-1)

# Pick one row from the sampled test frame; seeded so a re-run selects the same row.
idx = te_df.sample(random_state=46).index.values
# 2-D array with a single row, looked up by index label in the test features.
inp_data = Xte.loc[idx].values

print(f'Test Data with Index : {idx[0]} is Selected for Prediction: \n')

# List each feature name next to the selected row's value.
for fea_name, fea_val in zip(Xte.columns.tolist(), inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Test Data with Index : 261 is Selected for Prediction: 

"fixed acidity" ---:--- 7.5
"volatile acidity" ---:--- 0.53
"citric acid" ---:--- 0.06
"residual sugar" ---:--- 2.6
"chlorides" ---:--- 0.086
"free sulfur dioxide" ---:--- 20.0
"total sulfur dioxide" ---:--- 44.0
"density" ---:--- 0.9965
"pH" ---:--- 3.38
"sulphates" ---:--- 0.59
"alcohol" ---:--- 10.7


In [22]:
print(f'------------- Prediction for Test Data with Index : {idx[0]} --------------------\n')

# True label of the selected row, looked up by its index label.
print(f'Actual Wine Quality for the Selected Data : {yte[idx[0]]} \n')

pred = mdl_pipe.predict(inp_data)

# Class 1 is reported as high quality, anything else as low quality.
quality_label = 'High Quality Wine' if pred[0] == 1 else 'Low Quality Wine'
print(f'Predicted Wine Quality : {pred[0]} ({quality_label})')

------------- Prediction for Test Data with Index : 261 --------------------

Actual Wine Quality for the Selected Data : 1 

Predicted Wine Quality : 1 (High Quality Wine)
