# ML CLASSIFICATION - {"RED WINE QUALITY" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from pickle import dump, load

import warnings
warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(display='diagram')

In [2]:
# Pandas display options
# - display.min_rows : rows shown once a frame is truncated
# - display.max_rows : row count above which a frame is truncated
# - display.precision: decimals shown for floating point values
pandas_display_options = {
    'display.min_rows': 5,
    'display.max_rows': 25,
    'display.precision': 4,
}

for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

# Seaborn theme shared by every figure in this notebook.
seaborn_theme = dict(
    context='notebook',
    style='whitegrid',
    palette='pastel',
    font='times new roman',
    font_scale=1.25,
)

sb.set_theme(**seaborn_theme)

## 2. Importing Train Dataset

In [3]:
# Read the pickled training frame, report its dimensions and preview the first rows.
train_pickle_path = 'wine_quality_FE_prod_train.pkl'
tr = pd.read_pickle(train_pickle_path)

train_shape = tr.shape
print(f'Shape of the train dataset : {train_shape}')

# head() defaults to five rows
tr.head()

Shape of the train dataset : (1334, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.67,0.02,1.9,0.061,26.0,42.0,0.9949,3.39,0.82,10.9,1
1,8.8,0.955,0.05,1.8,0.075,5.0,19.0,0.9962,3.3,0.44,9.6,0
2,7.1,0.68,0.0,2.3,0.087,17.0,26.0,0.9978,3.45,0.53,9.5,0
3,11.9,0.39,0.69,2.8,0.095,17.0,35.0,0.9994,3.1,0.61,10.8,1
4,7.1,0.22,0.49,1.8,0.039,8.0,18.0,0.9934,3.39,0.56,12.4,1


In [4]:
# Separate the 'quality' target from the predictor columns.
target_col = 'quality'
ytr = tr[target_col]
Xtr = tr.drop(columns=[target_col])

## 3. Production Model Pipeline

### 3.1 Pre-Processing Steps

In [5]:
# Pre Processors ---------------------------------------------------------------------------------------------
# Both transformers are pointed at the same positional columns 0..10.
# NOTE(review): ColumnTransformer runs its transformers side by side and
# concatenates their outputs, so this emits a Yeo-Johnson copy AND a
# standard-scaled copy of every column instead of scaling the Yeo-Johnson
# output. If a sequential "transform, then scale" was intended, the two steps
# would need to be nested in a Pipeline -- confirm before changing, since the
# tuned model and the reported scores depend on the current layout.
feature_cols = slice(0, 11)

yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=False)
standard_scaler = StandardScaler()

ct_preproc = ColumnTransformer(
    transformers=[
        ('yj', yeo_johnson, feature_cols),
        ('ss', standard_scaler, feature_cols),
    ],
    remainder='passthrough',
)


# Feature Selection ------------------------------------------------------------------------------------------
# k='all' keeps every feature; the step only records the mutual-information scores.
skb = SelectKBest(score_func=mutual_info_classif, k='all')

### 3.2 Production Model : GradientBoostingClassifier()

In [6]:
# Production Model -------------------------------------------------------------------------------------------
# Hyper-parameters of the production classifier, gathered in one place.
# subsample < 1.0 makes each tree train on a random fraction of the rows, and
# random_state pins that randomness.
gbc_params = {
    'learning_rate': 0.01,
    'max_depth': 5,
    'n_estimators': 200,
    'subsample': 0.5,
    'random_state': 46,
}

mdl = GradientBoostingClassifier(**gbc_params)


### 3.3 ML Pipeline

In [7]:
# ML Pipeline -----------------------------------------------------------------------------------------------------------------
# Chain preprocessing, feature selection and the classifier into one estimator,
# then fit it on the full training split.
steps = [
    ('ct_preproc', ct_preproc),
    ('skb', skb),
    ('mdl', mdl),
]

pipe_prod = Pipeline(steps=steps)

# fit() returns the pipeline itself, so leaving it last renders the diagram.
pipe_prod.fit(Xtr, ytr)

### 3.4 Production Model Evaluation with cross_val_score using scoring='accuracy'

In [8]:
### Cross Val Score using 'accuracy' metrics
# Ten stratified, shuffled folds with a fixed seed so the split is repeatable.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')

# One accuracy per fold; the reported figure is their mean.
fold_accuracies = cross_val_score(estimator=pipe_prod, X=Xtr, y=ytr, scoring='accuracy', cv=kfold)
cv_score = fold_accuracies.mean()
print(f'Cross Validation Score is : {round(cv_score*100,4)} %')

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Cross Validation Score is : 76.3831 %


### 3.5 Production Model Evaluation with cross_validate using scoring=['accuracy','f1','roc_auc']

In [9]:
### Cross Validate Score using ['accuracy', 'f1','roc_auc'] metrics
# Same fold configuration as the single-metric run.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')

scoring = ['accuracy', 'f1','roc_auc']
print(f'Scoring Metrics : \n {scoring} \n')

# cross_validate returns a dict of per-fold arrays (timings plus one entry per metric).
scores = cross_validate(estimator=pipe_prod, X=Xtr, y=ytr, scoring=scoring, cv=kfold)
score_keys = sorted(scores.keys())
print(f'Keys in the Score : \n {score_keys} \n')

res = pd.DataFrame(scores)
print('Result of Cross Validation : mean of score keys')

# Average every key over the folds and present it as a two-column table.
res.mean().rename_axis('Keys').reset_index(name='Mean Score')

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Scoring Metrics : 
 ['accuracy', 'f1', 'roc_auc'] 

Keys in the Score : 
 ['fit_time', 'score_time', 'test_accuracy', 'test_f1', 'test_roc_auc'] 

Result of Cross Validation : mean of score keys


Unnamed: 0,Keys,Mean Score
0,fit_time,2.7398
1,score_time,0.0286
2,test_accuracy,0.7638
3,test_f1,0.7526
4,test_roc_auc,0.8429


## 4. Model Save/Export

In [10]:
# Persist the fitted pipeline. The `with` block guarantees the file handle is
# flushed and closed before the file is read back later in the notebook.
with open('wine_quality_mdl_prod.pkl', 'wb') as model_file:
    dump(pipe_prod, model_file)
print('Model Object Saved Successfully \n')

# Persist the training features alongside the model.
with open('wine_quality_X_prod.pkl', 'wb') as features_file:
    dump(Xtr, features_file)
print('"X" Features Saved Successfully')

Model Object Saved Successfully 

"X" Features Saved Successfully


## 5. Simple Prediction System on Test Data

#### 5.1 Load Model

In [11]:
# Reload the pickled pipeline; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('wine_quality_mdl_prod.pkl', 'rb') as model_file:
    mdl_pipe = load(model_file)

#### 5.2.1 Importing Dataset

In [12]:
# Read the pickled test frame and split it into predictors / target.
te = pd.read_pickle('wine_quality_FE_prod_test.pkl')

print(f'Shape of the test dataset : {te.shape}')

Xte = te.drop(columns='quality')
yte = te['quality']

# A notebook only renders a cell's LAST expression; the preview used to sit
# mid-cell, where it was silently discarded.
te.head(5)

Shape of the test dataset : (100, 12)


#### 5.2.2 Overall Result on Test Data

In [13]:
# Predict the whole test split once, derive every metric, then report.
yte_pred = mdl_pipe.predict(Xte)

test_accuracy = accuracy_score(yte, yte_pred)
test_confusion = confusion_matrix(yte, yte_pred)
test_report = classification_report(yte, yte_pred)

print(f'Shape of the Test Data : {te.shape} \n')
print(f'Accuracy Score on Test Data : {round(test_accuracy*100,4)} % \n')
print(f'Confusion Matrix on Test Data : \n {test_confusion} \n')
print(f'Classification Report on Test Data : \n\n {test_report}')

Shape of the Test Data : (100, 12) 

Accuracy Score on Test Data : 77.0 % 

Confusion Matrix on Test Data : 
 [[34 14]
 [ 9 43]] 

Classification Report on Test Data : 

               precision    recall  f1-score   support

           0       0.79      0.71      0.75        48
           1       0.75      0.83      0.79        52

    accuracy                           0.77       100
   macro avg       0.77      0.77      0.77       100
weighted avg       0.77      0.77      0.77       100



#### 5.2.3 Predictions on Test Samples

In [14]:
# Draw ten test rows. The seed (same value used elsewhere in this notebook)
# makes the selection identical on every "Restart & Run All".
samp = Xte.sample(10, random_state=46).index.tolist()

# Put the sampled features and their true labels side by side.
te_df = pd.concat([Xte.loc[samp], yte.loc[samp]], axis=1)
te_df.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
930,9.2,0.31,0.36,2.2,0.079,11.0,31.0,0.9961,3.33,0.86,12.0,1
420,10.6,0.28,0.39,15.5,0.069,6.0,23.0,1.0026,3.12,0.66,9.2,0
473,14.3,0.31,0.74,1.8,0.075,6.0,15.0,1.0008,2.86,0.79,8.4,1
760,8.9,0.75,0.14,2.5,0.086,9.0,30.0,0.9982,3.34,0.64,10.5,0
1149,6.1,0.34,0.25,1.8,0.084,4.0,28.0,0.9946,3.36,0.44,10.1,0
1356,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.9951,3.52,0.76,11.2,1
87,4.7,0.6,0.17,2.3,0.058,17.0,106.0,0.9932,3.85,0.6,12.9,1
1184,8.7,0.675,0.1,1.6,0.09,4.0,11.0,0.9974,3.31,0.65,9.55,0
1064,7.1,0.6,0.01,2.3,0.079,24.0,37.0,0.9951,3.4,0.61,10.9,1
896,5.6,0.605,0.05,2.4,0.073,19.0,25.0,0.9926,3.56,0.55,12.9,0


In [15]:
# To score a hand-entered wine instead of a sampled row, uncomment the two
# lines below (eleven feature values reshaped into a single-row array):
#inp_data = (13.7,0.415,0.68,2.9,0.085,17.0,43.0,1.0014,3.06,0.80,10.0)
#inp_data = np.array(inp_data).reshape(1,-1)

# Pick one of the previously sampled rows; seeded so the choice is repeatable.
idx = te_df.sample(random_state=46).index.values
inp_data = Xte.loc[idx].values

print(f'Test Data with Index : {idx[0]} is Selected for Prediction: \n')

# Echo every feature of the selected row next to its column name.
for fea_name, fea_val in zip(Xte.columns, inp_data[0]):
    print(f'"{fea_name}" ---:--- {fea_val}')

Test Data with Index : 896 is Selected for Prediction: 

"fixed acidity" ---:--- 5.6
"volatile acidity" ---:--- 0.605
"citric acid" ---:--- 0.05
"residual sugar" ---:--- 2.4
"chlorides" ---:--- 0.073
"free sulfur dioxide" ---:--- 19.0
"total sulfur dioxide" ---:--- 25.0
"density" ---:--- 0.99258
"pH" ---:--- 3.56
"sulphates" ---:--- 0.55
"alcohol" ---:--- 12.9


In [16]:
# Report the true label of the selected row, then the pipeline's prediction.
selected_index = idx[0]
print(f'------------- Prediction for Test Data with Index : {selected_index} --------------------\n')

print(f'Actual Wine Quality for the Selected Data : {yte[selected_index]} \n')

pred = mdl_pipe.predict(inp_data)

# Class 1 is reported as high quality, anything else as low quality.
verdict = (
    f'Predicted Wine Quality : {pred[0]} (High Quality Wine)'
    if pred[0] == 1
    else f'Predicted Wine Quality : {pred[0]} (Low Quality Wine)'
)
print(verdict)

------------- Prediction for Test Data with Index : 896 --------------------

Actual Wine Quality for the Selected Data : 0 

Predicted Wine Quality : 1 (High Quality Wine)
