# ML CLASSIFICATION - {"RED WINE QUALITY" DATASET}

## 1. Importing Modules and Setting Configurations

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from pickle import dump, load

import warnings
warnings.filterwarnings('ignore')

from sklearn import set_config
set_config(display='diagram')

In [2]:
# PD Options

pd.set_option('display.min_rows', 5)
pd.set_option('display.max_rows', 25)
pd.set_option('display.precision', 4)

# SB Options

sb.set_theme(context='notebook', style='whitegrid', palette='pastel', font='times new roman', font_scale=1.25)

## 2. Importing Train Dataset

In [3]:
tr = pd.read_pickle('wine_quality_FE_final_train.pkl')

print(f'Shape of the train dataset : {tr.shape}')
tr.head(5)

Shape of the train dataset : (1230, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.5,0.51,0.15,3.0,0.064,12.0,27.0,0.9929,3.33,0.59,12.8,1
1,10.1,0.31,0.44,2.3,0.08,22.0,46.0,0.9988,3.32,0.67,9.7,1
2,10.5,0.51,0.64,2.4,0.107,6.0,15.0,0.9973,3.09,0.66,11.8,1
3,7.6,0.645,0.03,1.9,0.086,14.0,57.0,0.9969,3.37,0.46,10.3,0
4,10.7,0.67,0.22,2.7,0.107,17.0,34.0,1.0004,3.28,0.975,9.9,1


In [4]:
Xtr = tr.drop(columns='quality')
ytr = tr['quality']

## 3. Hyper Parameter Tuning

In [76]:
# Pre Processors ---------------------------------------------------------------------------------------------
ct_preproc = ColumnTransformer([
    ('yj',PowerTransformer(method='yeo-johnson', standardize=False),slice(0,11)),
    ('ss',StandardScaler(),slice(0,11))
    ], remainder='passthrough')


# Feature Selection ------------------------------------------------------------------------------------------
skb = SelectKBest(mutual_info_classif, k='all')

### 3.1 LogisticRegression

In [77]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',LogisticRegression(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__C': [0.1,0.5,1.0],
    'mdl__max_iter':[50,100,200],
    'mdl__penalty':['l2'],                                           # ['l2','elasticnet',None]
    'mdl__solver':['lbfgs','sag','liblinear','newton-cg']            # ['lbfgs','liblinear','newton-cg','sag','saga']
    #'mdl__l1_ratio':[1,0.5,0]                                       # 1=l1(lasso) ----- 0=l2(ridge)
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__C': [0.1, 0.5, 1.0], 'mdl__max_iter': [50, 100, 200], 'mdl__penalty': ['l2'], 'mdl__solver': ['lbfgs', 'sag', 'liblinear', 'newton-cg']} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 36 candidates, totalling 360 fits
Best Params : 
 {'mdl__C': 1.0, 'mdl__max_iter': 50, 'mdl__penalty': 'l2', 'mdl__solver': 'lbfgs'} 

Best Scores : 
 76.3415 %


### 3.2 KNeighborsClassifier

In [78]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',KNeighborsClassifier())]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__n_neighbors': [1,5,9,13,17],
    'mdl__weights':['uniform', 'distance'],
    'mdl__metric':['euclidean', 'manhattan', 'minkowski'],
    'mdl__algorithm':['brute','auto']
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__n_neighbors': [1, 5, 9, 13, 17], 'mdl__weights': ['uniform', 'distance'], 'mdl__metric': ['euclidean', 'manhattan', 'minkowski'], 'mdl__algorithm': ['brute', 'auto']} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 60 candidates, totalling 600 fits
Best Params : 
 {'mdl__algorithm': 'brute', 'mdl__metric': 'euclidean', 'mdl__n_neighbors': 17, 'mdl__weights': 'distance'} 

Best Scores : 
 76.1789 %


### 3.3 SVC

In [79]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',SVC(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__C':[1.0, 0.1, 0.01],
    'mdl__kernel':['linear','poly', 'rbf', 'sigmoid'],
    'mdl__gamma':['scale','auto'],
    'mdl__degree':[2,3,4,5]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__C': [1.0, 0.1, 0.01], 'mdl__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'mdl__gamma': ['scale', 'auto'], 'mdl__degree': [2, 3, 4, 5]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 96 candidates, totalling 960 fits
Best Params : 
 {'mdl__C': 1.0, 'mdl__degree': 2, 'mdl__gamma': 'auto', 'mdl__kernel': 'rbf'} 

Best Scores : 
 76.5854 %


### 3.4 DecisionTreeClassifier

In [80]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',DecisionTreeClassifier(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__criterion':['entropy','gini'],
    'mdl__max_depth':[1,3,5],
    'mdl__splitter':['best','random'],
    'mdl__min_samples_split':[0.3,0.5,0.8],
    'mdl__min_impurity_decrease':[0.0,0.2,0.5]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__criterion': ['entropy', 'gini'], 'mdl__max_depth': [1, 3, 5], 'mdl__splitter': ['best', 'random'], 'mdl__min_samples_split': [0.3, 0.5, 0.8], 'mdl__min_impurity_decrease': [0.0, 0.2, 0.5]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
Best Params : 
 {'mdl__criterion': 'entropy', 'mdl__max_depth': 5, 'mdl__min_impurity_decrease': 0.0, 'mdl__min_samples_split': 0.3, 'mdl__splitter': 'random'} 

Best Scores : 
 72.3577 %


### 3.5 BaggingClassifier

In [81]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',BaggingClassifier(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__n_estimators':[50,100,200],
    'mdl__estimator':[KNeighborsClassifier(),SVC(),LogisticRegression(),DecisionTreeClassifier()],
    'mdl__max_samples':[0.25,0.5],
    'mdl__bootstrap':[True,False],
    'mdl__oob_score':[True]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__n_estimators': [50, 100, 200], 'mdl__estimator': [KNeighborsClassifier(), SVC(), LogisticRegression(), DecisionTreeClassifier()], 'mdl__max_samples': [0.25, 0.5], 'mdl__bootstrap': [True, False], 'mdl__oob_score': [True]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 48 candidates, totalling 480 fits
Best Params : 
 {'mdl__bootstrap': True, 'mdl__estimator': DecisionTreeClassifier(), 'mdl__max_samples': 0.5, 'mdl__n_estimators': 200, 'mdl__oob_score': True} 

Best Scores : 
 76.5854 %


### 3.6 RandomForestClassifier

In [82]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',RandomForestClassifier(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__n_estimators':[100, 200, 300],
    'mdl__criterion': ['entropy','gini'],
    'mdl__max_depth': [1,3,5],
    'mdl__max_samples': [0.25,0.50],
    'mdl__bootstrap':[True,False],
    'mdl__oob_score':[True]
    # 'mdl__min_samples_split':[0.3,0.5,0.8],
    # 'mdl__min_impurity_decrease':[0.0,0.2,0.5]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__n_estimators': [100, 200, 300], 'mdl__criterion': ['entropy', 'gini'], 'mdl__max_depth': [1, 3, 5], 'mdl__max_samples': [0.25, 0.5], 'mdl__bootstrap': [True, False], 'mdl__oob_score': [True]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best Params : 
 {'mdl__bootstrap': True, 'mdl__criterion': 'entropy', 'mdl__max_depth': 5, 'mdl__max_samples': 0.5, 'mdl__n_estimators': 200, 'mdl__oob_score': True} 

Best Scores : 
 76.6667 %


### 3.7 GradientBoostingClassifier

In [83]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',GradientBoostingClassifier(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__learning_rate':[0.001,0.01,0.1],
    'mdl__n_estimators':[50,100,200],
    'mdl__max_depth':[1,3,5],
    'mdl__subsample':[0.5,0.75]
    # 'mdl__min_samples_split':[0.3,0.5,0.8],
    # 'mdl__min_impurity_decrease':[0.0,0.2,0.5]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__learning_rate': [0.001, 0.01, 0.1], 'mdl__n_estimators': [50, 100, 200], 'mdl__max_depth': [1, 3, 5], 'mdl__subsample': [0.5, 0.75]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 54 candidates, totalling 540 fits
Best Params : 
 {'mdl__learning_rate': 0.01, 'mdl__max_depth': 5, 'mdl__n_estimators': 200, 'mdl__subsample': 0.5} 

Best Scores : 
 76.8293 %


### 3.8 HistGradientBoostingClassifier

In [84]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',HistGradientBoostingClassifier(random_state=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__learning_rate':[0.01,0.1,0.5],
    'mdl__max_depth':[1,3,5],
    'mdl__max_iter':[50,100,200],
    'mdl__max_leaf_nodes':[15,20,25]
    #'mdl__l2_regularization':[0,0.1,0.5]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__learning_rate': [0.01, 0.1, 0.5], 'mdl__max_depth': [1, 3, 5], 'mdl__max_iter': [50, 100, 200], 'mdl__max_leaf_nodes': [15, 20, 25]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 81 candidates, totalling 810 fits
Best Params : 
 {'mdl__learning_rate': 0.1, 'mdl__max_depth': 5, 'mdl__max_iter': 200, 'mdl__max_leaf_nodes': 25} 

Best Scores : 
 76.0163 %


### 3.9 XGBClassifier

In [None]:
#! pip install xgboost

In [85]:
# ML Pipeline ------------------------------------------------------------------------------------------------
steps = [('ct_preproc',ct_preproc),
         ('skb',skb),
         ('mdl',XGBClassifier(objective='binary:logistic', eval_metric='auc', seed=46))]
pipe = Pipeline(steps)


# Model Param Grid -------------------------------------------------------------------------------------------
param_grid = {
    'mdl__eta':[0.01,0.1,0.5],
    'mdl__n_estimators':[50,100,200],
    'mdl__max_depth':[3,5,7],
    'mdl__gamma':[0.01,0.05,0.1],
    'mdl__subsample':[0.5,0.75]
    # 'mdl__lambda':[0.1,0.25,0.5],
    # 'mdl__alpha':[0.1,0.25,0.5],
    # 'mdl__colsample_bytree':[0.25,0.5,0.75]
    }
print(f'Param Grid : \n {param_grid} \n')


# GridSearchCV Configuration ---------------------------------------------------------------------------------
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
print(f'KFold Setting : \n {kfold} \n')
gscv = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', n_jobs=-1, verbose=4)
gscv.fit(Xtr, ytr)


# GridSearch Results -----------------------------------------------------------------------------------------
print(f'Best Params : \n {gscv.best_params_} \n')
print(f'Best Scores : \n {round(gscv.best_score_*100,4)} %')

Param Grid : 
 {'mdl__eta': [0.01, 0.1, 0.5], 'mdl__n_estimators': [50, 100, 200], 'mdl__max_depth': [3, 5, 7], 'mdl__gamma': [0.01, 0.05, 0.1], 'mdl__subsample': [0.5, 0.75]} 

KFold Setting : 
 StratifiedKFold(n_splits=10, random_state=46, shuffle=True) 

Fitting 10 folds for each of 162 candidates, totalling 1620 fits
Best Params : 
 {'mdl__eta': 0.1, 'mdl__gamma': 0.01, 'mdl__max_depth': 5, 'mdl__n_estimators': 50, 'mdl__subsample': 0.5} 

Best Scores : 
 77.0732 %
