In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


In [3]:
# file 'winequality-white.csv' from Kaggle dataset
# Read with ';' as the field separator instead of pandas' default ','.
df = pd.read_csv('./winequality-white.csv', sep=';')
# Bare last expression: the loaded frame is shown as this cell's output.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [8]:
# rename columns to match original dataset
# One rename with a callable maps every label in a single pass (spaces -> dots),
# instead of issuing a separate in-place rename per column. Re-running the cell
# is a no-op because the renamed labels no longer contain spaces.
df = df.rename(columns=lambda name: name.replace(' ', '.'))
df.dtypes

((4898, 12),
 fixed.acidity           float64
 volatile.acidity        float64
 citric.acid             float64
 residual.sugar          float64
 chlorides               float64
 free.sulfur.dioxide     float64
 total.sulfur.dioxide    float64
 density                 float64
 pH                      float64
 sulphates               float64
 alcohol                 float64
 quality                   int64
 dtype: object)

In [392]:
# NOTE(review): this cell's execution count (392) is out of sequence, and its
# recorded output ((3918, 13), including an `id` column) disagrees with the
# (4898, 12) frame reported earlier in the notebook. The output looks stale;
# confirm by re-running from a fresh kernel.
df.shape, df.dtypes

((3918, 13),
 fixed.acidity           float64
 volatile.acidity        float64
 citric.acid             float64
 residual.sugar          float64
 chlorides               float64
 free.sulfur.dioxide     float64
 total.sulfur.dioxide    float64
 density                 float64
 pH                      float64
 sulphates               float64
 alcohol                 float64
 quality                   int64
 id                        int64
 dtype: object)

In [13]:
# Collapse the raw quality score into three classes:
#   score <= 4 -> 0, 5..7 -> 1, anything above 7 -> 2.
# np.select checks the conditions in order, so the second one only applies to
# scores that already failed the first.
# NOTE: this overwrites `quality` in place. Running the cell a second time
# would send every row to class 0, because the labels 0/1/2 are all <= 4.
raw_score = df['quality']
df['quality'] = np.select([raw_score <= 4, raw_score <= 7], [0, 1], default=2)
df['quality'].value_counts()

quality
1    4535
0     183
2     180
Name: count, dtype: int64

In [14]:
# Target vector: the binned quality class.
y = df['quality']
# Feature matrix: every remaining column.
X = df.drop(columns=['quality'])


# Model Building

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier



In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
def evaluate_model(model, X_test, y_test):
    """Print accuracy, confusion matrix and classification report for ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    The predictions produced by ``model.predict(X_test)``.
    """
    predictions = model.predict(X_test)
    # Each metric is called as metric(y_true, y_pred) and printed under its label.
    labelled_metrics = (
        ('Accuracy: ', accuracy_score),
        ('Confusion Matrix: ', confusion_matrix),
        ('Classification Report: ', classification_report),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_test, predictions))
    return predictions
    
def evaluate_train_test(model, X_train, y_train, X_test, y_test, include_train=False):
    """Report evaluation metrics for ``model`` on the test split, optionally on train too.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels. Only used when
        ``include_train`` is true.
    X_test, y_test : held-out features and labels.
    include_train : bool, default False
        When true, the same metrics are printed for the training split first,
        so the train and test scores can be compared to spot overfitting. The
        default keeps the original behaviour of reporting the test split only.

    Returns
    -------
    None. Results are printed via ``evaluate_model``.
    """
    if include_train:
        print('Train data')
        evaluate_model(model, X_train, y_train)
    print('Test data')
    evaluate_model(model, X_test, y_test)

In [18]:
# Model 1: standard scaling followed by a single decision tree (fixed seed).
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=42)),
]).fit(X_train, y_train)
evaluate_train_test(pipe, X_train, y_train, X_test, y_test)


Test data
Accuracy:  0.9163265306122449
Confusion Matrix:  [[ 14  15   1]
 [ 22 868  25]
 [  1  18  16]]
Classification Report:                precision    recall  f1-score   support

           0       0.38      0.47      0.42        30
           1       0.96      0.95      0.96       915
           2       0.38      0.46      0.42        35

    accuracy                           0.92       980
   macro avg       0.57      0.62      0.60       980
weighted avg       0.92      0.92      0.92       980



In [19]:
# Model 2: standard scaling followed by a random forest (fixed seed).
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
]).fit(X_train, y_train)
evaluate_train_test(pipe, X_train, y_train, X_test, y_test)

Test data
Accuracy:  0.9540816326530612
Confusion Matrix:  [[  8  22   0]
 [  2 912   1]
 [  0  20  15]]
Classification Report:                precision    recall  f1-score   support

           0       0.80      0.27      0.40        30
           1       0.96      1.00      0.98       915
           2       0.94      0.43      0.59        35

    accuracy                           0.95       980
   macro avg       0.90      0.56      0.65       980
weighted avg       0.95      0.95      0.94       980



In [20]:
# Model 3: standard scaling followed by an XGBoost classifier (fixed seed).
# `pipe` is rebound here, so this is the pipeline later cells see under that name.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(random_state=42)),
]).fit(X_train, y_train)
evaluate_train_test(pipe, X_train, y_train, X_test, y_test)


Test data
Accuracy:  0.95
Confusion Matrix:  [[ 11  19   0]
 [  7 904   4]
 [  0  19  16]]
Classification Report:                precision    recall  f1-score   support

           0       0.61      0.37      0.46        30
           1       0.96      0.99      0.97       915
           2       0.80      0.46      0.58        35

    accuracy                           0.95       980
   macro avg       0.79      0.60      0.67       980
weighted avg       0.94      0.95      0.94       980



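The Decision Tree reached about 92% test accuracy, while Random Forest and XGBoost both reached about 95%. Accuracy alone is misleading here because the classes are heavily imbalanced: 915 of the 980 test samples (about 93%) belong to class 1, and recall for the minority classes 0 and 2 stays below 0.5 for all three models.

| Model | Accuracy |
| --- | --- |
| Decision Tree | 0.92 |
| Random Forest | 0.95 |
| XGBoost | 0.95 |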

In [21]:
import joblib
# Persist the fitted pipeline to 'wine-quality.pkl' in the working directory.
# `pipe` is whatever was bound last — in notebook order, the XGBoost pipeline
# fitted in the preceding model cell.
joblib.dump(pipe, 'wine-quality.pkl')

['wine-quality.pkl']