# Wine Quality Machine Learning (Using Pycaret)

- This is an experiment notebook that uses the [PyCaret](https://pycaret.org) library to explore the wine quality dataset and build a machine learning model for it.


In [17]:
import pandas as pd
import numpy as np

In [18]:
# Read the wine quality CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer="data/WineQT.csv")

In [19]:
# Show the freshly loaded frame; pandas truncates the middle rows in the rendered output.
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


In [20]:
# (rows, columns) of the loaded frame.
dataset.shape

(1143, 13)

**Note**: From the initial view of the dataset, this is a classification problem.
We want to determine the quality of the wine based on features such as fixed acidity, volatile acidity, etc.

In [21]:
# The Id column is not used as a feature, so remove it before modelling.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because "Id" has already been dropped.
dataset = dataset.drop(columns=["Id"], errors="ignore")

**Note**: The Id column is not needed, so we drop it from the dataframe.

In [22]:
# Re-display the frame to confirm the Id column is gone.
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
dataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [24]:
# Number of wines per original quality score, most frequent first.
dataset.loc[:, "quality"].value_counts()

5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64

**Note**: The values for quality range from 3 to 8. For this experiment we add a new column "a_quality" (short for adjusted quality).
    We classify anything 5 and below as "good" and anything 6 and above as "elite", which turns this into a binary classification problem.

In [25]:
# Collapse the numeric quality score into a binary target:
#   quality <= 5 -> "good", anything higher -> "elite".
# np.where evaluates the whole column at once instead of looping over rows in Python.
dataset["a_quality"] = np.where(dataset["quality"] <= 5, "good", "elite")

In [26]:
# Display the frame again to check the new a_quality column next to the original quality score.
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,a_quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,good
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,good
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,good
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,elite
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,elite
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,elite
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,good
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,elite


In [27]:
# Class balance of the derived binary target.
dataset.loc[:, "a_quality"].value_counts()

elite    621
good     522
Name: a_quality, dtype: int64

In [28]:
# True if any cell in the frame is missing; False means no imputation is needed.
dataset.isna().to_numpy().any()

False

In [29]:
# Remove the original score now that a_quality has been derived from it;
# leaving it in would leak the target into the features.
dataset = dataset.drop(labels=["quality"], axis="columns")

In [30]:
# Column names, non-null counts and dtypes of the frame that will be passed to PyCaret.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  a_quality             1143 non-null   object 
dtypes: float64(11), object(1)
memory usage: 107.3+ KB


**Note**: We now use the pycaret classification library.

In [31]:
from pycaret.classification import *

In [32]:
# Hold back 10% of the rows as "unseen" data (data_2); the remaining 90% (data_1)
# is what PyCaret gets for training and validation. random_state pins the sample.
data_1 = dataset.sample(frac=0.90, random_state=1)

# data_2 is everything not sampled above, so it must be built while data_1
# still carries the original row labels.
data_2 = dataset.drop(index=data_1.index).reset_index(drop=True)
data_1 = data_1.reset_index(drop=True)

print(f"Data 1: {data_1.shape}")
print(f"Data 2: {data_2.shape}")

Data 1: (1029, 12)
Data 2: (114, 12)


In [33]:
# Initialise the PyCaret classification experiment on the 90% modelling split.
exp_clf101 = setup(
    data=data_1,
    target="a_quality",
    session_id=1,               # fixed seed for the experiment
    train_size=0.8,             # 80% train, 20% hold-out inside data_1
    create_clusters=True,
    polynomial_features=True,
    fix_imbalance=True,
    normalize=True,
    normalize_method="maxabs",
)

Unnamed: 0,Description,Value
0,session_id,1
1,Target,a_quality
2,Target Type,Binary
3,Label Encoded,"elite: 0, good: 1"
4,Original Data,"(1029, 12)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [35]:
# Automated exploratory data analysis of the experiment data; the log below shows it is produced by AutoViz.
eda()

Shape of your Data Set loaded: (1029, 12)
############## C L A S S I F Y I N G  V A R I A B L E S  ####################
Classifying variables in data set...
    11 Predictors classified...
        No variables removed since no ID or low-information variables found in data set

################ Binary_Classification VISUALIZATION Started #####################


Time to run AutoViz (in seconds) = 4


In [36]:
# Cross-validate the available classifiers and list them in a score grid
# (the table below is ordered by Accuracy). The cell's output is the top-ranked model object.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7862,0.846,0.7789,0.7631,0.77,0.5703,0.5717,3.101
et,Extra Trees Classifier,0.7813,0.8533,0.7632,0.7639,0.7626,0.5599,0.561,0.344
rf,Random Forest Classifier,0.7789,0.8425,0.7553,0.764,0.758,0.5546,0.5564,0.448
lightgbm,Light Gradient Boosting Machine,0.7741,0.8489,0.7684,0.754,0.758,0.5466,0.5503,0.121
xgboost,Extreme Gradient Boosting,0.7704,0.8399,0.7605,0.7534,0.7542,0.5391,0.5423,0.546
gbc,Gradient Boosting Classifier,0.757,0.8253,0.7447,0.7348,0.7378,0.5116,0.514,0.343
svm,SVM - Linear Kernel,0.746,0.0,0.7921,0.7155,0.7372,0.494,0.5137,0.037
ridge,Ridge Classifier,0.7387,0.0,0.7632,0.7002,0.7293,0.4777,0.4807,0.027
lr,Logistic Regression,0.7375,0.7981,0.7605,0.6991,0.7276,0.4751,0.4778,0.055
lda,Linear Discriminant Analysis,0.7351,0.8167,0.7658,0.6938,0.7267,0.4709,0.475,0.03


<catboost.core.CatBoostClassifier at 0x7fd05df89340>

In [37]:
# Train the top model from the comparison (CatBoost); the grid shows one row of metrics per CV fold.
catboost = create_model(estimator="catboost")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.759,0.8222,0.7632,0.725,0.7436,0.5166,0.5172
1,0.7831,0.8602,0.7895,0.75,0.7692,0.5649,0.5656
2,0.759,0.8228,0.6579,0.7812,0.7143,0.5086,0.5142
3,0.7927,0.8355,0.7632,0.7838,0.7733,0.5824,0.5826
4,0.8049,0.8499,0.8421,0.7619,0.8,0.6105,0.6134
5,0.8049,0.8523,0.8158,0.775,0.7949,0.6091,0.6098
6,0.8537,0.9348,0.8684,0.825,0.8462,0.7068,0.7076
7,0.7927,0.8439,0.8158,0.7561,0.7848,0.5854,0.5869
8,0.7683,0.8559,0.7632,0.7436,0.7532,0.5349,0.5351
9,0.7439,0.7829,0.7105,0.7297,0.72,0.4841,0.4843


In [38]:
# Hyperparameter search on the CatBoost model using PyCaret's default tuning settings.
tuned_catboost = tune_model(estimator=catboost)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7229,0.7953,0.7632,0.6744,0.716,0.4475,0.4507
1,0.8072,0.8608,0.8421,0.7619,0.8,0.6148,0.6177
2,0.7711,0.8205,0.6842,0.7879,0.7324,0.5341,0.5382
3,0.7927,0.8272,0.7632,0.7838,0.7733,0.5824,0.5826
4,0.7805,0.8541,0.8158,0.7381,0.775,0.5618,0.5644
5,0.8293,0.8786,0.8158,0.8158,0.8158,0.6567,0.6567
6,0.8293,0.9139,0.8421,0.8,0.8205,0.6579,0.6587
7,0.7805,0.8594,0.7895,0.75,0.7692,0.5602,0.5609
8,0.7805,0.8505,0.7895,0.75,0.7692,0.5602,0.5609
9,0.7683,0.7889,0.7105,0.7714,0.7397,0.5316,0.533


In [39]:
# Draw PyCaret's default diagnostic plot for the tuned model.
plot_model(estimator=tuned_catboost)

In [41]:
# Interactive widget for browsing the available evaluation plots of the tuned model.
evaluate_model(estimator=tuned_catboost)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [42]:
# With no `data` argument, predict_model scores the hold-out split reserved by setup().
predict_model(estimator=tuned_catboost)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.7816,0.8611,0.7551,0.7789,0.7668,0.5615,0.5617


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,volatile acidity_Power2,total sulfur dioxide_Power2,data_cluster_0,a_quality,Label,Score
0,0.477987,0.291139,0.11,0.167742,0.129296,0.181818,0.176259,0.993135,0.800499,0.285,0.671141,0.084762,0.031067,0.0,good,elite,0.6772
1,0.452830,0.398734,0.03,0.141935,0.130933,0.257576,0.316547,0.993783,0.880299,0.290,0.657718,0.158989,0.100202,0.0,elite,good,0.7840
2,0.635220,0.234177,0.34,0.154839,0.139116,0.075758,0.061151,0.993165,0.790524,0.325,0.711409,0.054839,0.003739,1.0,elite,elite,0.8339
3,0.490566,0.379747,0.14,0.154839,0.140753,0.045455,0.053957,0.993833,0.852868,0.300,0.724832,0.144208,0.002911,0.0,elite,elite,0.6235
4,0.503145,0.113924,0.37,0.058065,0.080196,0.545455,0.392086,0.986430,0.720698,0.220,0.852349,0.012979,0.153732,1.0,elite,elite,0.7494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,0.433962,0.468354,0.03,0.148387,0.088380,0.106061,0.057554,0.991422,0.860349,0.315,0.771812,0.219356,0.003312,0.0,elite,elite,0.9011
202,0.540881,0.458861,0.24,0.425806,0.191489,0.469697,0.482014,0.997718,0.827930,0.535,0.624161,0.210553,0.232338,0.0,good,good,0.9332
203,0.773585,0.316456,0.49,0.141935,0.145663,0.075758,0.050360,0.996523,0.795511,0.220,0.644295,0.100144,0.002536,1.0,good,good,0.9123
204,0.383648,0.446203,0.10,0.180645,0.132570,0.196970,0.100719,0.992647,0.897756,0.330,0.684564,0.199097,0.010144,0.0,good,good,0.6618


In [43]:
# Refit the tuned model on all of the data given to setup() (train + hold-out) before using it on new data.
final_catboost = finalize_model(estimator=tuned_catboost)

In [45]:
# Score the 10% of rows (data_2) that were set aside before setup() and never used for training.
unseen_predictions = predict_model(estimator=final_catboost, data=data_2)
unseen_predictions.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,a_quality,Label,Score
0,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,good,good,0.5834
1,7.1,0.71,0.0,1.9,0.08,14.0,35.0,0.9972,3.47,0.55,9.4,good,good,0.9018
2,7.8,0.645,0.0,5.5,0.086,5.0,18.0,0.9986,3.4,0.55,9.6,elite,good,0.8437
3,7.8,0.56,0.19,1.8,0.104,12.0,47.0,0.9964,3.19,0.93,9.5,good,good,0.8398
4,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,elite,elite,0.9611


In [46]:
# Persist the preprocessing pipeline together with the finalized model under the name 'Final catboost'.
save_model(model=final_catboost, model_name='Final catboost')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='a_quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_st...
                  Cluster_Entire_Data(check_clusters_upto=None, random_state=1,
                                      target_variable=None)),
                 ('dummy', Dummify(target='a_quality')),
                 ('fix_perfect', Remove_100(target='a_quality')),
         

In [47]:
# Reload the saved pipeline + model to check that the persisted artifact is usable.
saved_final_catboost = load_model(model_name='Final catboost')

Transformation Pipeline and Model Successfully Loaded


In [48]:
# Score the same unseen rows with the reloaded model; results should match unseen_predictions.
new_prediction = predict_model(estimator=saved_final_catboost, data=data_2)

In [49]:
# First five predictions from the reloaded model (Label = predicted class, Score = its probability).
new_prediction.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,a_quality,Label,Score
0,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,good,good,0.5834
1,7.1,0.71,0.0,1.9,0.08,14.0,35.0,0.9972,3.47,0.55,9.4,good,good,0.9018
2,7.8,0.645,0.0,5.5,0.086,5.0,18.0,0.9986,3.4,0.55,9.6,elite,good,0.8437
3,7.8,0.56,0.19,1.8,0.104,12.0,47.0,0.9964,3.19,0.93,9.5,good,good,0.8398
4,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,elite,elite,0.9611
