# Wine Quality Machine Learning (Using Pycaret)

- This is an experiment notebook using https://pycaret.org library to explore and create a machine learning model for the wine quality dataset.


In [52]:
import pandas as pd
import numpy as np

In [53]:
dataset = pd.read_csv("data/WineQT.csv")

In [54]:
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


In [55]:
dataset.shape

(1143, 13)

**Note**: From the initial view of the dataset, this is a classification problem.
We want to determine the quality of the wine based from the different features like fixed acidiity, volatile acidity, etc.

In [56]:
dataset = dataset.drop(columns=["Id"])

**Note**: The Id column is not needed so we drop it from the dataframe

In [57]:
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6


In [58]:
dataset.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [59]:
dataset["quality"].value_counts()

5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64

**Note**: The values for quality, ranges from 3 to 8, for this experiment we add a new column "a_quality" (short for adjusted quality).
    We classify anything 5 and below as good, 6 as better.

In [60]:
dataset["a_quality"] = ["good" if x <=5 else "better" for x in dataset['quality']]

In [61]:
dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,a_quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,good
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,good
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,good
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,better
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,better
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,better
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,good
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,better


In [62]:
dataset["a_quality"].value_counts()

better    621
good      522
Name: a_quality, dtype: int64

In [63]:
dataset.isnull().values.any()

False

In [64]:
dataset = dataset.drop(columns=["quality"])

**Note**: We can leave the quality column and set to ignore it, but for clarity let us just remove it entirely

In [65]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  a_quality             1143 non-null   object 
dtypes: float64(11), object(1)
memory usage: 107.3+ KB


**Note**: We now use the pycaret classification library.

In [66]:
from pycaret.classification import *

In [67]:
data_1 = dataset.sample(frac=0.90, random_state=1)
data_2 = dataset.drop(data_1.index)

data_1.reset_index(drop=True, inplace=True)
data_2.reset_index(drop=True, inplace=True)

print('Data 1: ' + str(data_1.shape))
print('Data 2: ' + str(data_2.shape))

Data 1: (1029, 12)
Data 2: (114, 12)


**Note**: This is just dividing our data for training and testing and a holdout data.
- data_1 - will be the train and test data (80% will be used for training, and 20% will be used for testing)
- data_2 - will serve as the holdout data

In [68]:
exp_clf101 = setup(data = data_1, target = 'a_quality', session_id=1,
                   train_size=0.8, remove_outliers=True,
                   create_clusters=True, polynomial_features=True,trigonometry_features=True,
                  fix_imbalance=True, normalize=True, normalize_method="maxabs")

Unnamed: 0,Description,Value
0,session_id,1
1,Target,a_quality
2,Target Type,Binary
3,Label Encoded,"better: 0, good: 1"
4,Original Data,"(1029, 12)"
5,Missing Values,0
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,0
9,High Cardinality Features,0


**Note**: From the documentation https://pycaret.readthedocs.io/en/latest/api/classification.html
- normalize - transforms the numeric features by scaling them to a given range
- normalize_method - maxabs: scales and translates each feature individually such that the maximal absolute value of each feature will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity.
- remove_outliers - outliers from the training data are removed using the Singular Value Decomposition.
- create_clusters - an additional feature is created in training dataset
- polynomial_features - new features are derived using existing numeric features
- trigonometry_features - new features are derived using existing numeric features
- fix_imbalance - When set to True, SMOTE (Synthetic Minority Over-sampling Technique) is applied by default


In [69]:
eda()

Shape of your Data Set loaded: (1029, 12)
############## C L A S S I F Y I N G  V A R I A B L E S  ####################
Classifying variables in data set...
    11 Predictors classified...
        No variables removed since no ID or low-information variables found in data set

################ Binary_Classification VISUALIZATION Started #####################


Time to run AutoViz (in seconds) = 4


**Note**: EDA - https://pycaret.readthedocs.io/en/latest/api/classification.html?highlight=eda#pycaret.classification.eda
- this function generates AutoEDA using AutoVIZ library.

In [71]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.7824,0.8535,0.7527,0.7623,0.7557,0.5596,0.5617,0.486
et,Extra Trees Classifier,0.7811,0.8593,0.7384,0.7696,0.7509,0.5559,0.5594,0.445
xgboost,Extreme Gradient Boosting,0.7773,0.8421,0.7584,0.7531,0.7539,0.5505,0.5529,0.732
catboost,CatBoost Classifier,0.776,0.849,0.7498,0.7529,0.7496,0.547,0.549,5.105
lightgbm,Light Gradient Boosting Machine,0.7683,0.845,0.747,0.743,0.743,0.5321,0.5344,0.165
gbc,Gradient Boosting Classifier,0.767,0.8293,0.7441,0.7421,0.7416,0.5295,0.5313,0.549
lda,Linear Discriminant Analysis,0.749,0.837,0.7725,0.7029,0.734,0.4974,0.5023,0.039
ada,Ada Boost Classifier,0.7324,0.8054,0.7299,0.6932,0.71,0.4618,0.4637,0.262
ridge,Ridge Classifier,0.7286,0.0,0.7784,0.6737,0.7207,0.4594,0.4662,0.036
lr,Logistic Regression,0.7247,0.8199,0.7812,0.6682,0.7187,0.4524,0.4601,0.056


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

**Note**: - compare_models - https://pycaret.readthedocs.io/en/latest/api/classification.html?highlight=compare_models#pycaret.classification.compare_models
- This function trains and evaluates performance of all estimators available in the model library using cross validation

In [72]:
rf = create_model("rf")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7215,0.8269,0.6944,0.6944,0.6944,0.4386,0.4386
1,0.8333,0.8816,0.8611,0.7949,0.8267,0.6667,0.6686
2,0.7692,0.8266,0.6571,0.7931,0.7188,0.526,0.5327
3,0.7821,0.8458,0.6857,0.8,0.7385,0.5535,0.5584
4,0.7692,0.8558,0.7714,0.7297,0.75,0.536,0.5367
5,0.8333,0.8973,0.8286,0.8056,0.8169,0.664,0.6642
6,0.8462,0.9196,0.8857,0.7949,0.8378,0.6923,0.696
7,0.7692,0.8465,0.7143,0.7576,0.7353,0.5311,0.5318
8,0.7436,0.8472,0.7429,0.7027,0.7222,0.4845,0.4851
9,0.7564,0.788,0.6857,0.75,0.7164,0.5037,0.5052


**Note**: create_model - This function trains and evaluates the performance of a given estimator using cross validation.

In [73]:
tuned_rf = tune_model(rf)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7089,0.8114,0.75,0.6585,0.7013,0.4197,0.4231
1,0.8077,0.8479,0.8889,0.7442,0.8101,0.6184,0.6284
2,0.7436,0.7821,0.5714,0.8,0.6667,0.4676,0.4851
3,0.7692,0.8166,0.6857,0.7742,0.7273,0.5285,0.5315
4,0.7436,0.8439,0.8,0.6829,0.7368,0.4899,0.4957
5,0.8333,0.8824,0.8286,0.8056,0.8169,0.664,0.6642
6,0.7564,0.899,0.8571,0.6818,0.7595,0.5191,0.5332
7,0.7821,0.8379,0.8,0.7368,0.7671,0.563,0.5646
8,0.7564,0.8219,0.7143,0.7353,0.7246,0.5063,0.5065
9,0.7179,0.7581,0.7143,0.6757,0.6944,0.4329,0.4335


**Note**: tune_model - tunes the hyperparameters of a given estimator.

In [74]:
plot_model(tuned_rf, 'confusion_matrix')

In [75]:
evaluate_model(tuned_rf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [76]:
predict_model(tuned_rf)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.733,0.8521,0.7041,0.7263,0.715,0.464,0.4642


Unnamed: 0,fixed acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,...,cos(volatile acidity),cos(total sulfur dioxide),cos(alcohol),data_cluster_0,data_cluster_1,data_cluster_2,data_cluster_3,a_quality,Label,Score
0,0.477987,0.11,0.167742,0.129296,0.181818,0.176259,0.993135,0.800499,0.285,0.671141,...,0.902543,0.300604,-0.839329,1.0,0.0,0.0,0.0,good,better,0.5045
1,0.452830,0.03,0.141935,0.130933,0.257576,0.316547,0.993783,0.880299,0.290,0.657718,...,0.813880,0.999412,-0.930712,1.0,0.0,0.0,0.0,better,good,0.6405
2,0.635220,0.34,0.154839,0.139116,0.075758,0.061151,0.993165,0.790524,0.325,0.711409,...,0.939081,-0.275174,-0.385456,0.0,0.0,0.0,1.0,better,better,0.8654
3,0.490566,0.14,0.154839,0.140753,0.045455,0.053957,0.993833,0.852868,0.300,0.724832,...,0.831314,-0.759718,-0.194389,0.0,0.0,0.0,1.0,better,good,0.5688
4,0.503145,0.37,0.058065,0.080196,0.545455,0.392086,0.986430,0.720698,0.220,0.852349,...,0.990970,-0.577025,0.991389,0.0,1.0,0.0,0.0,better,better,0.9303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,0.433962,0.03,0.148387,0.088380,0.106061,0.057554,0.991422,0.860349,0.315,0.771812,...,0.743818,-0.957697,0.483453,0.0,1.0,0.0,0.0,better,better,0.8457
202,0.540881,0.24,0.425806,0.191489,0.469697,0.482014,0.997718,0.827930,0.535,0.624161,...,0.753921,-0.463847,-0.992530,0.0,0.0,0.0,1.0,good,good,0.8589
203,0.773585,0.49,0.141935,0.145663,0.075758,0.050360,0.996523,0.795511,0.220,0.644295,...,0.883939,0.136743,-0.984990,1.0,0.0,0.0,0.0,good,good,0.8348
204,0.383648,0.10,0.180645,0.132570,0.196970,0.100719,0.992647,0.897756,0.330,0.684564,...,0.767128,-0.962644,-0.714485,0.0,0.0,0.0,1.0,good,good,0.5449


In [77]:
final_rf = finalize_model(tuned_rf)

In [78]:
unseen_predictions = predict_model(final_rf, data=data_2)
unseen_predictions.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,a_quality,Label,Score
0,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,good,better,0.5298
1,7.1,0.71,0.0,1.9,0.08,14.0,35.0,0.9972,3.47,0.55,9.4,good,good,0.8398
2,7.8,0.645,0.0,5.5,0.086,5.0,18.0,0.9986,3.4,0.55,9.6,better,good,0.8113
3,7.8,0.56,0.19,1.8,0.104,12.0,47.0,0.9964,3.19,0.93,9.5,good,good,0.6862
4,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,better,better,0.9379


In [79]:
save_model(final_rf,'Final rf')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='a_quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_st...
                  RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                         class_weight={}, criterion='entropy',
                                         max_depth=6, max_features='sqrt',
                                         max_le

In [80]:
saved_final_rf = load_model('Final rf')

Transformation Pipeline and Model Successfully Loaded


In [81]:
new_prediction = predict_model(saved_final_rf, data=data_2)

In [82]:
new_prediction.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,a_quality,Label,Score
0,7.9,0.43,0.21,1.6,0.106,10.0,37.0,0.9966,3.17,0.91,9.5,good,better,0.5298
1,7.1,0.71,0.0,1.9,0.08,14.0,35.0,0.9972,3.47,0.55,9.4,good,good,0.8398
2,7.8,0.645,0.0,5.5,0.086,5.0,18.0,0.9986,3.4,0.55,9.6,better,good,0.8113
3,7.8,0.56,0.19,1.8,0.104,12.0,47.0,0.9964,3.19,0.93,9.5,good,good,0.6862
4,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,better,better,0.9379
