# Wine Quality Dataset - multi-class classification and regression problem

# created by Nikolay K. MTK: 673010

This notebook involves predicting the quality of white wines on a scale given chemical measures of each wine.

Steps we will take:
    1. Standard library import and get the data ready
    2. Choose the right estimator or algorithm for our problem
    3. Fit the model/algorithm and use it to make predictions on our data
    4. Evaluating a model
    5. Improve a model
    6. Save and load a trained model
    7. Putting it all together.

In [52]:
# 1. Standard library import and get the data ready
import pandas as pd
import numpy as np
wine_quality = pd.read_csv("data/winequality-white.csv", ";")
wine_quality

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6
1,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6
2,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6
3,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6
4,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.900000,6
5,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.100000,6
6,6.2,0.320,0.16,7.00,0.045,30.0,136.0,0.99490,3.18,0.47,9.600000,6
7,7.0,0.270,0.36,20.70,0.045,45.0,170.0,1.00100,3.00,0.45,8.800000,6
8,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.500000,6
9,8.1,0.220,0.43,1.50,0.044,28.0,129.0,0.99380,3.22,0.45,11.000000,6


In [29]:
## Get the data ready 

# Create X (features matrix)
X = wine_quality.drop("quality", axis=1)

# Create y (labels)
y = wine_quality["quality"]

In [30]:
# 2. Choose the right model or hyperparamaters
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)

# We will keep the default hyperparameters
model.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [46]:
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [47]:
model.fit(X_train, y_train);

In [33]:
X_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
3022,8.00,0.740,0.21,4.00,0.050,24.0,133.0,0.99418,3.06,0.38,9.7
943,6.90,0.480,0.36,3.50,0.030,31.0,135.0,0.99040,3.14,0.38,12.2
4570,5.80,0.290,0.15,1.10,0.029,12.0,83.0,0.98980,3.30,0.40,11.4
3865,6.00,0.310,0.38,4.80,0.040,41.0,101.0,0.98968,3.24,0.56,13.1
866,6.90,0.180,0.36,1.30,0.036,40.0,117.0,0.99340,3.27,0.95,9.5
1336,7.80,0.290,0.36,7.00,0.042,38.0,161.0,0.99410,3.26,0.37,11.2
710,6.10,0.250,0.24,12.10,0.046,51.0,172.0,0.99800,3.35,0.45,9.5
2380,6.40,0.270,0.19,1.90,0.085,21.0,196.0,0.99516,3.49,0.64,9.5
3411,6.00,0.140,0.17,5.60,0.036,37.0,127.0,0.99373,3.05,0.57,9.8
4137,7.40,0.310,0.26,8.60,0.048,47.0,206.0,0.99640,3.26,0.36,9.1


In [38]:
# Make a prediction 

y_label = model.predict(np.array([0, 2, 3, 4]))

ValueError: Expected 2D array, got 1D array instead:
array=[0. 2. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [39]:
y_preds = model.predict(X_test)
y_preds

array([6, 5, 7, 5, 5, 6, 7, 7, 5, 5, 7, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6,
       6, 6, 6, 6, 6, 7, 6, 6, 6, 5, 6, 6, 6, 6, 7, 6, 7, 6, 6, 5, 6, 6,
       6, 6, 4, 5, 6, 5, 5, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6, 7, 6,
       7, 6, 6, 7, 6, 5, 6, 6, 5, 5, 6, 5, 6, 7, 6, 6, 6, 5, 6, 7, 6, 6,
       5, 6, 7, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 6, 6, 6, 7,
       6, 6, 5, 6, 6, 7, 6, 6, 5, 6, 6, 5, 6, 7, 5, 5, 6, 6, 6, 6, 7, 5,
       5, 6, 6, 5, 6, 6, 6, 6, 6, 7, 6, 7, 6, 5, 5, 6, 5, 7, 5, 6, 7, 6,
       7, 7, 7, 6, 6, 5, 7, 5, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6,
       6, 7, 5, 6, 6, 7, 5, 5, 7, 5, 7, 6, 7, 6, 6, 6, 5, 6, 6, 6, 6, 7,
       7, 5, 6, 6, 6, 7, 6, 6, 5, 6, 6, 5, 6, 7, 6, 5, 5, 6, 6, 6, 6, 8,
       5, 6, 5, 7, 6, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 7, 6, 6,
       7, 6, 5, 5, 6, 5, 6, 6, 6, 5, 5, 5, 4, 7, 7, 5, 5, 5, 6, 7, 8, 6,
       7, 6, 5, 6, 5, 6, 6, 5, 6, 6, 5, 6, 7, 7, 6, 5, 5, 6, 6, 6, 6, 5,
       7, 7, 5, 6, 7, 6, 6, 8, 6, 6, 6, 5, 6, 5, 6,

In [40]:
# Evaluate the model on the training data and test data
model.score(X_train, y_train)

1.0

In [41]:
model.score(X_test, y_test)

0.6928571428571428

In [42]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

             precision    recall  f1-score   support

          3       0.00      0.00      0.00         5
          4       0.50      0.12      0.19        25
          5       0.73      0.70      0.71       283
          6       0.66      0.82      0.73       448
          7       0.76      0.53      0.62       187
          8       0.80      0.38      0.51        32

avg / total       0.70      0.69      0.68       980



  'precision', 'predicted', average, warn_for)


In [43]:
confusion_matrix(y_test, y_preds)

array([[  0,   0,   3,   2,   0,   0],
       [  0,   3,   9,  13,   0,   0],
       [  0,   3, 198,  82,   0,   0],
       [  0,   0,  57, 367,  23,   1],
       [  0,   0,   4,  82,  99,   2],
       [  0,   0,   1,  10,   9,  12]])

In [44]:
accuracy_score(y_test, y_preds)

0.6928571428571428

In [49]:
# 5. Improve the model
# Try a different amount of n_estimators 
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {model.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 64.29%

Trying model with 20 estimators...
Model accuracy on test set: 67.86%

Trying model with 30 estimators...
Model accuracy on test set: 69.08%

Trying model with 40 estimators...
Model accuracy on test set: 68.88%

Trying model with 50 estimators...
Model accuracy on test set: 69.49%

Trying model with 60 estimators...
Model accuracy on test set: 70.41%

Trying model with 70 estimators...
Model accuracy on test set: 68.98%

Trying model with 80 estimators...
Model accuracy on test set: 69.90%

Trying model with 90 estimators...
Model accuracy on test set: 71.02%



In [53]:
# 6. Save model and load it
import pickle 

pickle.dump(model, open("random_forest_model_1.pkl", "wb"))

In [51]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.score(X_test, y_test)

0.710204081632653