In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline
import numpy as np

In [2]:
# Read the combined wine-quality CSV; the path is relative, so the file is
# resolved against the notebook's current working directory
data_path = Path("winequality_combined.csv")
wine_df = pd.read_csv(data_path)
wine_df.head()

Unnamed: 0,wine_type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,r,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,r,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,r,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,r,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,r,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Separate the target column ('quality') from the remaining feature columns
X = wine_df.drop('quality', axis=1)
y = wine_df.loc[:, 'quality']
# Preview the first five feature rows
X.head()

Unnamed: 0,wine_type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,r,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,r,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,r,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,r,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,r,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [4]:
# One-hot encode the categorical wine_type column.
# The encoding starts from wine_df (minus the target) rather than from X
# itself, so this cell is idempotent: re-running it no longer fails on an
# already-encoded X that has lost its wine_type column.
# drop_first=True keeps a single indicator column instead of one per category.
X = pd.get_dummies(
    wine_df.drop(columns='quality'),
    columns=['wine_type'],
    dtype='int',
    drop_first=True,
)
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,wine_type_w
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [5]:
# Splitting into Train and Test sets.
# No test_size/train_size is passed, so scikit-learn's default split proportion applies;
# random_state=10 fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

In [6]:
# Creating StandardScaler instance (still unfitted at this point; it is fitted
# on the training split in a later cell)
scaler = StandardScaler()

In [7]:
# Fitting the StandardScaler on the training features only, so that nothing
# from the test split influences the scaling parameters
X_scaler = scaler.fit(X_train)

In [8]:
# Apply the fitted scaler to both splits (train first, then test) in one pass
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

In [9]:
# Create a random forest classifier with 200 trees; random_state=10 seeds the
# forest's randomness so repeated runs build the same model
rf_model = RandomForestClassifier(n_estimators=200, random_state=10)

In [10]:
# Fitting the model on the scaled training features and the training quality labels
rf_model = rf_model.fit(X_train_scaled, y_train)

In [11]:
# Making predictions using the scaled testing data (the test labels are not
# used here; they are only compared against these predictions afterwards)
predictions = rf_model.predict(X_test_scaled)

In [12]:
# Calculating the confusion matrix
# The class labels are derived from the data (sorted union of true and
# predicted classes) instead of being hard-coded as 3..9, so the DataFrame's
# row/column names always match the matrix shape even if a split lacks a class.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(predictions)]))
cm = confusion_matrix(y_test, predictions, labels=labels)

# Rows are the actual classes, columns are the predicted classes
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in labels],
    columns=[f"Predicted {label}" for label in labels],
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [13]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports those metrics as 0 (the same values as before)
# without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, predictions, zero_division=0))

Confusion Matrix


Unnamed: 0,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8,Predicted 9
Actual 3,0,1,2,5,0,0,0
Actual 4,0,3,38,25,0,0,0
Actual 5,2,1,379,148,2,0,0
Actual 6,0,1,97,552,35,1,0
Actual 7,0,0,7,128,141,3,0
Actual 8,0,0,0,23,10,19,0
Actual 9,0,0,0,1,1,0,0


Accuracy Score : 0.6732307692307692
Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         8
           4       0.50      0.05      0.08        66
           5       0.72      0.71      0.72       532
           6       0.63      0.80      0.70       686
           7       0.75      0.51      0.60       279
           8       0.83      0.37      0.51        52
           9       0.00      0.00      0.00         2

    accuracy                           0.67      1625
   macro avg       0.49      0.35      0.37      1625
weighted avg       0.68      0.67      0.66      1625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Optimize the random forest model with n_estimators of 1000 (up from 200 in
# the first model); random_state is kept at 10 so only the tree count differs
rf_model_2 = RandomForestClassifier(n_estimators=1000, random_state=10)

In [15]:
# Fitting the second model on the same scaled training features and labels as the first
rf_model_2 = rf_model_2.fit(X_train_scaled, y_train)

In [16]:
# Making predictions with the second model using the same scaled testing data
predictions_2 = rf_model_2.predict(X_test_scaled)

In [17]:
# Calculating the confusion matrix
# The class labels are derived from the data (sorted union of true and
# predicted classes) instead of being hard-coded as 3..9, so the DataFrame's
# row/column names always match the matrix shape even if a split lacks a class.
labels_2 = np.unique(np.concatenate([np.asarray(y_test), np.asarray(predictions_2)]))
cm_2 = confusion_matrix(y_test, predictions_2, labels=labels_2)

# Rows are the actual classes, columns are the predicted classes
cm_df_2 = pd.DataFrame(
    cm_2,
    index=[f"Actual {label}" for label in labels_2],
    columns=[f"Predicted {label}" for label in labels_2],
)

# Calculating the accuracy score
acc_score_2 = accuracy_score(y_test, predictions_2)

In [18]:
# Displaying results
print("Confusion Matrix")
display(cm_df_2)
print(f"Accuracy Score : {acc_score_2}")
print("Classification Report")
# Some classes are never predicted, which makes their precision undefined.
# zero_division=0 reports those metrics as 0 (the same values as before)
# without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, predictions_2, zero_division=0))

Confusion Matrix


Unnamed: 0,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8,Predicted 9
Actual 3,0,1,1,6,0,0,0
Actual 4,0,4,37,25,0,0,0
Actual 5,2,1,379,148,2,0,0
Actual 6,0,1,96,560,28,1,0
Actual 7,0,0,8,126,142,3,0
Actual 8,0,0,0,22,11,19,0
Actual 9,0,0,0,1,1,0,0


Accuracy Score : 0.6793846153846154
Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         8
           4       0.57      0.06      0.11        66
           5       0.73      0.71      0.72       532
           6       0.63      0.82      0.71       686
           7       0.77      0.51      0.61       279
           8       0.83      0.37      0.51        52
           9       0.00      0.00      0.00         2

    accuracy                           0.68      1625
   macro avg       0.50      0.35      0.38      1625
weighted avg       0.69      0.68      0.66      1625



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
