In [31]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import EasyEnsembleClassifier

In [15]:
# Load the red-wine quality dataset from the project's resources folder.
wine_csv_path = Path("resources/winequality-red.csv")
red_df = pd.read_csv(wine_csv_path)

# Preview the first rows to confirm the columns parsed as expected.
red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [16]:
# Remove rows that are exact duplicates of an earlier row. The first occurrence
# is kept and the surviving rows retain their original index labels.
red_df = red_df.drop_duplicates(keep="first")
red_df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [17]:
# Count how many rows fall into each quality score to see how unevenly
# the target classes are represented.
red_df["quality"].value_counts()

5    577
6    535
7    167
4     53
8     17
3     10
Name: quality, dtype: int64

In [18]:
# Feature matrix: every column except the "quality" target.
# DataFrame.drop returns a new frame, so red_df itself is left untouched.
X = red_df.drop(columns="quality")
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4


In [19]:
# Define the target set.
# Series.ravel() is deprecated in recent pandas releases; Series.to_numpy()
# yields the same 1-D NumPy array of quality labels without the warning.
y = red_df["quality"].to_numpy()
y[:5]

array([5, 5, 5, 6, 5], dtype=int64)

In [20]:
# Partition features and target into training and testing sets.
# The fixed random_state makes the shuffle (and therefore the split) repeatable.
# NOTE(review): the split is not stratified, so the rare quality classes can end
# up with very few rows in the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [21]:
# Build the scaler and fit it on the training partition only, so the test
# partition does not influence the scaling parameters.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transform to both partitions (train first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [22]:
# Instantiate the random forest: 128 trees, seeded so results are repeatable.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [23]:
# Fitting the model on the scaled training features and their quality labels.
# The result is assigned back to rf_model, which also keeps the cell from
# echoing the estimator as output.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [24]:
# Making predictions using the testing data.
# The input must be the scaled test features, matching what the model was fit on.
predictions = rf_model.predict(X_test_scaled)
# Echo the predicted quality labels for a quick visual check.
predictions

array([5, 6, 6, 5, 5, 6, 5, 5, 5, 6, 5, 5, 6, 6, 6, 6, 5, 5, 7, 6, 6, 5,
       5, 7, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 5, 6, 5, 5, 5, 6, 6, 5, 6,
       6, 6, 6, 5, 6, 6, 6, 5, 5, 5, 5, 6, 5, 7, 5, 6, 6, 6, 6, 6, 5, 5,
       7, 6, 6, 5, 6, 5, 6, 6, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 5, 5, 6, 6,
       5, 5, 5, 5, 6, 5, 5, 5, 6, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5,
       6, 6, 6, 5, 5, 7, 7, 6, 6, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6, 6,
       5, 6, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 8, 5, 5, 7, 6, 5, 7, 5, 5,
       8, 5, 5, 6, 5, 6, 7, 5, 6, 5, 5, 6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6,
       6, 6, 5, 6, 6, 5, 7, 6, 5, 6, 6, 5, 5, 5, 6, 5, 7, 7, 6, 5, 5, 5,
       5, 5, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 5, 5, 5, 6, 6, 5, 5, 5,
       7, 5, 5, 5, 5, 5, 5, 5, 7, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 5, 5, 6,
       7, 6, 6, 7, 5, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 6, 5, 7, 5, 6,
       6, 7, 5, 6, 6, 5, 6, 5, 6, 5, 6, 5, 5, 6, 5, 5, 6, 6, 5, 7, 5, 7,
       6, 7, 6, 5, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 5,

In [25]:
# Calculating the confusion matrix.
# Derive the class labels from the data instead of hard-coding 3..8: this is the
# same sorted union of true and predicted labels that confusion_matrix uses by
# default, so the matrix is unchanged, but the row/column names can no longer
# drift out of sync with (or mismatch the shape of) the matrix.
class_labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix.
# Rows are the actual classes, columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8
Actual 3,0,0,2,0,0,0
Actual 4,0,0,9,4,1,0
Actual 5,0,0,111,34,0,0
Actual 6,0,0,48,81,5,0
Actual 7,0,0,3,22,13,2
Actual 8,0,0,0,3,2,0


In [26]:
# Calculating the accuracy score: the fraction of test rows whose predicted
# quality equals the actual quality.
acc_score = accuracy_score(y_test, predictions)

In [27]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
# Some quality classes are never predicted, so their precision is undefined.
# zero_division=0 reports those metrics as 0 (the same values as before) without
# emitting an undefined-metric warning for each affected average.
print(classification_report(y_test, predictions, zero_division=0))

Confusion Matrix


Unnamed: 0,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8
Actual 3,0,0,2,0,0,0
Actual 4,0,0,9,4,1,0
Actual 5,0,0,111,34,0,0
Actual 6,0,0,48,81,5,0
Actual 7,0,0,3,22,13,2
Actual 8,0,0,0,3,2,0


Accuracy Score : 0.6029411764705882
Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        14
           5       0.64      0.77      0.70       145
           6       0.56      0.60      0.58       134
           7       0.62      0.33      0.43        40
           8       0.00      0.00      0.00         5

    accuracy                           0.60       340
   macro avg       0.30      0.28      0.28       340
weighted avg       0.57      0.60      0.58       340



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Calculate feature importance in the Random Forest model.
# The array has one value per feature column, in the same order as X.columns.
importances = rf_model.feature_importances_
importances

array([0.07602231, 0.10367175, 0.07439238, 0.07079806, 0.08460108,
       0.06651416, 0.1004586 , 0.09246068, 0.07559149, 0.11536365,
       0.14012584])

In [29]:
# We can sort the features by their importance.
# Each importance is paired with its column name; tuples compare on the
# importance first, so reverse=True lists the most important feature first.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.14012583952089483, 'alcohol'),
 (0.11536365217787334, 'sulphates'),
 (0.10367174511837561, 'volatile acidity'),
 (0.10045859624755059, 'total sulfur dioxide'),
 (0.09246067610501367, 'density'),
 (0.08460108058878285, 'chlorides'),
 (0.07602231471678975, 'fixed acidity'),
 (0.07559149400674796, 'pH'),
 (0.07439238262720813, 'citric acid'),
 (0.07079806373751925, 'residual sugar'),
 (0.06651415515324405, 'free sulfur dioxide')]

In [32]:
# Train the EasyEnsembleClassifier--did not improve model
# fit() returns the fitted estimator, so construction and training are chained.
classifier = EasyEnsembleClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train_scaled, y_train)

# NOTE: this rebinds `predictions`, replacing the random-forest predictions.
# Re-running the earlier evaluation cells after this one would score the
# ensemble's output instead of the random forest's.
predictions = classifier.predict(X_test_scaled)

In [33]:
# Display the confusion matrix for the EasyEnsembleClassifier predictions.
# Rows are actual classes and columns are predicted classes, in sorted label order.
# NOTE: this rebinds `cm`, replacing the random-forest confusion matrix.
cm = confusion_matrix(y_test, predictions)
cm

array([[ 1,  1,  0,  0,  0,  0],
       [ 3,  5,  1,  3,  1,  1],
       [38, 26, 45, 30,  4,  2],
       [16, 14,  9, 61, 22, 12],
       [ 0,  2,  0,  7, 14, 17],
       [ 0,  0,  0,  0,  3,  2]], dtype=int64)

In [37]:
# Calculate the plain (unbalanced) accuracy score of the ensemble predictions.
# NOTE: this is accuracy_score, not balanced accuracy; the value shown equals
# the "accuracy" row of the classification report, not the macro-average recall.
accuracy_score(y_test, predictions)

0.3764705882352941

In [39]:
# Print the classification report for the ensemble predictions.
# NOTE: this is scikit-learn's standard classification_report (precision,
# recall, f1, support), not imbalanced-learn's imbalanced report.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           3       0.02      0.50      0.03         2
           4       0.10      0.36      0.16        14
           5       0.82      0.31      0.45       145
           6       0.60      0.46      0.52       134
           7       0.32      0.35      0.33        40
           8       0.06      0.40      0.10         5

    accuracy                           0.38       340
   macro avg       0.32      0.40      0.27       340
weighted avg       0.63      0.38      0.44       340

