In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
white_df = pd.read_csv(Path("resources/winequality-white.csv"))
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
white_df=white_df.drop_duplicates()
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6


In [5]:
white_df["quality"].value_counts()

6    1788
5    1175
7     689
4     153
8     131
3      20
9       5
Name: quality, dtype: int64

In [6]:
# Define the features set.
X = white_df.copy()
X = X.drop("quality", axis=1)
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
6,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6


In [7]:
# Define the target set.
y = white_df['quality'].ravel()
y[:5]

array([6, 6, 6, 6, 6], dtype=int64)

In [8]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [9]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [11]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [12]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([7, 6, 5, 6, 8, 5, 6, 5, 6, 6, 6, 7, 5, 6, 6, 6, 6, 5, 6, 6, 7, 6,
       6, 6, 6, 5, 7, 5, 6, 5, 6, 6, 6, 6, 5, 6, 7, 6, 5, 6, 5, 5, 6, 6,
       6, 6, 6, 5, 6, 7, 5, 5, 4, 6, 6, 6, 5, 7, 6, 4, 6, 6, 6, 6, 6, 5,
       6, 6, 6, 5, 7, 5, 6, 6, 6, 6, 5, 6, 7, 6, 6, 6, 6, 5, 7, 6, 6, 7,
       7, 5, 6, 5, 6, 6, 6, 6, 6, 5, 7, 5, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6,
       5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 5, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 5, 7, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 7, 6, 5, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6,
       6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 7, 6, 5, 6, 7, 5, 5, 6, 6, 6, 5, 6,
       5, 7, 6, 5, 5, 6, 7, 6, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
       7, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 7, 6, 5, 6, 5, 6, 6, 6, 7,
       5, 6, 5, 6, 5, 5, 5, 6, 5, 6, 5, 6, 6, 6, 5, 5, 7, 5, 6, 5, 6, 7,
       6, 5, 7, 5, 6, 6, 5, 6, 6, 7, 6, 7, 6, 7, 6, 6, 6, 5, 6, 6, 5, 5,
       6, 6, 6, 6, 7, 5, 5, 5, 6, 6, 5, 6, 6, 5, 5,

In [13]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 3", "Actual 4", "Actual 5", "Actual 6", "Actual 7", "Actual 8", "Actual 9" ], columns=["Predicted 3", "Predicted 4", "Predicted 5", "Predicted 6", "Predicted 7", "Predicted 8", "Predicted 9"])

cm_df

Unnamed: 0,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8,Predicted 9
Actual 3,0,0,5,2,0,0,0
Actual 4,0,1,29,13,1,0,0
Actual 5,0,1,147,123,0,0,0
Actual 6,0,1,102,323,29,0,0
Actual 7,0,0,5,126,49,1,0
Actual 8,0,0,0,14,16,1,0
Actual 9,0,0,0,1,1,0,0


In [14]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [15]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 3,Predicted 4,Predicted 5,Predicted 6,Predicted 7,Predicted 8,Predicted 9
Actual 3,0,0,5,2,0,0,0
Actual 4,0,1,29,13,1,0,0
Actual 5,0,1,147,123,0,0,0
Actual 6,0,1,102,323,29,0,0
Actual 7,0,0,5,126,49,1,0
Actual 8,0,0,0,14,16,1,0
Actual 9,0,0,0,1,1,0,0


Accuracy Score : 0.5257315842583249
Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         7
           4       0.33      0.02      0.04        44
           5       0.51      0.54      0.53       271
           6       0.54      0.71      0.61       455
           7       0.51      0.27      0.35       181
           8       0.50      0.03      0.06        31
           9       0.00      0.00      0.00         2

    accuracy                           0.53       991
   macro avg       0.34      0.23      0.23       991
weighted avg       0.51      0.53      0.49       991



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.07461921, 0.0994214 , 0.08055148, 0.08583531, 0.08730137,
       0.09322863, 0.09022336, 0.10787838, 0.08685735, 0.07975163,
       0.11433187])

In [17]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.11433186946011366, 'alcohol'),
 (0.10787837721358089, 'density'),
 (0.09942139704545995, 'volatile acidity'),
 (0.09322863045761325, 'free sulfur dioxide'),
 (0.0902233588196305, 'total sulfur dioxide'),
 (0.08730137429168225, 'chlorides'),
 (0.08685735437557486, 'pH'),
 (0.08583531360038102, 'residual sugar'),
 (0.08055148303689608, 'citric acid'),
 (0.07975163124937355, 'sulphates'),
 (0.07461921044969402, 'fixed acidity')]

In [18]:
# Train the EasyEnsembleClassifier--did not improve model
classifier = EasyEnsembleClassifier(n_estimators=100,
   random_state=1)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [19]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[  1,   2,   3,   1,   0,   0,   0],
       [  8,  21,   9,   2,   1,   1,   2],
       [ 21,  43, 154,  29,   5,  14,   5],
       [ 31,  31, 143,  96,  61,  43,  50],
       [ 11,   4,  13,  34,  29,  36,  54],
       [  0,   0,   3,   5,   5,   8,  10],
       [  1,   0,   0,   0,   0,   0,   1]], dtype=int64)

In [20]:
# Calculated the balanced accuracy score
accuracy_score(y_test, predictions)

0.31281533804238143

In [21]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           3       0.01      0.14      0.03         7
           4       0.21      0.48      0.29        44
           5       0.47      0.57      0.52       271
           6       0.57      0.21      0.31       455
           7       0.29      0.16      0.21       181
           8       0.08      0.26      0.12        31
           9       0.01      0.50      0.02         2

    accuracy                           0.31       991
   macro avg       0.23      0.33      0.21       991
weighted avg       0.46      0.31      0.34       991

