In [2]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import EasyEnsembleClassifier

In [3]:

wine_df = pd.read_csv(Path("resources/wine2.csv"),index_col=0)
wine_df.head()

FileNotFoundError: [Errno 2] No such file or directory: Path('resources/wine2.csv')

In [None]:
wine_df= wine_df.drop_duplicates()
wine_df.head()

In [None]:
wine_df["quality"].value_counts()

In [None]:
reduced_df = wine_df[['alcohol', 'volatile acidity', 'density', 'total sulfur dioxide', 'quality']]
print("reduced_df")
reduced_df.head()

In [43]:
reduced_df["quality"].value_counts()

6    2323
5    1752
7     856
4     206
8     148
3      30
9       5
Name: quality, dtype: int64

In [44]:
# Define the features set.
X = reduced_df.copy()
X = X.drop("quality", axis=1)
X.head()

Unnamed: 0,alcohol,volatile acidity,density,total sulfur dioxide
0,9.4,0.7,0.9978,34.0
1,9.8,0.88,0.9968,67.0
2,9.8,0.76,0.997,54.0
3,9.8,0.28,0.998,60.0
5,9.4,0.66,0.9978,40.0


In [45]:
# Define the target set.
y = reduced_df['quality'].ravel()
y[:5]

array([5, 5, 5, 6, 5], dtype=int64)

In [46]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [47]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [48]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [49]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [50]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([6, 5, 5, ..., 7, 5, 5], dtype=int64)

In [52]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm)

cm_df

Unnamed: 0,0,1,2,3,4,5
0,0,0,7,1,0,0
1,0,5,25,25,0,0
2,0,0,275,176,4,1
3,0,3,134,350,68,3
4,0,2,11,124,75,2
5,0,1,2,21,15,0


In [53]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [54]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,0,1,2,3,4,5
0,0,0,7,1,0,0
1,0,5,25,25,0,0
2,0,0,275,176,4,1
3,0,3,134,350,68,3
4,0,2,11,124,75,2
5,0,1,2,21,15,0


Accuracy Score : 0.5300751879699248
Classification Report
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         8
           4       0.45      0.09      0.15        55
           5       0.61      0.60      0.60       456
           6       0.50      0.63      0.56       558
           7       0.46      0.35      0.40       214
           8       0.00      0.00      0.00        39

    accuracy                           0.53      1330
   macro avg       0.34      0.28      0.29      1330
weighted avg       0.51      0.53      0.51      1330



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.2282569 , 0.22545158, 0.27971596, 0.26657555])

In [56]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2797159644980613, 'density'),
 (0.26657555332715877, 'total sulfur dioxide'),
 (0.22825689756961548, 'alcohol'),
 (0.22545158460516437, 'volatile acidity')]

In [57]:
# Train the EasyEnsembleClassifier--did not improve model
classifier = EasyEnsembleClassifier(n_estimators=100,
   random_state=1)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [58]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[  3,   2,   0,   2,   0,   0,   1],
       [ 19,  17,   1,   7,   4,   0,   7],
       [ 82, 245,  15,  67,  10,   2,  35],
       [ 66, 146,   9, 142,  72,  25,  98],
       [ 20,  13,   0,  44,  30,  20,  87],
       [  3,   1,   0,   5,   3,   6,  21],
       [  0,   0,   0,   0,   0,   0,   0]], dtype=int64)

In [59]:
# Calculated the balanced accuracy score
accuracy_score(y_test, predictions)

0.16015037593984963

In [60]:
# Print the imbalanced classification report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           3       0.02      0.38      0.03         8
           4       0.04      0.31      0.07        55
           5       0.60      0.03      0.06       456
           6       0.53      0.25      0.34       558
           7       0.25      0.14      0.18       214
           8       0.11      0.15      0.13        39
           9       0.00      0.00      0.00         0

    accuracy                           0.16      1330
   macro avg       0.22      0.18      0.12      1330
weighted avg       0.47      0.16      0.20      1330



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
