In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import EasyEnsembleClassifier

In [20]:
# Load the wine dataset; the first CSV column becomes the DataFrame index.
wine_csv = Path("resources/wine2.csv")
wine_df = pd.read_csv(wine_csv, index_col=0)
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0


In [21]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
wine_df = wine_df.drop_duplicates(keep="first")
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0


In [22]:
# Binarize the target: 1 = quality below 7, 0 = quality 7 or above.
#
# The mask is computed once from the raw scores and written back in a single
# assignment. Two sequential in-place writes would only be correct because of
# their order and because the new label (1) happens to be below the threshold.
#
# NOTE: this overwrites the raw scores. Running the cell a second time on the
# already-binarized column maps every row to 1 (0 and 1 are both below 7), so
# reload the data before re-running.
quality_threshold = 7
is_below_threshold = wine_df["quality"] < quality_threshold
wine_df["quality"] = is_below_threshold.astype("int64")


In [23]:
# Count how many rows fall into each value of the "quality" column.
quality_labels = wine_df["quality"]
quality_labels.value_counts()

1    4311
0    1009
Name: quality, dtype: int64

In [25]:
# Feature matrix: every column except the "quality" target.
# DataFrame.drop returns a new frame, so wine_df itself is left unchanged.
X = wine_df.drop(columns="quality")
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0


In [26]:
# Target vector: the "quality" column as a 1-D NumPy array.
# Series.to_numpy() is used instead of Series.ravel(), which is deprecated
# in recent pandas releases; both yield the same 1-D ndarray here.
y = wine_df["quality"].to_numpy()
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [27]:
# Split features and target into train and test partitions.
# random_state is fixed so the same rows land in each partition on every run.
splits = train_test_split(X, y, random_state=78)
X_train, X_test, y_train, y_test = splits

In [28]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied, unchanged, to both the training and the test split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [29]:
# Random forest with 128 trees; random_state is fixed for repeatable fits.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [30]:
# Train the forest on the scaled training features and their labels.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [31]:
# Predict a class label for every row of the scaled test set.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [33]:
# Confusion matrix of true test labels (rows) against predictions (columns),
# wrapped in a labelled DataFrame for display.
cm = confusion_matrix(y_test, predictions)

row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,100,153
Actual 1,52,1025


In [34]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [35]:
# Show the confusion matrix, the accuracy score and the per-class report.
print("Confusion Matrix")
display(cm_df)
# One print call with a newline separator emits the same two lines as before.
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,100,153
Actual 1,52,1025


Accuracy Score : 0.8458646616541353
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.40      0.49       253
           1       0.87      0.95      0.91      1077

    accuracy                           0.85      1330
   macro avg       0.76      0.67      0.70      1330
weighted avg       0.83      0.85      0.83      1330



In [36]:
# Read the per-feature importance scores from the fitted random forest.
# The array has one entry per column of X, in X's column order (the next
# cell pairs the two with zip).
importances = rf_model.feature_importances_
importances

array([0.06530836, 0.08020435, 0.07134246, 0.081628  , 0.08612455,
       0.08185435, 0.08507329, 0.11799766, 0.08021482, 0.08568112,
       0.16157294, 0.0029981 ])

In [37]:
# Pair each importance score with its feature name and rank the pairs from
# most to least important.
importance_pairs = list(zip(rf_model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.16157294163051544, 'alcohol'),
 (0.11799766038378016, 'density'),
 (0.0861245515682511, 'chlorides'),
 (0.08568112100031045, 'sulphates'),
 (0.08507329063842427, 'total sulfur dioxide'),
 (0.08185434904130762, 'free sulfur dioxide'),
 (0.08162800042019658, 'residual sugar'),
 (0.08021482207356466, 'pH'),
 (0.08020434591976978, 'volatile acidity'),
 (0.07134246196090638, 'citric acid'),
 (0.06530835601608387, 'fixed acidity'),
 (0.0029980993468896717, 'type')]