In [1]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
# Load the wine dataset; the first CSV column becomes the row index.
wine_csv = Path("resources/wine2.csv")
wine_df = pd.read_csv(wine_csv, index_col=0)
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0


In [3]:
# Remove rows that exactly repeat an earlier row (the first occurrence is kept).
wine_df = wine_df.drop_duplicates(keep="first")
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0


In [4]:
# Feature matrix: every column except the "type" label.
# drop() returns a new frame, so wine_df itself is left untouched.
X = wine_df.drop(columns=["type"])
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [5]:
# Target vector: the "type" column as a 1-D NumPy array.
# Series.ravel() is deprecated in recent pandas releases (it emits a
# FutureWarning); Series.to_numpy() yields the same array without the warning.
y = wine_df["type"].to_numpy()
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [6]:
# Partition features and target into train and test subsets.
# The fixed random_state keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [7]:
# Standardise the features.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows enter the fit.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [8]:
# Random forest with 128 trees; the seed makes training repeatable.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [9]:
# Train the forest on the scaled training split.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [10]:
# Predict a class label for every row of the scaled test split.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [11]:
# Confusion matrix for the test split (rows = actual, columns = predicted).
# The labels are pinned to [0, 1] so the row/column order always matches the
# names used below, and the matrix stays 2x2 even if one class happens to be
# missing from y_test/predictions (otherwise the DataFrame constructor would
# raise on a shape mismatch).
# NOTE(review): assumes type 0 = red and type 1 = white -- confirm against the
# data source.
class_labels = [0, 1]
cm = confusion_matrix(y_test, predictions, labels=class_labels)

# Wrap the raw counts in a labelled DataFrame for display.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Red", "Actual White"],
    columns=["Predicted Red", "Predicted White"],
)

cm_df

Unnamed: 0,Predicted Red,Predicted White
Actual Red,349,7
Actual White,1,973


In [12]:
# Fraction of test rows whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [13]:
# Summarise the evaluation: confusion matrix, accuracy, per-class metrics.
class_report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(class_report)

Confusion Matrix


Unnamed: 0,Predicted Red,Predicted White
Actual Red,349,7
Actual White,1,973


Accuracy Score : 0.9939849624060151
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       356
           1       0.99      1.00      1.00       974

    accuracy                           0.99      1330
   macro avg       0.99      0.99      0.99      1330
weighted avg       0.99      0.99      0.99      1330



In [14]:
# Read the per-feature importance scores from the fitted random forest.
# The array has one entry per column of the training matrix, in column order.
importances = rf_model.feature_importances_
importances

array([0.04816743, 0.10856719, 0.01707837, 0.05139775, 0.22286144,
       0.04679593, 0.3374749 , 0.07482624, 0.02107746, 0.0573633 ,
       0.01097802, 0.00341199])

In [15]:
# Pair each importance score with its feature name and rank them,
# most important first.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.33747489931297353, 'total sulfur dioxide'),
 (0.222861439754702, 'chlorides'),
 (0.10856718765640555, 'volatile acidity'),
 (0.07482623762860222, 'density'),
 (0.05736329603263875, 'sulphates'),
 (0.051397745107831, 'residual sugar'),
 (0.048167431761886086, 'fixed acidity'),
 (0.04679592666492493, 'free sulfur dioxide'),
 (0.021077458510889137, 'pH'),
 (0.017078365275658636, 'citric acid'),
 (0.010978018571926533, 'alcohol'),
 (0.0034119937215616875, 'quality')]