In [4]:
# Initial imports.
from urllib.parse import quote

import pandas as pd
from imblearn.ensemble import EasyEnsembleClassifier
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import db_password

In [5]:
# Create Database connection.
# The password is percent-encoded (safe="" also encodes "/") so that reserved
# URL characters in it, such as "@", ":", "/", "+" or spaces, cannot corrupt
# the connection string or be mis-parsed as part of the host.
engine = create_engine(
    f'postgresql://Wine:{quote(db_password, safe="")}'
    '@my-postgres-db.cyw40qiv0pda.us-east-2.rds.amazonaws.com:5432/postgres'
)
# Load each table on its own, plus the stacked red + white rows (UNION ALL
# keeps duplicates; they are dropped in a later cell).
red_df = pd.read_sql("SELECT * FROM red", con=engine)
white_df = pd.read_sql("SELECT * FROM white", con=engine)
wine_df = pd.read_sql("SELECT * FROM (SELECT * FROM red UNION ALL SELECT * FROM white) total_table", con=engine)


In [7]:
# Remove fully identical rows, keeping the first occurrence of each.
wine_df = wine_df.drop_duplicates(keep="first")
# Preview the first five remaining rows.
wine_df.head(n=5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,0


In [8]:
# Define the features set: every column of wine_df except the "type" target.
# drop() returns a new DataFrame, so wine_df itself is left untouched.
X = wine_df.drop(columns=["type"])
X.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [9]:
# Define the target set: the "type" column (0 = red, 1 = white).
y = wine_df["type"]
# Preview the first five labels.
y.head(5)

0    0
1    0
2    0
3    0
4    0
Name: type, dtype: int64

In [10]:
# Splitting into Train and Test sets (scikit-learn's default 75% / 25% split,
# written out explicitly); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()

# Fit on the training data only, so no statistics from the test set leak into
# the scaling. fit() returns the scaler itself, so X_scaler is the same object.
X_scaler = scaler.fit(X_train)

# Scaling both partitions with the statistics learned from the training data.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [12]:
# Create a random forest classifier with 128 trees and a fixed seed so that
# repeated runs build the same forest.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)

In [15]:
# Fitting the model on the scaled training features and their labels.
# fit() returns the estimator itself; re-binding the name also keeps the cell
# from echoing the model's repr.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [16]:
# Making predictions using the (scaled) testing data.
predictions = rf_model.predict(X=X_test_scaled)
# Echo the predicted class labels.
predictions

array([1, 0, 0, ..., 1, 0, 1], dtype=int64)

In [17]:
# Calculating the confusion matrix.
# The class order is pinned (0 = red, 1 = white) so the matrix is always 2x2
# and its rows/columns line up with the names below, even if one class were
# absent from y_test or from the predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
cm_df = pd.DataFrame(
    cm, index=["Actual Red", "Actual White"], columns=["Predicted Red", "Predicted White"])

cm_df

Unnamed: 0,Predicted Red,Predicted White
Actual Red,349,7
Actual White,1,973


In [18]:
# Calculating the accuracy score: the fraction of test rows whose predicted
# label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [19]:
# Displaying results: headings, the confusion matrix table, then the metrics.
print("Wine Type Prediction", "Confusion Matrix", sep="\n")
display(cm_df)
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_test, predictions),
    sep="\n",
)

Wine Type Prediction
Confusion Matrix


Unnamed: 0,Predicted Red,Predicted White
Actual Red,349,7
Actual White,1,973


Accuracy Score : 0.9939849624060151
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       356
           1       0.99      1.00      1.00       974

    accuracy                           0.99      1330
   macro avg       0.99      0.99      0.99      1330
weighted avg       0.99      0.99      0.99      1330



In [21]:
# Calculate feature importance in the Random Forest model.
# Reads the fitted forest's feature_importances_ array; the next cell pairs
# these scores with X.columns, i.e. one score per feature column.
importances = rf_model.feature_importances_
importances

array([0.04816743, 0.10856719, 0.01707837, 0.05139775, 0.22286144,
       0.04679593, 0.3374749 , 0.07482624, 0.02107746, 0.0573633 ,
       0.01097802, 0.00341199])

In [22]:
# Sort the features by their importance, most important first.
print("Wine Type Prediction Feature Importance")
# Pair each score with its column name; the tuples sort by score, descending.
sorted(
    zip(rf_model.feature_importances_, X.columns),
    reverse=True,
)

Wine Type Prediction Feature Importance


[(0.33747489931297353, 'total_sulfur_dioxide'),
 (0.222861439754702, 'chlorides'),
 (0.10856718765640555, 'volatile_acidity'),
 (0.07482623762860222, 'density'),
 (0.05736329603263875, 'sulphates'),
 (0.051397745107831, 'residual_sugar'),
 (0.048167431761886086, 'fixed_acidity'),
 (0.04679592666492493, 'free_sulfur_dioxide'),
 (0.021077458510889137, 'pH'),
 (0.017078365275658636, 'citric_acid'),
 (0.010978018571926533, 'alcohol'),
 (0.0034119937215616875, 'quality')]

In [23]:
# Checking for data leakage (|corr| > .90).
# A feature that is almost perfectly correlated with the target ("type") would
# make the classification trivial, so flag any such feature explicitly instead
# of relying on reading the matrix by eye.
corr_matrix = wine_df.corr()
target_corr = corr_matrix["type"].drop("type").abs()
leaky_features = target_corr[target_corr > 0.90]
print(f"Features with |corr| > .90 against 'type': {list(leaky_features.index)}")
# Still show the full pairwise correlation matrix for inspection.
corr_matrix

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,type
fixed_acidity,1.0,0.214752,0.330328,-0.104439,0.288918,-0.28159,-0.327471,0.47818,-0.271182,0.304844,-0.102573,-0.080092,-0.486253
volatile_acidity,0.214752,1.0,-0.384192,-0.163696,0.367626,-0.349039,-0.400716,0.308452,0.246687,0.227772,-0.065411,-0.265205,-0.645335
citric_acid,0.330328,-0.384192,1.0,0.146189,0.055199,0.130849,0.194835,0.094758,-0.344735,0.059183,-0.005496,0.097954,0.183759
residual_sugar,-0.104439,-0.163696,0.146189,1.0,-0.123094,0.398717,0.487519,0.52093,-0.234522,-0.174717,-0.305242,-0.05683,0.328695
chlorides,0.288918,0.367626,0.055199,-0.123094,1.0,-0.186615,-0.269817,0.371867,0.025823,0.405051,-0.269601,-0.202137,-0.499517
free_sulfur_dioxide,-0.28159,-0.349039,0.130849,0.398717,-0.186615,1.0,0.720488,0.006166,-0.141747,-0.198244,-0.170012,0.054002,0.465326
total_sulfur_dioxide,-0.327471,-0.400716,0.194835,0.487519,-0.269817,0.720488,1.0,0.006711,-0.222956,-0.275836,-0.249004,-0.050296,0.694229
density,0.47818,0.308452,0.094758,0.52093,0.371867,0.006166,0.006711,1.0,0.034273,0.28269,-0.667811,-0.326434,-0.429377
pH,-0.271182,0.246687,-0.344735,-0.234522,0.025823,-0.141747,-0.222956,0.034273,1.0,0.16815,0.097314,0.039733,-0.310919
sulphates,0.304844,0.227772,0.059183,-0.174717,0.405051,-0.198244,-0.275836,0.28269,0.16815,1.0,-0.017232,0.041884,-0.490364
