In [168]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

In [169]:
# Load the dataset
df = pd.read_csv("WineQT.csv")

# Preprocessing: Drop unnecessary column and separate features and target
df.drop(columns="Id", inplace=True)  # Drop the Id column
X = df.drop(columns="quality")
y = df["quality"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [170]:
# Handling class imbalance with SMOTE (Synthetic Minority Over-sampling Technique)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [171]:
# Define the Random Forest model
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)



In [172]:
# Fit the model using the best parameters found through GridSearchCV
grid_search.fit(X_train_resampled, y_train_resampled)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [173]:
# Get the best model from grid search
best_rf = grid_search.best_estimator_

In [174]:
# Make predictions
y_pred = best_rf.predict(X_test)

# Print the classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.06      0.11      0.08         9
           5       0.67      0.71      0.69       143
           6       0.64      0.49      0.56       146
           7       0.49      0.61      0.54        41
           8       0.14      0.25      0.18         4

    accuracy                           0.58       343
   macro avg       0.33      0.36      0.34       343
weighted avg       0.61      0.58      0.59       343



In [175]:
# Print the best hyperparameters
print(f"Best hyperparameters: {random_search.best_params_}")


Best hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 30, 'bootstrap': True}


In [176]:
joblib.dump(random_search.best_estimator_, 'wineQT_model.pkl')

['wineQT_model.pkl']