In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Random Forest Method

### Baseline Random Forest Model
This model is trained on the cleaned/normalized data using sklearn's default parameters. It gives us an idea of how the model may perform and which hyperparameters to focus on during fine-tuning.

In [15]:
from sklearn.model_selection import train_test_split

# Read the normalized red-wine table from disk
df_norm = pd.read_csv("../../data/processed/winequality-red-normalized.csv")

# Separate the 'quality' label from the predictor columns
y_norm = df_norm['quality']
X_norm = df_norm.drop(columns=['quality'])

# Hold out 20% of the rows for testing; the split is seeded and stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y_norm,
    test_size=0.2,
    random_state=42,
    stratify=y_norm,
)

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: only the seed is pinned, every other setting is the library default
rf_baseline = RandomForestClassifier(random_state=42)

# Train on the training portion of the split
rf_baseline.fit(X=X_train, y=y_train)

In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Score the baseline forest on the held-out test split.
y_pred = rf_baseline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Macro averaging weights every quality class equally. zero_division=0 is passed
# for consistency with the classification_report calls in this notebook: classes
# that receive no predictions score 0 either way, but this avoids the
# undefined-metric warning some sklearn versions emit for them.
print("F1 Score:", f1_score(y_test, y_pred, average='macro', zero_division=0))

Accuracy: 0.684375
F1 Score: 0.40597237221330523


In [18]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 as a nested dict; undefined ratios are reported as 0
report_dict = classification_report(
    y_test,
    y_pred,
    zero_division=0,
    output_dict=True,
)

# Flip the frame so each class (and each aggregate) becomes a row
df_report = pd.DataFrame(report_dict).T
df_report

Unnamed: 0,precision,recall,f1-score,support
3,0.0,0.0,0.0,2.0
4,0.0,0.0,0.0,11.0
5,0.725352,0.757353,0.741007,136.0
6,0.643836,0.734375,0.686131,128.0
7,0.724138,0.525,0.608696,40.0
8,0.5,0.333333,0.4,3.0
accuracy,0.684375,0.684375,0.684375,0.684375
macro avg,0.432221,0.391677,0.405972,320.0
weighted avg,0.661014,0.684375,0.669218,320.0


The random forest baseline model performs well on the most common wine qualities (5 and 6), achieving F1 scores above 0.68, meaning that it does a reasonably good job of identifying these classes while balancing false positives and false negatives. It struggles with the minority classes (3, 4, and 8) because of the class imbalance: classes 3 and 4 are never predicted correctly (F1 of 0.0), and class 8 reaches an F1 of only 0.40. The overall accuracy of 68.4% is fairly strong as a baseline for this dataset, but fine-tuning is needed to improve performance. In particular, the macro F1 score of 0.41, which weights every class equally, shows that performance is uneven across classes and is dragged down by the minority classes.

### Improving the baseline model
Next, we will perform Grid Search Cross Validation on each of the three datasets (our original normalized data, PCA, and interaction terms).

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [9]:
# The three processed variants of the red-wine data, keyed by a short label
datasets = dict(
    normalized=pd.read_csv("../../data/processed/winequality-red-normalized.csv"),
    pca=pd.read_csv("../../data/processed/winequality-red-pca.csv"),
    interactions=pd.read_csv("../../data/processed/winequality-red-interactions.csv"),
)

In [22]:
# Hyperparameter values to search over for the random forest
# (4 x 4 x 2 = 32 combinations).
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 15, 25, 50],
    class_weight=[None, "balanced"],
)

In [26]:
# One summary record per dataset; turned into a DataFrame after the loop.
results_table = []

for name, df in datasets.items():
    print(f"\n{name} dataset")

    # split into X and y
    X = df.drop(columns=["quality"])
    y = df["quality"]

    # Seeded, stratified 80/20 split (same settings as the baseline split).
    # NOTE: this rebinds the notebook-level X_train / X_test / y_train / y_test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # grid search: 5-fold cross-validation over param_grid, ranked by macro F1
    grid = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        scoring="f1_macro",
        cv=5,
        n_jobs=-1,
        return_train_score=False
    )

    grid.fit(X_train, y_train)

    best_params = grid.best_params_
    best_cv_score = grid.best_score_
    # Spread of the winning configuration's score across the CV folds.
    cv_std = grid.cv_results_['std_test_score'][grid.best_index_]

    print("Best parameters:", best_params)
    print(f"CV Macro F1: {best_cv_score:.4f} (+/- {cv_std:.4f})")

    # GridSearchCV (refit=True by default) has already refitted the winning
    # configuration on the full training split with the same seed, so reuse
    # that estimator instead of training an identical forest a second time.
    best_model = grid.best_estimator_

    # predict
    y_pred = best_model.predict(X_test)

    # metrics (zero_division=0 matches the classification_report call below and
    # avoids undefined-metric warnings for classes that are never predicted)
    test_acc = accuracy_score(y_test, y_pred)
    test_macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)

    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test Macro F1: {test_macro_f1:.4f}\n")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # store results in table
    results_table.append({
        "Dataset": name,
        "CV Macro F1 (mean)": round(best_cv_score, 4),
        "CV Macro F1 (std)": round(cv_std, 4),
        "Test Accuracy": round(test_acc, 4),
        "Test Macro F1": round(test_macro_f1, 4)
    })

# Build the frame once from the list of records.
results_df = pd.DataFrame(results_table)


normalized dataset
Best parameters: {'class_weight': None, 'max_depth': 15, 'n_estimators': 50}
CV Macro F1: 0.3359 (+/- 0.0208)
Test Accuracy: 0.6594
Test Macro F1: 0.3923

Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        11
           5       0.71      0.74      0.73       136
           6       0.62      0.69      0.65       128
           7       0.64      0.53      0.58        40
           8       0.50      0.33      0.40         3

    accuracy                           0.66       320
   macro avg       0.41      0.38      0.39       320
weighted avg       0.63      0.66      0.65       320


pca dataset
Best parameters: {'class_weight': None, 'max_depth': 15, 'n_estimators': 50}
CV Macro F1: 0.3625 (+/- 0.0693)
Test Accuracy: 0.6562
Test Macro F1: 0.3880

Classification Report:
              precision    recall  f1-score   support

           3 

In [27]:
# Show the per-dataset summary (CV macro F1 mean/std, test accuracy, test macro F1)
results_df

Unnamed: 0,Dataset,CV Macro F1 (mean),CV Macro F1 (std),Test Accuracy,Test Macro F1
0,normalized,0.3359,0.0208,0.6594,0.3923
1,pca,0.3625,0.0693,0.6562,0.388
2,interactions,0.3477,0.0491,0.6844,0.4018
