# Wine Quality Classification
## ESE417 Final Project

## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
# Suppress every Python warning for the rest of the session. Note that this
# also hides any warnings emitted by the models trained below.
warnings.filterwarnings('ignore')

# Single seed reused for the train/test splits and for every estimator that
# accepts a random_state, so that a fresh run reproduces the same numbers.
RANDOM_STATE = 42

## 2. Load Data

In [2]:
# Location of the red-wine CSV file
DATA_PATH = 'winequality-red.csv'  # Update path if needed

# Column labels applied to the frame: eleven feature columns followed by the
# 'quality' target.
columns = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]

# The file is ';'-delimited. Its first row is skipped and the labels above
# are used as the header instead.
df = pd.read_csv(DATA_PATH, names=columns, skiprows=1, sep=';')

# Quick sanity check: overall size and class balance of the target
print(f"Dataset shape: {df.shape}")
print(f"\nQuality distribution:")
print(df['quality'].value_counts().sort_index())

Dataset shape: (1599, 12)

Quality distribution:
quality
3     10
4     53
5    681
6    638
7    199
8     18
Name: count, dtype: int64


---
# PART 1: BASELINE MODELS (No Data Cleaning or Tuning)
---

## 3. Baseline Preprocessing

In [3]:
# Target is the raw quality score (all original classes kept);
# every remaining column is used as a feature.
y = df['quality']
X = df.drop(columns='quality')

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Number of classes: {y.nunique()}")

Training samples: 1279
Test samples: 320
Number of classes: 6


## 4. Baseline Model Training

In [4]:
# Baseline estimators. Each keeps its own name so it can be inspected
# individually after training.
svm_base = SVC(kernel='rbf', random_state=RANDOM_STATE)
rf_base = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
mlp_base = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=RANDOM_STATE)
lr_base = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
knn_base = KNeighborsClassifier(n_neighbors=5)

# Registry keyed by the label used in the printed reports. Dict order is the
# order in which the models are trained and in which results are stored.
baseline_models = {
    'SVM': svm_base,
    'Random Forest': rf_base,
    'Neural Network': mlp_base,
    'Logistic Regression': lr_base,
    'KNN': knn_base,
}

# Store baseline results (test accuracy) and test-set predictions per model
baseline_results = {}
baseline_preds = {}

# One shared fit / predict / score / report routine instead of five
# copy-pasted variants.
for name, estimator in baseline_models.items():
    estimator.fit(X_train_scaled, y_train)
    baseline_preds[name] = estimator.predict(X_test_scaled)
    baseline_results[name] = accuracy_score(y_test, baseline_preds[name])
    print(f"{name} Baseline Accuracy: {baseline_results[name]:.4f}")

# Per-model prediction arrays under their original names
svm_base_pred = baseline_preds['SVM']
rf_base_pred = baseline_preds['Random Forest']
mlp_base_pred = baseline_preds['Neural Network']
lr_base_pred = baseline_preds['Logistic Regression']
knn_base_pred = baseline_preds['KNN']

SVM Baseline Accuracy: 0.6250
Random Forest Baseline Accuracy: 0.6813
Neural Network Baseline Accuracy: 0.6375
Logistic Regression Baseline Accuracy: 0.5906
KNN Baseline Accuracy: 0.6094


In [5]:
# Print the baseline accuracies as a small table, best model first
print("\n" + "="*40)
print("BASELINE RESULTS SUMMARY")
print("="*40)
for model in sorted(baseline_results, key=baseline_results.get, reverse=True):
    acc = baseline_results[model]
    print(f"{model:<20} {acc:.4f}")


BASELINE RESULTS SUMMARY
Random Forest        0.6813
Neural Network       0.6375
SVM                  0.6250
KNN                  0.6094
Logistic Regression  0.5906


---
# PART 2: IMPROVED MODELS (With Data Cleaning + Tuning)
---

## 5. Data Cleaning - Outlier Removal Using IQR

In [6]:
def remove_outliers(df, columns, factor=1.5):
    """Drop rows that lie outside the IQR fences of the given columns.

    Columns are processed one after another. For each column the quartiles
    are computed on the rows that survived the previous columns, and only
    rows inside ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive)
    are kept. Because the filtering is sequential, the result can depend on
    the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, not modified.
    columns : iterable of column labels
        Columns to filter on, in the order they should be applied.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence width.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.
    """
    df_clean = df.copy()
    for col in columns:
        # Quartiles are taken from the current (already filtered) rows
        q1, q3 = df_clean[col].quantile([0.25, 0.75])
        spread = factor * (q3 - q1)
        # between() is inclusive on both ends, i.e. lower <= value <= upper
        df_clean = df_clean[df_clean[col].between(q1 - spread, q3 + spread)]
    return df_clean

# Filter on every column except the target
feature_cols = df.columns.drop('quality')
df_clean = remove_outliers(df, columns=feature_cols)

# Report how many rows were dropped and how the class balance changed
print(f"Original samples: {len(df)}")
print(f"After outlier removal: {len(df_clean)}")
print(f"Samples removed: {len(df) - len(df_clean)} ({100*(len(df)-len(df_clean))/len(df):.1f}%)")
print(f"\nCleaned quality distribution:")
print(df_clean['quality'].value_counts().sort_index())

Original samples: 1599
After outlier removal: 1135
Samples removed: 464 (29.0%)

Cleaned quality distribution:
quality
3      2
4     32
5    490
6    472
7    130
8      9
Name: count, dtype: int64


## 6. Preprocessing on Cleaned Data

In [7]:
# Separate target and features on the outlier-filtered frame
y_clean = df_clean['quality']
X_clean = df_clean.drop(columns='quality')

# Stratified 80/20 hold-out split of the cleaned data
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clean, y_clean, stratify=y_clean, test_size=0.2, random_state=RANDOM_STATE
)

# Standardise with a separate scaler fitted on the cleaned training split only
scaler_c = StandardScaler().fit(X_train_c)
X_train_c_scaled = scaler_c.transform(X_train_c)
X_test_c_scaled = scaler_c.transform(X_test_c)

print(f"Training samples: {len(X_train_c)}")
print(f"Test samples: {len(X_test_c)}")

Training samples: 908
Test samples: 227


## 7. Tuned Model Training (On Cleaned Data)

In [8]:
# Accuracy of each tuned model on the cleaned test split, keyed by model name
tuned_results = {}

# SVM - RBF kernel with C raised to 10
svm_tuned = SVC(C=10, kernel='rbf', random_state=RANDOM_STATE)
svm_tuned_pred = svm_tuned.fit(X_train_c_scaled, y_train_c).predict(X_test_c_scaled)
tuned_results['SVM'] = accuracy_score(y_true=y_test_c, y_pred=svm_tuned_pred)
print(f"SVM Tuned Accuracy: {tuned_results['SVM']:.4f}")
# zero_division=0 reports 0 for classes that were never predicted
print(classification_report(y_test_c, svm_tuned_pred, zero_division=0))

SVM Tuned Accuracy: 0.6300
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.50      0.17      0.25         6
           5       0.67      0.71      0.69        98
           6       0.60      0.62      0.61        94
           7       0.64      0.54      0.58        26
           8       0.00      0.00      0.00         2

    accuracy                           0.63       227
   macro avg       0.40      0.34      0.36       227
weighted avg       0.62      0.63      0.62       227



In [9]:
# Random Forest - 200 trees (baseline used 100), depth capped at 20
rf_tuned = RandomForestClassifier(max_depth=20, n_estimators=200, random_state=RANDOM_STATE)
rf_tuned_pred = rf_tuned.fit(X_train_c_scaled, y_train_c).predict(X_test_c_scaled)
tuned_results['Random Forest'] = accuracy_score(y_true=y_test_c, y_pred=rf_tuned_pred)
print(f"Random Forest Tuned Accuracy: {tuned_results['Random Forest']:.4f}")
# zero_division=0 reports 0 for classes that were never predicted
print(classification_report(y_test_c, rf_tuned_pred, zero_division=0))

Random Forest Tuned Accuracy: 0.6872
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.70      0.80      0.75        98
           6       0.66      0.69      0.68        94
           7       0.71      0.46      0.56        26
           8       1.00      0.50      0.67         2

    accuracy                           0.69       227
   macro avg       0.51      0.41      0.44       227
weighted avg       0.67      0.69      0.67       227



In [10]:
# Neural Network - second hidden layer of 50 units added, alpha set to 0.01
mlp_tuned = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    alpha=0.01,
    max_iter=1000,
    random_state=RANDOM_STATE,
)
mlp_tuned_pred = mlp_tuned.fit(X_train_c_scaled, y_train_c).predict(X_test_c_scaled)
tuned_results['Neural Network'] = accuracy_score(y_true=y_test_c, y_pred=mlp_tuned_pred)
print(f"Neural Network Tuned Accuracy: {tuned_results['Neural Network']:.4f}")
# zero_division=0 reports 0 for classes that were never predicted
print(classification_report(y_test_c, mlp_tuned_pred, zero_division=0))

Neural Network Tuned Accuracy: 0.6211
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.17      0.17      0.17         6
           5       0.63      0.67      0.65        98
           6       0.62      0.64      0.63        94
           7       0.68      0.50      0.58        26
           8       1.00      0.50      0.67         2

    accuracy                           0.62       227
   macro avg       0.52      0.41      0.45       227
weighted avg       0.62      0.62      0.62       227



In [11]:
# Logistic Regression - C is set explicitly to 1.0. NOTE: 1.0 is also
# scikit-learn's default, so the regularization strength is the same as in
# the baseline model; only the training data differs.
lr_tuned = LogisticRegression(C=1.0, max_iter=1000, random_state=RANDOM_STATE)
lr_tuned_pred = lr_tuned.fit(X_train_c_scaled, y_train_c).predict(X_test_c_scaled)
tuned_results['Logistic Regression'] = accuracy_score(y_true=y_test_c, y_pred=lr_tuned_pred)
print(f"Logistic Regression Tuned Accuracy: {tuned_results['Logistic Regression']:.4f}")
# zero_division=0 reports 0 for classes that were never predicted
print(classification_report(y_test_c, lr_tuned_pred, zero_division=0))

Logistic Regression Tuned Accuracy: 0.5771
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.63      0.72      0.68        98
           6       0.53      0.59      0.56        94
           7       0.45      0.19      0.27        26
           8       0.00      0.00      0.00         2

    accuracy                           0.58       227
   macro avg       0.27      0.25      0.25       227
weighted avg       0.54      0.58      0.55       227



In [12]:
# KNN - k raised from 5 to 7, neighbours weighted by distance
knn_tuned = KNeighborsClassifier(weights='distance', n_neighbors=7)
knn_tuned_pred = knn_tuned.fit(X_train_c_scaled, y_train_c).predict(X_test_c_scaled)
tuned_results['KNN'] = accuracy_score(y_true=y_test_c, y_pred=knn_tuned_pred)
print(f"KNN Tuned Accuracy: {tuned_results['KNN']:.4f}")
# zero_division=0 reports 0 for classes that were never predicted
print(classification_report(y_test_c, knn_tuned_pred, zero_division=0))

KNN Tuned Accuracy: 0.6872
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.73      0.77      0.75        98
           6       0.66      0.73      0.70        94
           7       0.58      0.42      0.49        26
           8       1.00      0.50      0.67         2

    accuracy                           0.69       227
   macro avg       0.50      0.40      0.43       227
weighted avg       0.66      0.69      0.67       227



## 8. Final Comparison

In [13]:
# Side-by-side table of baseline vs tuned accuracy per model.
# Caveat: the two columns are measured on different test splits (baseline on
# the split of the full data, tuned on the split of the outlier-filtered
# data), so the difference mixes the effect of cleaning, tuning and test set.
print("="*60)
print("FINAL RESULTS: BASELINE vs TUNED (with Data Cleaning)")
print("="*60)
print(f"{'Model':<20} {'Baseline':<12} {'Tuned':<12} {'Improvement':<12}")
print("-"*60)
for model, base in baseline_results.items():
    tuned = tuned_results[model]
    diff = tuned - base
    print(f"{model:<20} {base:<12.4f} {tuned:<12.4f} {diff:+.4f}")

print("\nNote: Tuned models use IQR outlier removal + hyperparameter tuning")

FINAL RESULTS: BASELINE vs TUNED (with Data Cleaning)
Model                Baseline     Tuned        Improvement 
------------------------------------------------------------
SVM                  0.6250       0.6300       +0.0050
Random Forest        0.6813       0.6872       +0.0060
Neural Network       0.6375       0.6211       -0.0164
Logistic Regression  0.5906       0.5771       -0.0135
KNN                  0.6094       0.6872       +0.0778

Note: Tuned models use IQR outlier removal + hyperparameter tuning
