# Wine Quality Classification
## ESE417 Final Project

## 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 42

## 2. Load Data

In [2]:
DATA_PATH = 'winequality-red.csv'  # Update path if needed

columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 
           'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 
           'pH', 'sulphates', 'alcohol', 'quality']

df = pd.read_csv(DATA_PATH, sep=';', skiprows=1, names=columns)

print(f"Original dataset shape: {df.shape}")
print(f"\nOriginal quality distribution:")
print(df['quality'].value_counts().sort_index())

Original dataset shape: (1599, 12)

Original quality distribution:
quality
3     10
4     53
5    681
6    638
7    199
8     18
Name: count, dtype: int64


## 3. Data Cleaning - Outlier Removal

Remove outliers using the IQR method to reduce noise in the data.

In [3]:
# Remove outliers using IQR method
def remove_outliers(df, columns):
    df_clean = df.copy()
    for col in columns:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
    return df_clean

# Apply to feature columns only
feature_cols = df.columns.drop('quality')
df_clean = remove_outliers(df, feature_cols)

print(f"Samples removed: {len(df) - len(df_clean)}")
print(f"Clean dataset shape: {df_clean.shape}")

Samples removed: 464
Clean dataset shape: (1135, 12)


## 4. Class Grouping

Based on EDA, the original 6 quality classes are imbalanced. We group them into 3 classes:
- Low (3-4)
- Medium (5-6)  
- High (7-8)

In [4]:
# Group quality into 3 classes
def group_quality(q):
    if q <= 4:
        return 0  # Low
    elif q <= 6:
        return 1  # Medium
    else:
        return 2  # High

df_clean['quality_group'] = df_clean['quality'].apply(group_quality)

print("New class distribution:")
print(df_clean['quality_group'].value_counts().sort_index())
print("\n0 = Low (3-4), 1 = Medium (5-6), 2 = High (7-8)")

New class distribution:
quality_group
0     34
1    962
2    139
Name: count, dtype: int64

0 = Low (3-4), 1 = Medium (5-6), 2 = High (7-8)


## 5. Preprocessing

In [5]:
# Split features and target
X = df_clean.drop(['quality', 'quality_group'], axis=1)
y = df_clean['quality_group']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Training samples: 908
Test samples: 227


## 6. Model Training and Evaluation

### 6.1 Support Vector Machine (SVM)

In [11]:
# Baseline SVM
svm = SVC(kernel='rbf', random_state=RANDOM_STATE)
svm.fit(X_train_scaled, y_train)
svm_pred = svm.predict(X_test_scaled)

print("SVM Baseline:")
print(f"Accuracy: {accuracy_score(y_test, svm_pred):.4f}")

SVM Baseline:
Accuracy: 0.8767


In [18]:
# Tuned SVM - adjusted C
svm_tuned = SVC(kernel='rbf', C=10, random_state=RANDOM_STATE)
svm_tuned.fit(X_train_scaled, y_train)
svm_tuned_pred = svm_tuned.predict(X_test_scaled)

print("SVM Tuned (C=10):")
print(f"Accuracy: {accuracy_score(y_test, svm_tuned_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, svm_tuned_pred, target_names=['Low', 'Medium', 'High']))

SVM Tuned (C=10):
Accuracy: 0.8811

Classification Report:
              precision    recall  f1-score   support

         Low       0.25      0.14      0.18         7
      Medium       0.91      0.95      0.93       192
        High       0.73      0.57      0.64        28

    accuracy                           0.88       227
   macro avg       0.63      0.56      0.58       227
weighted avg       0.87      0.88      0.87       227



### 6.2 Random Forest

In [8]:
# Baseline Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf.fit(X_train_scaled, y_train)
rf_pred = rf.predict(X_test_scaled)

print("Random Forest Baseline:")
print(f"Accuracy: {accuracy_score(y_test, rf_pred):.4f}")

Random Forest Baseline:
Accuracy: 0.8722


In [9]:
# Tuned Random Forest - more trees and limit depth to prevent overfitting(small dataset)
rf_tuned = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf_tuned.fit(X_train_scaled, y_train)
rf_tuned_pred = rf_tuned.predict(X_test_scaled)

print("Random Forest Tuned (n=200, depth=15):")
print(f"Accuracy: {accuracy_score(y_test, rf_tuned_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_tuned_pred, target_names=['Low', 'Medium', 'High']))

Random Forest Tuned (n=200, depth=15):
Accuracy: 0.8855

Classification Report:
              precision    recall  f1-score   support

         Low       0.00      0.00      0.00         7
      Medium       0.89      0.98      0.94       192
        High       0.80      0.43      0.56        28

    accuracy                           0.89       227
   macro avg       0.56      0.47      0.50       227
weighted avg       0.85      0.89      0.86       227



### 6.3 Artificial Neural Network (MLP)

In [None]:
# Baseline Neural Network
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=RANDOM_STATE)
mlp.fit(X_train_scaled, y_train)
mlp_pred = mlp.predict(X_test_scaled)

print("Neural Network Baseline:")
print(f"Accuracy: {accuracy_score(y_test, mlp_pred):.4f}")

In [None]:
# Tuned Neural Network - add second layer and regularization
mlp_tuned = MLPClassifier(hidden_layer_sizes=(100, 50), alpha=0.01, max_iter=1000, random_state=RANDOM_STATE)
mlp_tuned.fit(X_train_scaled, y_train)
mlp_tuned_pred = mlp_tuned.predict(X_test_scaled)

print("Neural Network Tuned (layers=100,50, alpha=0.01):")
print(f"Accuracy: {accuracy_score(y_test, mlp_tuned_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, mlp_tuned_pred, target_names=['Low', 'Medium', 'High']))

### 6.4 Logistic Regression

In [None]:
# Baseline Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)

print("Logistic Regression Baseline:")
print(f"Accuracy: {accuracy_score(y_test, lr_pred):.4f}")

In [None]:
# Tuned Logistic Regression - adjust regularization
lr_tuned = LogisticRegression(C=0.5, max_iter=1000, random_state=RANDOM_STATE)
lr_tuned.fit(X_train_scaled, y_train)
lr_tuned_pred = lr_tuned.predict(X_test_scaled)

print("Logistic Regression Tuned (C=0.5):")
print(f"Accuracy: {accuracy_score(y_test, lr_tuned_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, lr_tuned_pred, target_names=['Low', 'Medium', 'High']))

### 6.5 K-Nearest Neighbors (KNN)

In [None]:
# Baseline KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
knn_pred = knn.predict(X_test_scaled)

print("KNN Baseline:")
print(f"Accuracy: {accuracy_score(y_test, knn_pred):.4f}")

In [None]:
# Tuned KNN - adjust k and use distance weighting
knn_tuned = KNeighborsClassifier(n_neighbors=7, weights='distance')
knn_tuned.fit(X_train_scaled, y_train)
knn_tuned_pred = knn_tuned.predict(X_test_scaled)

print("KNN Tuned (k=7, weights=distance):")
print(f"Accuracy: {accuracy_score(y_test, knn_tuned_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, knn_tuned_pred, target_names=['Low', 'Medium', 'High']))

## 7. Summary

In [None]:
print("Final Results Comparison")
print("=" * 50)
print(f"{'Model':<25} {'Baseline':<12} {'Tuned':<12}")
print("-" * 50)
print(f"{'SVM':<25} {accuracy_score(y_test, svm_pred):<12.4f} {accuracy_score(y_test, svm_tuned_pred):<12.4f}")
print(f"{'Random Forest':<25} {accuracy_score(y_test, rf_pred):<12.4f} {accuracy_score(y_test, rf_tuned_pred):<12.4f}")
print(f"{'Neural Network':<25} {accuracy_score(y_test, mlp_pred):<12.4f} {accuracy_score(y_test, mlp_tuned_pred):<12.4f}")
print(f"{'Logistic Regression':<25} {accuracy_score(y_test, lr_pred):<12.4f} {accuracy_score(y_test, lr_tuned_pred):<12.4f}")
print(f"{'KNN':<25} {accuracy_score(y_test, knn_pred):<12.4f} {accuracy_score(y_test, knn_tuned_pred):<12.4f}")