In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


In [24]:
df = pd.read_csv('WineQT.csv')
eda_df = df.copy()
df = df.drop(columns=['Id'])

In [25]:
# ─── EDA ────────────────────────────────────────────────────────────────────
# Data types
print("\nData Types:\n", eda_df.dtypes)

# Missing values
print("\nMissing Values:\n", eda_df.isnull().sum())

# Descriptive stats
print("\nDescriptive Statistics:\n", eda_df.describe())

# Duplicates
print("\nNumber of Duplicate Rows:", eda_df.duplicated().sum())

# Class balance
print("\nClass Balance (quality):\n", eda_df['quality'].value_counts().sort_index())

# Correlation matrix
correlation_matrix = eda_df.corr(numeric_only=True)
print("\nCorrelation Matrix:\n", correlation_matrix)

# Create EDA visuals folder
os.makedirs("eda_outputs", exist_ok=True)

# Histograms
for col in eda_df.select_dtypes(include='float64').columns:
    plt.figure(figsize=(6, 4))
    sns.histplot(eda_df[col], kde=True, bins=30)
    plt.title(f'Histogram of {col}')
    plt.tight_layout()
    plt.savefig(f"eda_outputs/histogram_{col}.png")
    plt.close()

# Boxplots
for col in eda_df.select_dtypes(include='float64').columns:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=eda_df[col])
    plt.title(f'Boxplot of {col}')
    plt.tight_layout()
    plt.savefig(f"eda_outputs/boxplot_{col}.png")
    plt.close()

# Correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig("eda_outputs/correlation_heatmap.png")
plt.close()

# Class balance plot
plt.figure(figsize=(6, 4))
sns.countplot(x='quality', data=eda_df, palette='Set2')
plt.title('Class Balance of Quality')
plt.tight_layout()
plt.savefig("eda_outputs/class_balance.png")
plt.close()


Data Types:
 fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
Id                        int64
dtype: object

Missing Values:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64

Descriptive Statistics:
        fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1143.000000       1143.000000  1143.000000     1143.000000   
mean        8.31111


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x='quality', data=eda_df, palette='Set2')


In [26]:
# ─── SPLIT FEATURES AND TARGET ─────────────────────────────────────────────
X = df.drop('quality', axis=1)
y = df['quality']
numeric_cols = X.columns.tolist()

# ─── PREPROCESSING ─────────────────────────────────────────────────────────
num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, numeric_cols)
])

X_processed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

In [21]:
# ─── MODEL TRAINING AND EVALUATION ─────────────────────────────────────────
models = {
    'Decision Tree': (
        DecisionTreeClassifier(random_state=42),
        {
            'clf__max_depth': [None, 5, 10],
            'clf__min_samples_split': [2, 5, 10]
        }
    ),
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        {
            'clf__n_estimators': [50, 100],
            'clf__max_depth': [None, 5, 10]
        }
    ),
    'KNN': (
        KNeighborsClassifier(),
        {'clf__n_neighbors': [3, 5, 7]}
    ),
    'Logistic Regression': (
        LogisticRegression(max_iter=1000, random_state=42),
        {'clf__C': [0.01, 0.1, 1, 10]}
    )
}

results = []

for model_name, (model, param_grid) in models.items():
    pipe = Pipeline([
        ('clf', model)
    ])
    grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    results.append((model_name, grid.best_params_, acc, f1))

                 Model                                        Best Params  \
0        Decision Tree  {'clf__max_depth': 10, 'clf__min_samples_split...   
1        Random Forest  {'clf__max_depth': None, 'clf__n_estimators': ...   
2                  KNN                            {'clf__n_neighbors': 7}   
3  Logistic Regression                                     {'clf__C': 10}   

   Accuracy  F1 Score  
0  0.550218  0.551190  
1  0.694323  0.680455  
2  0.602620  0.590095  
3  0.633188  0.622892  


In [27]:
# ─── DISPLAY RESULTS ───────────────────────────────────────────────────────
results_df = pd.DataFrame(results, columns=['Model', 'Best Params', 'Accuracy', 'F1 Score'])
print("\nModel Comparison:\n", results_df)


Model Comparison:
                  Model                                        Best Params  \
0        Decision Tree  {'clf__max_depth': 10, 'clf__min_samples_split...   
1        Random Forest  {'clf__max_depth': None, 'clf__n_estimators': ...   
2                  KNN                            {'clf__n_neighbors': 7}   
3  Logistic Regression                                     {'clf__C': 10}   

   Accuracy  F1 Score  
0  0.550218  0.551190  
1  0.694323  0.680455  
2  0.602620  0.590095  
3  0.633188  0.622892  
