In [1]:
import numpy as np
import os
import time
from datetime import datetime

from src.data_preprocessing import load_data, create_binary_labels, preprocess_data
from src.logistic_regression import LogisticRegression, KernelLogisticRegression
from src.svm import SVM, KernelSVM
from src.kernels import create_kernel_function
from src.evaluation import (
    calculate_metrics, 
    print_metrics,
    plot_confusion_matrix,
    plot_training_curves,
    plot_all_metrics_comparison,
    cross_validate,
    analyze_misclassifications
)

# Ensure the output directory for saved results exists before anything
# tries to write into it. `exist_ok=True` makes this safe to re-run (e.g. on
# Restart & Run All) and avoids the check-then-create race of testing
# os.path.exists() first. If 'results' exists but is not a directory, this
# raises FileExistsError instead of silently continuing.
os.makedirs('results', exist_ok=True)

## Load and Preprocess Data

In [2]:
# Load the red and white wine CSVs and convert the result to a binary
# classification problem in one step (threshold=6).
wines = create_binary_labels(
    load_data('data/winequality-red.csv', 'data/winequality-white.csv'),
    threshold=6,
)

# Split into train/test sets and normalize. The call also returns `mean`
# and `std` -- presumably the normalization statistics; confirm against
# src.data_preprocessing.preprocess_data.
split_options = dict(test_size=0.2, random_state=42, normalize=True)
X_train, X_test, y_train, y_test, mean, std = preprocess_data(wines, **split_options)

# Report the shapes of the resulting feature matrices.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Loaded 1599 red wines and 4898 white wines
Total dataset size: 6497 samples

Dataset shape: (6497, 13)

Binary label distribution:
Bad wines (quality < 6): 2384 (36.7%)
Good wines (quality >= 6): 4113 (63.3%)

Train set size: 5199 samples
Test set size: 1298 samples
Train label distribution: Bad=1908, Good=3291
Test label distribution: Bad=476, Good=822

Features normalized (standardization: mean=0, std=1)
Training set: (5199, 12)
Test set: (1298, 12)
