# Advanced ML Project #

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.datasets import make_classification 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline

### 1. Preprocessing of the data ###

In [None]:
path = "winequality-white.csv"
data = pd.read_csv(path, sep=";")
#print(data.describe())
#print(data["quality"].value_counts().sort_index())
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [3]:
# Create binary categories low/high wine quality
data["quality"] = np.where(data["quality"] <= 6, -1, 1)

# Train/Test split
X, y = data.drop("quality", axis=1), data["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

#print(y.value_counts().sort_index())

## 2. Binary classification ##

### 2.a Classical SVC ###

In [4]:
def evaluate_baseline_model():
    # Training
    svm_model = SVC(kernel='linear', C=1.0, random_state=10)
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)
    return y_test, y_pred

### 2.b Nyström kernel approximation SVC ###

In [5]:
def evaluate_nystrom_model(p):
    # Nyström approximation
    nystroem_approx = Nystroem(kernel='rbf', gamma=0.1, n_components=p)

    # SVM model
    svm_model = SVC(kernel='linear', C=1.0, random_state=10)

    # Combine Nystroem and LinearSVC in a pipeline
    svc_pipeline = Pipeline([
        ('nystroem', nystroem_approx),
        ('linear_svm', svm_model)
    ])

    # Fit
    svc_pipeline.fit(X_train, y_train)
    y_pred = svc_pipeline.predict(X_test)
    return y_test, y_pred

## 3. Results ##

In [6]:
import numpy as np
import time
from memory_profiler import memory_usage

In [7]:
def measure_performance(func, *args, **kwargs):
    """
    Measure the clock time and memory usage of a function.
    
    Parameters:
    func (callable): The function to measure.
    *args: Positional arguments to pass to the function.
    **kwargs: Keyword arguments to pass to the function.
    
    Returns:
    result: The result of the function call.
    clock_time: The time taken to execute the function (in seconds).
    mem_usage: The memory used by the function (in MiB).
    """
    # Measure clock time
    start_time = time.time()
    # Training of the model
    y_test, y_pred = func(*args, **kwargs)
    end_time = time.time()
    clock_time = end_time - start_time

    # Evaluate the performances
    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # # Measure memory usage
    mem_usage = memory_usage((func, args, kwargs))
    mem_usage = max(mem_usage) - min(mem_usage)
    
    return accuracy, balanced_accuracy, f1, clock_time, mem_usage

In [8]:
# Measure performance for evaluate_baseline_model
baseline_accuracy, baseline_balanced_accuracy, baseline_f1, baseline_clock_time, baseline_mem_usage \
    = measure_performance(evaluate_baseline_model)
baseline_df = pd.DataFrame([{
    'Model': 'Baseline',
    'Clock Time (seconds)': baseline_clock_time,
    'Memory Usage (MiB)': baseline_mem_usage,
    'Accuracy': baseline_accuracy,
    'Balanced Accuracy': baseline_balanced_accuracy,
    'F1 Score': baseline_f1
}])

In [9]:
# Define a function to measure performance and store results in a DataFrame
def measure_and_store_performance(model_func, params):
    results = []
    for param in params:
        accuracy, balanced_accuracy, f1, clock_time, mem_usage = measure_performance(model_func, param)
        results.append({
            'param_name': "Nystrom p=" + str(param),
            'Clock Time (seconds)': clock_time,
            'Memory Usage (MiB)': mem_usage,
            'Accuracy': accuracy,
            'Balanced Accuracy': balanced_accuracy,
            'F1 Score': f1
        })
    return pd.DataFrame(results)

# Measure performance for evaluate_nytrsom_model with different values of p
p_values = [1, 10, 100, 1000, 3428] #print(y_train.shape[0]) = 3428
nytrsom_df = measure_and_store_performance(evaluate_nystrom_model, p_values)

# Combine both DataFrames
combined_df = pd.concat([baseline_df, nytrsom_df], ignore_index=True)

In [12]:
print(combined_df)

      Model  Clock Time (seconds)  Memory Usage (MiB)  Accuracy  \
0  Baseline             39.043817            9.558594  0.787075   
1       NaN              0.105694            1.429688  0.787075   
2       NaN              0.122153            5.558594  0.787075   
3       NaN              0.415570           11.777344  0.789796   
4       NaN              5.995281           64.175781  0.810204   
5       NaN             46.173906          536.878906  0.828571   

   Balanced Accuracy  F1 Score      param_name  
0           0.500000  0.000000             NaN  
1           0.500000  0.000000     Nystrom p=1  
2           0.500000  0.000000    Nystrom p=10  
3           0.506390  0.025237   Nystrom p=100  
4           0.576454  0.275325  Nystrom p=1000  
5           0.644056  0.444934  Nystrom p=3428  


In [13]:
combined_df

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score,param_name
0,Baseline,39.043817,9.558594,0.787075,0.5,0.0,
1,,0.105694,1.429688,0.787075,0.5,0.0,Nystrom p=1
2,,0.122153,5.558594,0.787075,0.5,0.0,Nystrom p=10
3,,0.41557,11.777344,0.789796,0.50639,0.025237,Nystrom p=100
4,,5.995281,64.175781,0.810204,0.576454,0.275325,Nystrom p=1000
5,,46.173906,536.878906,0.828571,0.644056,0.444934,Nystrom p=3428


In [16]:
output_path = "winequality-white_results.csv"
combined_df.to_csv(output_path, index=False)