In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

### winequality-white dataset

In [None]:
# Read the white-wine quality CSV; this file uses ';' as its field separator.
# The path is relative, so it resolves against the kernel's working directory.
path = "svm/winequality-white.csv"
data = pd.read_csv(path, sep=";")
# Optional exploration (disabled): summary statistics and per-grade row counts.
#print(data.describe())
#print(data["quality"].value_counts().sort_index())
# Show the column labels of the loaded frame as the cell output.
data.columns

In [3]:
# Binary target: quality grades <= 6 become -1 (low), grades above 6 become +1 (high).
# The labels are built as a separate Series instead of being written back into
# data["quality"]: overwriting the column made this cell non-idempotent, because a
# second run would threshold the already-binarized values and turn every label
# into -1 (both -1 and 1 are <= 6).
y = pd.Series(np.where(data["quality"] <= 6, -1, 1), index=data.index, name="quality")
X = data.drop("quality", axis=1)

# Train/Test split: 30% of the rows are held out, with a fixed seed so the
# partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [None]:
# Debug check: show the container type of X_train as returned by train_test_split.
type(X_train)

In [5]:
# Standardize the features. The scaler is fitted on the training split only and
# that same fitted scaler is then applied to both splits, so nothing about the
# test rows influences the scaling parameters.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Debug check: show the container type of X_train after it was replaced by the scaler output.
type(X_train)

### Performance

In [7]:
import sys
import os
# Append the absolute path of the local 'src' directory (resolved against the
# current working directory) to the module search path.
# NOTE(review): the project imports in the following cell use the `src.` package
# prefix, which is resolved from the directory that contains `src`, not from
# `src` itself — confirm whether this append is actually needed.
sys.path.append(os.path.abspath('src'))

In [8]:
from src.approximations import RandomFourierFeatures, NystromApproximation
from src.tasks import KernelRidgeRegression
from src.utils import *

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [10]:
import time
from memory_profiler import memory_usage

In [28]:
def measure_performances(func, *args, **kwargs):
    """
    Measures memory growth and execution time of one call to a function and
    hands back the function's own return value as well.

    The call is executed through memory_profiler's memory_usage, which samples
    the memory while func runs.

    Parameters:
    func: The function to execute.
    *args: Positional arguments for the function.
    **kwargs: Keyword arguments for the function.

    Returns:
    A tuple (memory_growth, execution_time, result):
        - memory_growth: Highest minus lowest memory sample taken during the call (in MiB).
        - execution_time: Elapsed time around the profiled call (in seconds);
          it includes the overhead of the memory sampling itself.
        - result: Whatever func returned.
    """
    # perf_counter is a monotonic clock meant for measuring intervals; time.time()
    # follows the system clock and can jump if that clock is adjusted.
    start_time = time.perf_counter()
    mem_usage, result = memory_usage((func, args, kwargs), retval=True)  # samples + return value
    execution_time = time.perf_counter() - start_time
    memory_growth = max(mem_usage) - min(mem_usage)  # spread of the samples, not the absolute peak
    return memory_growth, execution_time, result

In [12]:
def profile_function(func, *args, **kwargs):
    """
    Profiles memory growth and execution time of a function and captures its return value.

    :param func: The function to profile.
    :param args: Positional arguments for the function.
    :param kwargs: Keyword arguments for the function.
    :return: Tuple of (memory_growth, execution_time, function_result), where
        memory_growth is the highest minus the lowest memory sample (MiB, as
        reported by memory_profiler) and execution_time is in seconds
        (it includes the sampling overhead).
    """
    result_container = {}

    def wrapper():
        # Call the function and store the result in the container
        result_container['result'] = func(*args, **kwargs)

    # Monitor memory usage while executing the wrapper.
    # memory_usage is called without retval/max_usage so that it returns the plain
    # list of samples. With both flags set it returns a (peak, return_value) pair,
    # and since wrapper returns None the max()/min() below would compare a float
    # with None and raise TypeError.
    start_time = time.perf_counter()
    mem_usage = memory_usage((wrapper, (), {}))
    execution_time = time.perf_counter() - start_time

    memory_growth = max(mem_usage) - min(mem_usage)
    return memory_growth, execution_time, result_container['result']

In [36]:
def measure_and_store_performance(model_func, params, param_name="Number of components", model_name=None):
    """
    Profile model_func once per entry of params and tabulate the outcomes.

    Parameters:
    model_func: Callable taking one parameter value and returning a sequence
        whose first three items are accuracy, balanced accuracy and F1 score.
    params: Iterable of parameter values; each one is passed to model_func.
    param_name: Column label under which the parameter value is stored.
    model_name: Label written to the 'Model' column (formatted as a string).

    Returns:
    A pandas DataFrame with one row per parameter value, holding the model
    label, the parameter value prefixed with "=", the elapsed time, the memory
    growth and the three scores converted to float.
    """
    def _profile_one(value):
        # One profiled run -> one table row.
        mem_mib, seconds, metrics = measure_performances(model_func, value)
        return {
            'Model' : f"{model_name}",
            f"{param_name}": "=" + str(value),
            'Clock Time (seconds)': seconds,
            'Memory Usage (MiB)': mem_mib,
            'Accuracy': float(metrics[0]),
            'Balanced Accuracy': float(metrics[1]),
            'F1 Score': float(metrics[2]),
        }

    return pd.DataFrame([_profile_one(value) for value in params])

In [20]:
def run_nystrom_model(n_components):
    """
    Train a linear SVM on the features produced by NystromApproximation and
    score it on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to NystromApproximation as n_components.

    Returns:
    A tuple (accuracy, balanced_accuracy, f1) computed against y_test.
    """
    # The feature map is fitted on the training split and reused for the test split.
    feature_map = NystromApproximation(kernel='rbf', gamma=1, n_components=n_components, random_state=42)
    train_features = feature_map.fit_transform(X_train)
    test_features = feature_map.transform(X_test)

    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    metrics = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(metric(y_test, predictions) for metric in metrics)

In [21]:
def run_rff_model(n_components):
    """
    Train a linear SVM on the features produced by RandomFourierFeatures and
    score it on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to RandomFourierFeatures as n_components.

    Returns:
    A tuple (accuracy, balanced_accuracy, f1) computed against y_test.
    """
    # The feature map is fitted on the training split and reused for the test split.
    feature_map = RandomFourierFeatures(n_components=n_components, gamma=1.0, kernel='rbf', random_state=42)
    train_features = feature_map.fit_transform(X_train)
    test_features = feature_map.transform(X_test)

    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    metrics = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(metric(y_test, predictions) for metric in metrics)

In [39]:
# Component counts swept for both the Nyström and the RFF runs.
# NOTE(review): 3428 is presumably the number of training rows (the largest
# meaningful component count) — confirm against X_train.shape[0].
p_values = [1, 10, 100, 1000, 3428]
# Profile the Nyström pipeline once per component count; one table row per run.
# (The variable name is a misspelling of "nystrom"; later cells refer to it as written.)
nytrsom_df = measure_and_store_performance(run_nystrom_model, p_values, model_name="Nystrom")

In [None]:
# Display the Nyström sweep results.
nytrsom_df

In [37]:
# Profile the RFF pipeline once per component count in p_values; one table row per run.
rff_df = measure_and_store_performance(run_rff_model, p_values, model_name="RFF")

In [None]:
# Display the RFF sweep results.
rff_df

In [41]:
def run_baseline_kernel():
    """
    Train an SVM on kernel matrices computed by GaussianKernel (no feature
    approximation) and score it on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Returns:
    A tuple (accuracy, balanced_accuracy, f1) computed against y_test.
    """
    kernel = GaussianKernel(gamma=1)

    # Kernel matrix of the training set against itself, and of train against test.
    gram_train = kernel.get_Kxx(X_train)
    gram_cross = kernel.get_Kxz(X_train, X_test)

    # kernel='precomputed' tells SVC that it receives kernel values, not raw features.
    classifier = SVC(kernel='precomputed', C=1.0, random_state=42)
    classifier.fit(gram_train, y_train)

    # The cross matrix is transposed before prediction — presumably get_Kxz returns
    # it with training rows first while predict wants one row per test sample; confirm.
    predictions = classifier.predict(gram_cross.T)

    metrics = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(metric(y_test, predictions) for metric in metrics)

In [32]:
# Profile the full-kernel baseline once and pull out its three scores.
baseline_mem_usage, baseline_clock_time, scores = measure_performances(run_baseline_kernel)
accuracy = scores[0]
balanced_accuracy = scores[1]
f1 = scores[2]

# Single-row table using the same metric column labels as the sweep tables.
baseline_df = pd.DataFrame({
    'Model': ['Baseline'],
    'Clock Time (seconds)': [baseline_clock_time],
    'Memory Usage (MiB)': [baseline_mem_usage],
    'Accuracy': [float(accuracy)],
    'Balanced Accuracy': [float(balanced_accuracy)],
    'F1 Score': [float(f1)],
})

In [None]:
# Stack the baseline row on top of the Nyström sweep rows; ignore_index renumbers
# the rows from 0. Columns that exist in only one frame (baseline_df has no
# component-count column) are filled with NaN for the other frame's rows.
nytrsom_vs_baseline = pd.concat([baseline_df, nytrsom_df], ignore_index=True)

In [None]:
# Display the baseline-vs-Nyström comparison table.
nytrsom_vs_baseline

In [None]:
# Stack the baseline row on top of the RFF sweep rows; ignore_index renumbers
# the rows from 0. Columns that exist in only one frame (baseline_df has no
# component-count column) are filled with NaN for the other frame's rows.
rff_vs_baseline = pd.concat([baseline_df, rff_df], ignore_index=True)

In [None]:
# Display the baseline-vs-RFF comparison table.
rff_vs_baseline