In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, mean_squared_error

### winequality-white dataset

In [2]:
# Location of the white-wine quality CSV (a relative path).
path = "svm/winequality-white.csv"
# The file is read with ';' as the field separator.
data = pd.read_csv(path, sep=";")
# Optional sanity checks (summary statistics, row count per quality score):
#print(data.describe())
#print(data["quality"].value_counts().sort_index())
# Show the column names; "quality" is later used as the target, the rest as features.
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [3]:
# Create binary categories low/high wine quality: scores up to 6 become -1, scores above 6 become 1.
# The labels are built from `data` without writing them back into its "quality" column.
# Overwriting that column made this cell non-idempotent: on a second run both -1 and 1
# satisfy `<= 6`, so every row would be relabelled -1.
y = pd.Series(np.where(data["quality"] <= 6, -1, 1), index=data.index, name="quality")
X = data.drop("quality", axis=1)

# Train/Test split: 30% of the rows are held out, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [4]:
# Standardize the features. The scaler is fitted on the training split only and then
# reused on the test split. Both names are rebound to the transformed data, so every
# later cell works on standardized features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit on training data and transform
X_test = scaler.transform(X_test)  # Transform only: reuse the statistics fitted on the training data

In [5]:
X_train.shape

(3428, 11)

### Performance

In [6]:
import sys
import os
sys.path.append(os.path.abspath('src'))

In [7]:
from src.approximations import RandomFourierFeatures, NystromApproximation
from src.tasks import KernelRidgeRegression
from src.utils import *

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [9]:
import time
from memory_profiler import memory_usage

In [10]:
def measure_performances(func, *args, **kwargs):
    """
    Run ``func(*args, **kwargs)`` once under ``memory_usage`` and time the call.

    Parameters:
    func: The function to execute.
    *args: Positional arguments forwarded to ``func``.
    **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
    A tuple (memory_growth, execution_time, result):
        - memory_growth: Difference between the highest and the lowest memory
          sample reported by ``memory_usage`` while ``func`` ran (in MiB).
        - execution_time: Elapsed wall-clock time (in seconds). The interval
          wraps the ``memory_usage`` call, so it includes whatever overhead the
          profiler itself adds.
        - result: The value returned by ``func``.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    # retval=True makes memory_usage hand back func's return value with the samples.
    mem_usage, result = memory_usage((func, args, kwargs), retval=True)
    execution_time = time.perf_counter() - start_time
    memory_growth = max(mem_usage) - min(mem_usage)
    return memory_growth, execution_time, result

In [11]:
# def profile_function(func, *args, **kwargs):
#     """
#     Profiles memory usage of a function and captures its return value.
    
#     :param func: The function to profile.
#     :param args: Positional arguments for the function.
#     :param kwargs: Keyword arguments for the function.
#     :return: Tuple of (memory_usage_list, function_result).
#     """
#     result_container = {}

#     def wrapper():
#         # Call the function and store the result in the container
#         result_container['result'] = func(*args, **kwargs)

#     # Monitor memory usage while executing the wrapper
#     start_time = time.time()
#     mem_usage = memory_usage(wrapper, retval=True, max_usage=True)
#     execution_time = time.time() - start_time

#     print("mem_usage", mem_usage)

#     memory_growth = max(mem_usage) - min(mem_usage)
#     return memory_growth, execution_time, result_container['result']

In [12]:
# Define a function to measure performance and store results in a DataFrame
def measure_and_store_performance(model_func, params, param_name="Number of components", model_name = None):
    """
    Profile a classification experiment once per parameter value.

    Parameters:
    model_func: Callable taking one parameter value and returning a sequence
        whose first three items are accuracy, balanced accuracy and F1 score.
    params: Iterable of values, passed one at a time to ``model_func``.
    param_name: Column label under which each value is stored (as "=<value>").
    model_name: Label for the 'Model' column. It is formatted as text, so the
        default None appears as the string "None".

    Returns:
    A DataFrame with one row per entry of ``params`` holding the label, the
    parameter, clock time, memory usage and the three scores.
    """
    rows = []
    for value in params:
        mem_mib, seconds, metrics = measure_performances(model_func, value)
        acc, bal_acc, f1_value = (metrics[position] for position in range(3))

        row = {'Model': f"{model_name}", f"{param_name}": "=" + str(value)}
        row['Clock Time (seconds)'] = seconds
        row['Memory Usage (MiB)'] = mem_mib
        row['Accuracy'] = float(acc)
        row['Balanced Accuracy'] = float(bal_acc)
        row['F1 Score'] = float(f1_value)
        rows.append(row)

    return pd.DataFrame(rows)

In [13]:
def run_nystrom_model(n_components):
    """
    Train a linear SVC on features from the project's NystromApproximation and score it.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to NystromApproximation as ``n_components``.

    Returns:
    A tuple (accuracy, balanced accuracy, F1 score) computed on the test split.
    """
    feature_map = NystromApproximation(kernel='rbf', gamma=1, n_components=n_components, random_state=42)
    train_features = feature_map.fit_transform(X_train)
    test_features = feature_map.transform(X_test)

    # The kernel is handled by the feature map, so the classifier itself is linear.
    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    scorers = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

In [14]:
def run_nystrom_baseline(n_components):
    """
    Train a linear SVC on features from scikit-learn's ``Nystroem`` and score it.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to ``Nystroem`` as ``n_components``.

    Returns:
    A tuple (accuracy, balanced accuracy, F1 score) computed on the test split.
    """
    from sklearn.kernel_approximation import Nystroem

    # NOTE(review): no kernel/gamma is passed here, whereas run_nystrom_model passes
    # kernel='rbf', gamma=1 to NystromApproximation — confirm both runs use the same
    # kernel settings before comparing their scores.
    feature_map = Nystroem(n_components=n_components, random_state=42)
    train_features = feature_map.fit_transform(X_train)
    test_features = feature_map.transform(X_test)

    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    scorers = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

In [15]:
def run_rff_model(n_components):
    """
    Train a linear SVC on features from the project's RandomFourierFeatures and score it.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to RandomFourierFeatures as ``n_components``.

    Returns:
    A tuple (accuracy, balanced accuracy, F1 score) computed on the test split.
    """
    feature_map = RandomFourierFeatures(n_components=n_components, gamma=1.0, kernel='rbf', random_state=42)
    train_features = feature_map.fit_transform(X_train)
    test_features = feature_map.transform(X_test)

    classifier = SVC(kernel='linear', C=1.0, random_state=42)
    classifier.fit(train_features, y_train)
    predictions = classifier.predict(test_features)

    scorers = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

In [16]:
def run_baseline_kernel_ridge():
    """
    Fit the project's KernelRidgeRegression with a GaussianKernel(gamma=0.5) on the
    training split and return the mean squared error on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    NOTE(review): an equivalent definition appears again under the
    "Kernel Ridge regression" heading and replaces this one once that cell runs.
    """
    # NOTE(review): gamma is 0.5 here while run_rff_kernel_ridge and
    # run_nystrom_kernel_ridge use gamma=1.0 — confirm this is intended.
    regressor = KernelRidgeRegression(kernel=GaussianKernel(gamma=0.5))
    regressor.fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

In [17]:
def run_baseline_kernel_ridge_sklearn():
    """
    Fit scikit-learn's ``KernelRidge`` (kernel='rbf', every other setting left at its
    default) on the training split and return the mean squared error on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    NOTE(review): an equivalent definition appears again under the
    "Kernel Ridge regression" heading and replaces this one once that cell runs.
    """
    from sklearn.kernel_ridge import KernelRidge

    regressor = KernelRidge(kernel='rbf')
    regressor.fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

In [18]:
# Component counts to sweep. The last entry uses as many components as there are
# training samples, so it is read from X_train (3428 rows with the current split)
# rather than hard-coded; it stays correct if the split changes.
p_values = [1, 10, 100, 1000, X_train.shape[0]]
nytrsom_df = measure_and_store_performance(run_nystrom_model, p_values, model_name="Nystrom")

In [19]:
nytrsom_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score
0,Nystrom,=1,0.308917,19.0,0.787075,0.5,0.0
1,Nystrom,=10,0.359451,2.378906,0.787075,0.5,0.0
2,Nystrom,=100,2.478507,45.054688,0.795238,0.520335,0.079511
3,Nystrom,=1000,26.013391,86.289062,0.821088,0.595021,0.323907
4,Nystrom,=3428,148.058461,712.761719,0.868027,0.723889,0.604082


In [20]:
# Profile run_rff_model once per entry of p_values; rows are labelled "RFF".
rff_df = measure_and_store_performance(run_rff_model, p_values, model_name="RFF")

In [21]:
rff_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score
0,RFF,=1,0.239182,0.0,0.787075,0.5,0.0
1,RFF,=10,0.278221,0.0,0.787075,0.5,0.0
2,RFF,=100,0.326655,0.023438,0.787075,0.5,0.0
3,RFF,=1000,5.128077,70.617188,0.814966,0.616768,0.384615
4,RFF,=3428,18.46604,179.09375,0.842857,0.663619,0.487805


In [22]:
# Profile run_nystrom_baseline (scikit-learn's Nystroem) once per entry of p_values.
nystrom_baseline_df = measure_and_store_performance(run_nystrom_baseline, p_values, model_name="Nystrom_baseline")

In [23]:
nystrom_baseline_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score
0,Nystrom_baseline,=1,0.237437,0.003906,0.787075,0.5,0.0
1,Nystrom_baseline,=10,0.272858,0.0,0.787075,0.5,0.0
2,Nystrom_baseline,=100,0.411893,18.425781,0.819048,0.63451,0.424242
3,Nystrom_baseline,=1000,5.394924,47.667969,0.822449,0.647158,0.450526
4,Nystrom_baseline,=3428,39.607424,517.503906,0.82517,0.652383,0.461216


In [24]:
def run_baseline_kernel():
    """
    Train an SVC on the full (non-approximated) Gaussian kernel matrix and score it.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Returns:
    A tuple (accuracy, balanced accuracy, F1 score) computed on the test split.
    """
    kernel = GaussianKernel(gamma=1)

    # Kernel matrix of the training set with itself, and of training vs. test points.
    gram_train = kernel.get_Kxx(X_train)
    gram_cross = kernel.get_Kxz(X_train, X_test)

    # kernel='precomputed': the SVC receives kernel values instead of raw features.
    classifier = SVC(kernel='precomputed', C=1.0, random_state=42)
    classifier.fit(gram_train, y_train)

    # The cross matrix is transposed before prediction — presumably get_Kxz returns it
    # as (train, test) and the SVC wants one row per test sample; verify against get_Kxz.
    predictions = classifier.predict(gram_cross.T)

    scorers = (accuracy_score, balanced_accuracy_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

In [25]:
# Profile the exact-kernel SVM once. run_baseline_kernel returns
# (accuracy, balanced accuracy, F1 score), which comes back here as `scores`.
baseline_mem_usage, baseline_clock_time, scores = measure_performances(run_baseline_kernel)
accuracy, balanced_accuracy, f1 = scores

# Single-row summary using the same column names as the sweep tables.
baseline_df = pd.DataFrame(
    [
        {
            'Model': 'Baseline',
            'Clock Time (seconds)': baseline_clock_time,
            'Memory Usage (MiB)': baseline_mem_usage,
            'Accuracy': float(accuracy),
            'Balanced Accuracy': float(balanced_accuracy),
            'F1 Score': float(f1),
        }
    ]
)

In [26]:
# Stack the baseline row on top of the Nystrom sweep; ignore_index renumbers the rows.
# The baseline row has no "Number of components" entry, so that cell is left missing.
nytrsom_vs_baseline = pd.concat([baseline_df, nytrsom_df], ignore_index=True)

In [27]:
nytrsom_vs_baseline

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score,Number of components
0,Baseline,59.229833,128.027344,0.868027,0.723889,0.604082,
1,Nystrom,0.308917,19.0,0.787075,0.5,0.0,=1
2,Nystrom,0.359451,2.378906,0.787075,0.5,0.0,=10
3,Nystrom,2.478507,45.054688,0.795238,0.520335,0.079511,=100
4,Nystrom,26.013391,86.289062,0.821088,0.595021,0.323907,=1000
5,Nystrom,148.058461,712.761719,0.868027,0.723889,0.604082,=3428


In [28]:
# Stack the baseline row on top of the RFF sweep; ignore_index renumbers the rows.
# The baseline row has no "Number of components" entry, so that cell is left missing.
rff_vs_baseline = pd.concat([baseline_df, rff_df], ignore_index=True)

In [29]:
rff_vs_baseline

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score,Number of components
0,Baseline,59.229833,128.027344,0.868027,0.723889,0.604082,
1,RFF,0.239182,0.0,0.787075,0.5,0.0,=1
2,RFF,0.278221,0.0,0.787075,0.5,0.0,=10
3,RFF,0.326655,0.023438,0.787075,0.5,0.0,=100
4,RFF,5.128077,70.617188,0.814966,0.616768,0.384615,=1000
5,RFF,18.46604,179.09375,0.842857,0.663619,0.487805,=3428


## Kernel Ridge regression

In [30]:
def measure_and_store_performance_ridge(model_func, params, param_name="Number of components", model_name = None):
    """
    Profile a regression experiment once per parameter value.

    Parameters:
    model_func: Callable taking one parameter value and returning a single
        number (stored as the mean squared error).
    params: Iterable of values, passed one at a time to ``model_func``.
    param_name: Column label under which each value is stored (as "=<value>").
    model_name: Label for the 'Model' column. It is formatted as text, so the
        default None appears as the string "None".

    Returns:
    A DataFrame with one row per entry of ``params`` holding the label, the
    parameter, clock time, memory usage and the error.
    """
    rows = []
    for value in params:
        mem_mib, seconds, error = measure_performances(model_func, value)

        row = {'Model': f"{model_name}", f"{param_name}": "=" + str(value)}
        row['Clock Time (seconds)'] = seconds
        row['Memory Usage (MiB)'] = mem_mib
        row['Mean Squared Error'] = float(error)
        rows.append(row)

    return pd.DataFrame(rows)

In [31]:
def run_baseline_kernel_ridge_sklearn():
    """
    Fit scikit-learn's ``KernelRidge`` (kernel='rbf', every other setting left at its
    default) on the training split and return the mean squared error on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    NOTE(review): this repeats a definition given earlier in the notebook (before
    the SVM experiments); this later one is the one in effect from here on.
    """
    from sklearn.kernel_ridge import KernelRidge

    regressor = KernelRidge(kernel='rbf')
    regressor.fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

In [32]:
def run_baseline_kernel_ridge():
    """
    Fit the project's KernelRidgeRegression with a GaussianKernel(gamma=0.5) on the
    training split and return the mean squared error on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    NOTE(review): this repeats a definition given earlier in the notebook (before
    the SVM experiments); this later one is the one in effect from here on.
    """
    # NOTE(review): gamma is 0.5 here while run_rff_kernel_ridge and
    # run_nystrom_kernel_ridge use gamma=1.0 — confirm this is intended.
    regressor = KernelRidgeRegression(kernel=GaussianKernel(gamma=0.5))
    regressor.fit(X_train, y_train)
    return mean_squared_error(y_test, regressor.predict(X_test))

In [33]:
def run_rff_kernel_ridge(n_components):
    """
    Fit KernelRidgeRegression on RandomFourierFeatures-transformed data and return
    the mean squared error on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to RandomFourierFeatures as ``n_components``.

    Returns:
    The mean squared error between y_test and the predictions.
    """
    feature_map = RandomFourierFeatures(n_components=n_components, kernel="rbf", random_state=42, gamma=1.0)
    feature_map.fit(X_train)

    # Both splits go through the map fitted on the training data.
    train_features = feature_map.transform(X_train)
    test_features = feature_map.transform(X_test)

    # NOTE(review): the transformed training matrix is passed as `kernel`, whereas
    # run_baseline_kernel_ridge passes a GaussianKernel object — confirm that
    # KernelRidgeRegression accepts an array here and treats it as intended.
    regressor = KernelRidgeRegression(kernel=train_features)
    regressor.fit(train_features, y_train)

    return mean_squared_error(y_test, regressor.predict(test_features))

In [34]:
def run_nystrom_kernel_ridge(n_components):
    """
    Fit KernelRidgeRegression on NystromApproximation-transformed data and return
    the mean squared error on the test split.

    Reads the notebook-level X_train, X_test, y_train and y_test.

    Parameters:
    n_components: Forwarded to NystromApproximation as ``n_components``.

    Returns:
    The mean squared error between y_test and the predictions.
    """
    feature_map = NystromApproximation(n_components=n_components, kernel="rbf", random_state=42, gamma=1.0)
    feature_map.fit(X_train)

    # Both splits go through the map fitted on the training data.
    train_features = feature_map.transform(X_train)
    test_features = feature_map.transform(X_test)

    # NOTE(review): the transformed training matrix is passed as `kernel`, whereas
    # run_baseline_kernel_ridge passes a GaussianKernel object — confirm that
    # KernelRidgeRegression accepts an array here and treats it as intended.
    regressor = KernelRidgeRegression(kernel=train_features)
    regressor.fit(train_features, y_train)

    return mean_squared_error(y_test, regressor.predict(test_features))

In [35]:
# Component counts to sweep. The last entry uses as many components as there are
# training samples, so it is read from X_train (3428 rows with the current split)
# rather than hard-coded; it stays correct if the split changes.
p_values = [1, 10, 100, 1000, X_train.shape[0]]
nystrom_ridge_df = measure_and_store_performance_ridge(run_nystrom_kernel_ridge, p_values, model_name="Nystrom Kernel Ridge")

In [36]:
nystrom_ridge_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,Nystrom Kernel Ridge,=1,3.639736,458.992188,0.670498
1,Nystrom Kernel Ridge,=10,3.970834,448.28125,0.663702
2,Nystrom Kernel Ridge,=100,5.520784,459.804688,0.598257
3,Nystrom Kernel Ridge,=1000,24.297197,500.429688,0.480758
4,Nystrom Kernel Ridge,=3428,119.465353,759.511719,0.381449


In [37]:
# Profile run_rff_kernel_ridge once per entry of p_values; rows are labelled "RFF Kernel Ridge".
rff_ridge_df = measure_and_store_performance_ridge(run_rff_kernel_ridge, p_values, model_name="RFF Kernel Ridge")

**FIXME — abnormal performance:** the RFF Kernel Ridge error does not decrease as the number of components grows (see the table below).

In [38]:
rff_ridge_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,RFF Kernel Ridge,=1,3.895092,448.253906,0.671153
1,RFF Kernel Ridge,=10,3.702591,452.265625,0.669555
2,RFF Kernel Ridge,=100,3.677833,448.277344,0.64199
3,RFF Kernel Ridge,=1000,4.009969,504.429688,0.712394
4,RFF Kernel Ridge,=3428,4.663168,666.457031,0.717537


**Warning:** do not use this DataFrame (`rff_ridge_df`).

In [39]:
# Profile the exact-kernel ridge regression once; the third value is the MSE
# returned by run_baseline_kernel_ridge.
# Note: this rebinds baseline_mem_usage and baseline_clock_time, which the SVM
# baseline cell also assigns.
baseline_mem_usage, baseline_clock_time, mse = measure_performances(run_baseline_kernel_ridge)

# Single-row summary using the same column names as the ridge sweep tables.
baseline_df_ridge = pd.DataFrame(
    [
        {
            'Model': 'Baseline',
            'Clock Time (seconds)': baseline_clock_time,
            'Memory Usage (MiB)': baseline_mem_usage,
            'Mean Squared Error': float(mse),
        }
    ]
)

In [40]:
baseline_df_ridge

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,Baseline,60.68605,452.640625,0.403872


In [41]:
# Stack the ridge baseline row on top of the Nystrom ridge sweep; ignore_index renumbers the rows.
# The baseline row has no "Number of components" entry, so that cell is left missing.
nystrom_vs_baseline_ridge = pd.concat([baseline_df_ridge, nystrom_ridge_df], ignore_index=True)

In [42]:
nystrom_vs_baseline_ridge

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error,Number of components
0,Baseline,60.68605,452.640625,0.403872,
1,Nystrom Kernel Ridge,3.639736,458.992188,0.670498,=1
2,Nystrom Kernel Ridge,3.970834,448.28125,0.663702,=10
3,Nystrom Kernel Ridge,5.520784,459.804688,0.598257,=100
4,Nystrom Kernel Ridge,24.297197,500.429688,0.480758,=1000
5,Nystrom Kernel Ridge,119.465353,759.511719,0.381449,=3428
