In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, mean_squared_error

### winequality-white dataset

In [None]:
path = "winequality-white.csv"
data = pd.read_csv(path, sep=";")
#print(data.describe())
#print(data["quality"].value_counts().sort_index())
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [3]:
# Create binary categories low/high wine quality
data["quality"] = np.where(data["quality"] <= 6, -1, 1)

# Train/Test split
X, y = data.drop("quality", axis=1), data["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [4]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit on training data and transform
X_test = scaler.transform(X_test) 

In [5]:
X_train.shape

(3428, 11)

### Performance

In [6]:
import sys
import os

# Dynamically determine the path to the 'src' directory
notebook_dir = os.path.dirname(os.path.abspath('__file__'))  # Current notebook directory
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))  # Move up one level
src_path = os.path.join(project_root, 'src')  # Path to src directory

# Add the src path to sys.path
sys.path.append(os.path.abspath(src_path))

In [7]:
from approximations import RandomFourierFeatures, NystromApproximation
from tasks import KernelRidgeRegression
from utils import *

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import time
from memory_profiler import memory_usage

In [10]:
def measure_performances(func, *args, **kwargs):  # doest not allow the acces to the output of the function
    """
    Measures both memory usage and execution time of a function.
    
    Parameters:
    func: The function to execute.
    *args: Positional arguments for the function.
    **kwargs: Keyword arguments for the function.
    
    Returns:
    A tuple (memory_growth, execution_time):
        - memory_growth: Peak memory used by the function (in MiB).
        - execution_time: Time taken to execute the function (in seconds).
    """
    start_time = time.time()  # Record the start time
    mem_usage, result = memory_usage((func, args, kwargs), retval=True)  # Measure memory usage
    execution_time = time.time() - start_time  # Calculate elapsed time
    memory_growth = max(mem_usage) - min(mem_usage)  # Calculate memory growth
    return memory_growth, execution_time, result

In [11]:
# def profile_function(func, *args, **kwargs):
#     """
#     Profiles memory usage of a function and captures its return value.
    
#     :param func: The function to profile.
#     :param args: Positional arguments for the function.
#     :param kwargs: Keyword arguments for the function.
#     :return: Tuple of (memory_usage_list, function_result).
#     """
#     result_container = {}

#     def wrapper():
#         # Call the function and store the result in the container
#         result_container['result'] = func(*args, **kwargs)

#     # Monitor memory usage while executing the wrapper
#     start_time = time.time()
#     mem_usage = memory_usage(wrapper, retval=True, max_usage=True)
#     execution_time = time.time() - start_time

#     print("mem_usage", mem_usage)

#     memory_growth = max(mem_usage) - min(mem_usage)
#     return memory_growth, execution_time, result_container['result']

In [12]:
# Define a function to measure performance and store results in a DataFrame
def measure_and_store_performance(model_func, params, param_name="Number of components", model_name = None):
    results = []
    for param in params:
        memory_growth, execution_time, scores = measure_performances(model_func, param)
        accuracy, balanced_accuracy, f1 = scores[0], scores[1], scores[2]
        results.append({
            'Model' : f"{model_name}",
            f"{param_name}": "=" + str(param),
            'Clock Time (seconds)': execution_time,
            'Memory Usage (MiB)': memory_growth,
            'Accuracy': float(accuracy),
            'Balanced Accuracy': float(balanced_accuracy),
            'F1 Score': float(f1)
        })
    return pd.DataFrame(results)

In [13]:
def run_nystrom_model(n_components):
    # Nyström approximation    
    nystroem = NystromApproximation(kernel='rbf', gamma=1, n_components=n_components, random_state=42)

    X_train_nystroem = nystroem.fit_transform(X_train)
    X_test_nystroem = nystroem.transform(X_test)

    svm_nystrom = SVC(kernel='linear', C=1.0, random_state=42)
    svm_nystrom.fit(X_train_nystroem, y_train)

    y_pred_nystrom = svm_nystrom.predict(X_test_nystroem)

    return accuracy_score(y_test, y_pred_nystrom), balanced_accuracy_score(y_test, y_pred_nystrom), f1_score(y_test, y_pred_nystrom)

In [14]:
def run_nystrom_baseline(n_components):
    from sklearn.kernel_approximation import Nystroem
    # Nyström approximation    
    nystroem = Nystroem(n_components=n_components, random_state=42)
    X_train_nystroem = nystroem.fit_transform(X_train)
    X_test_nystroem = nystroem.transform(X_test)

    svm_nystrom = SVC(kernel='linear', C=1.0, random_state=42)
    svm_nystrom.fit(X_train_nystroem, y_train)

    y_pred_nystrom = svm_nystrom.predict(X_test_nystroem)

    return accuracy_score(y_test, y_pred_nystrom), balanced_accuracy_score(y_test, y_pred_nystrom), f1_score(y_test, y_pred_nystrom)

In [15]:
def run_rff_model(n_components):
    rff = RandomFourierFeatures(n_components=n_components, gamma=1.0, kernel='rbf', random_state=42)
    X_train_rff = rff.fit_transform(X_train)
    X_test_rff = rff.transform(X_test)

    svm_rff = SVC(kernel='linear', C=1.0, random_state=42)
    svm_rff.fit(X_train_rff, y_train)
    y_pred_rff = svm_rff.predict(X_test_rff)

    return accuracy_score(y_test, y_pred_rff), balanced_accuracy_score(y_test, y_pred_rff), f1_score(y_test, y_pred_rff)

In [16]:
p_values = [10, 100, 500, 1000, 2000, 3000]

In [17]:
nytrsom_df = measure_and_store_performance(run_nystrom_model, p_values, model_name="Nystrom")

In [18]:
nytrsom_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score
0,Nystrom,=10,0.585263,10.003906,0.787075,0.5,0.0
1,Nystrom,=100,3.853825,284.050781,0.795238,0.520335,0.079511
2,Nystrom,=500,23.431118,46.730469,0.805442,0.551288,0.19209
3,Nystrom,=1000,119.446229,66.960938,0.821088,0.595021,0.323907
4,Nystrom,=2000,391.005656,223.894531,0.848299,0.668241,0.498876
5,Nystrom,=3000,768.571542,485.820312,0.861224,0.70675,0.573222


In [19]:
rff_df = measure_and_store_performance(run_rff_model, p_values, model_name="RFF")

In [20]:
rff_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score
0,RFF,=10,0.440636,0.007812,0.787075,0.5,0.0
1,RFF,=100,0.471053,2.5,0.787075,0.5,0.0
2,RFF,=500,2.051641,2.71875,0.791837,0.527496,0.12069
3,RFF,=1000,5.983666,32.945312,0.814966,0.616768,0.384615
4,RFF,=2000,14.360994,55.261719,0.831293,0.652776,0.463203
5,RFF,=3000,22.60671,156.675781,0.845578,0.672339,0.505447


In [21]:
nystrom_baseline_df = measure_and_store_performance(run_nystrom_baseline, p_values, model_name="Nystrom_baseline")

In [22]:
nystrom_baseline_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score
0,Nystrom_baseline,=10,0.382352,0.0,0.787075,0.5,0.0
1,Nystrom_baseline,=100,2.038634,2.539062,0.819048,0.63451,0.424242
2,Nystrom_baseline,=500,14.358615,2.953125,0.822449,0.645993,0.448203
3,Nystrom_baseline,=1000,96.858542,25.203125,0.822449,0.647158,0.450526
4,Nystrom_baseline,=2000,342.470441,174.226562,0.82517,0.652383,0.461216
5,Nystrom_baseline,=3000,650.584729,400.191406,0.82517,0.652383,0.461216


In [23]:
def run_baseline_kernel():
    # Complete kernel    
    gaussian_kernel = GaussianKernel(gamma=1)

    # Compute the full kernel matrices
    K_train = gaussian_kernel.get_Kxx(X_train)
    K_test = gaussian_kernel.get_Kxz(X_train, X_test)

    # Train the SVM using the precomputed kernel
    svm = SVC(kernel='precomputed', C=1.0, random_state=42)
    svm.fit(K_train, y_train)

    # Predict on the test set using the precomputed kernel
    y_pred = svm.predict(K_test.T)

    return accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

In [24]:
baseline_mem_usage, baseline_clock_time, scores = measure_performances(run_baseline_kernel)
accuracy, balanced_accuracy, f1 = scores[0], scores[1], scores[2]

baseline_df = pd.DataFrame([{
    'Model': 'Baseline',
    'Clock Time (seconds)': baseline_clock_time,
    'Memory Usage (MiB)': baseline_mem_usage,
    'Accuracy': float(accuracy),
    'Balanced Accuracy': float(balanced_accuracy),
    'F1 Score': float(f1)
}])

In [25]:
# Combine both DataFrames
nytrsom_vs_baseline = pd.concat([baseline_df, nytrsom_df], ignore_index=True)

In [26]:
nytrsom_vs_baseline

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score,Number of components
0,Baseline,55.592319,128.105469,0.868027,0.723889,0.604082,
1,Nystrom,0.585263,10.003906,0.787075,0.5,0.0,=10
2,Nystrom,3.853825,284.050781,0.795238,0.520335,0.079511,=100
3,Nystrom,23.431118,46.730469,0.805442,0.551288,0.19209,=500
4,Nystrom,119.446229,66.960938,0.821088,0.595021,0.323907,=1000
5,Nystrom,391.005656,223.894531,0.848299,0.668241,0.498876,=2000
6,Nystrom,768.571542,485.820312,0.861224,0.70675,0.573222,=3000


In [27]:
# Combine both DataFrames
rff_vs_baseline = pd.concat([baseline_df, rff_df], ignore_index=True)

In [28]:
rff_vs_baseline

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Accuracy,Balanced Accuracy,F1 Score,Number of components
0,Baseline,55.592319,128.105469,0.868027,0.723889,0.604082,
1,RFF,0.440636,0.007812,0.787075,0.5,0.0,=10
2,RFF,0.471053,2.5,0.787075,0.5,0.0,=100
3,RFF,2.051641,2.71875,0.791837,0.527496,0.12069,=500
4,RFF,5.983666,32.945312,0.814966,0.616768,0.384615,=1000
5,RFF,14.360994,55.261719,0.831293,0.652776,0.463203,=2000
6,RFF,22.60671,156.675781,0.845578,0.672339,0.505447,=3000


## Kernel Ridge regression

This is now a regression problem

In [29]:
data = pd.read_csv(path, sep=";")

In [30]:
# Train/Test split
X, y = data.drop("quality", axis=1), data["quality"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [31]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # Fit on training data and transform
X_test = scaler.transform(X_test) 

In [32]:
def measure_and_store_performance_ridge(model_func, params, param_name="Number of components", model_name = None):
    results = []
    for param in params:
        memory_growth, execution_time, scores = measure_performances(model_func, param)
        results.append({
            'Model' : f"{model_name}",
            f"{param_name}": "=" + str(param),
            'Clock Time (seconds)': execution_time,
            'Memory Usage (MiB)': memory_growth,
            'Mean Squared Error': float(scores)
        })
    return pd.DataFrame(results)

In [33]:
def run_baseline_kernel_ridge_sklearn():
    from sklearn.kernel_ridge import KernelRidge

    krr = KernelRidge(kernel='rbf')
    krr.fit(X_train, y_train)

    # Predictions
    y_pred = krr.predict(X_test)

    # Evaluation
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [34]:
def run_baseline_kernel_ridge():
    gaussian_kernel = GaussianKernel(gamma=0.5)
    model = KernelRidgeRegression(kernel=gaussian_kernel)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [35]:
def run_nystrom_kernel_ridge(n_components):
    # Initialize Nyström Approximation
    nystrom = NystromApproximation(n_components=n_components, kernel="rbf", random_state=42, gamma=1.0)

    # Fit Nyström on the training data
    nystrom.fit(X_train)

    # Transform the training and test data
    X_train_approx = nystrom.transform(X_train)
    X_test_approx = nystrom.transform(X_test)

    # Initialize Kernel Ridge Regression with linear kernel (approximation already done)
    krr = KernelRidgeRegression(kernel=X_train_approx)

    # Fit the Kernel Ridge Regression model
    krr.fit(X_train_approx, y_train)

    # Predict on the test set
    y_pred = krr.predict(X_test_approx)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [46]:
def run_rff_kernel_ridge(n_components=200):
    # Initialize RFF Approximation
    rff = RandomFourierFeatures(n_components=n_components, gamma=1.0, kernel='rbf', random_state=42)

    # Transform the training data
    Kxx_approx = rff.fit_transform(X_train)
    krr = KernelRidgeRegression(kernel=Kxx_approx)

    krr.fit(X_train, y_train)

    # Predict on the test set
    Kxz_approx = rff.transform(X_test)
    y_pred = krr.predict(Kxz_approx)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [37]:
p_values = [10, 100, 500, 1000, 2000, 3000]

In [38]:
nystrom_ridge_df = measure_and_store_performance_ridge(run_nystrom_kernel_ridge, p_values, model_name="Nystrom Kernel Ridge")

In [39]:
nystrom_ridge_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,Nystrom Kernel Ridge,=10,5.224622,462.660156,0.752851
1,Nystrom Kernel Ridge,=100,7.61241,461.804688,0.662282
2,Nystrom Kernel Ridge,=500,24.687021,451.246094,0.571416
3,Nystrom Kernel Ridge,=1000,110.301888,510.832031,0.521588
4,Nystrom Kernel Ridge,=2000,377.516958,652.480469,0.45848
5,Nystrom Kernel Ridge,=3000,712.487411,711.855469,0.414589


In [40]:
rff_ridge_df = measure_and_store_performance_ridge(run_rff_kernel_ridge, p_values, model_name="RFF Kernel Ridge")

!!!!!!!!!!!!!!!!!!!!!!!!!!
Performance anormale

In [41]:
rff_ridge_df

Unnamed: 0,Model,Number of components,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,RFF Kernel Ridge,=10,4.670395,453.796875,0.756634
1,RFF Kernel Ridge,=100,4.800008,451.789062,0.72793
2,RFF Kernel Ridge,=500,4.788363,451.789062,0.737234
3,RFF Kernel Ridge,=1000,5.199105,503.960938,0.789709
4,RFF Kernel Ridge,=2000,5.89952,504.148438,0.880966
5,RFF Kernel Ridge,=3000,5.800811,528.125,0.830724


In [47]:
a, b, c = measure_performances(run_rff_kernel_ridge)

In [48]:
Test = pd.DataFrame([{
    'Model': 'Baseline',
    'Clock Time (seconds)': a,
    'Memory Usage (MiB)': b,
    'Mean Squared Error': float(c)
}])

In [49]:
Test

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,Baseline,455.554688,5.394611,0.765251


!!!!!!!!!!! ne pas utiliser ce dataframe

In [42]:
baseline_mem_usage, baseline_clock_time, mse = measure_performances(run_baseline_kernel_ridge)

baseline_df_ridge = pd.DataFrame([{
    'Model': 'Baseline',
    'Clock Time (seconds)': baseline_clock_time,
    'Memory Usage (MiB)': baseline_mem_usage,
    'Mean Squared Error': float(mse)
}])

In [43]:
baseline_df_ridge

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error
0,Baseline,55.76686,449.191406,0.411796


In [50]:
nystrom_vs_baseline_ridge = pd.concat([baseline_df_ridge, nystrom_ridge_df], ignore_index=True)

In [51]:
nystrom_vs_baseline_ridge

Unnamed: 0,Model,Clock Time (seconds),Memory Usage (MiB),Mean Squared Error,Number of components
0,Baseline,55.76686,449.191406,0.411796,
1,Nystrom Kernel Ridge,5.224622,462.660156,0.752851,=10
2,Nystrom Kernel Ridge,7.61241,461.804688,0.662282,=100
3,Nystrom Kernel Ridge,24.687021,451.246094,0.571416,=500
4,Nystrom Kernel Ridge,110.301888,510.832031,0.521588,=1000
5,Nystrom Kernel Ridge,377.516958,652.480469,0.45848,=2000
6,Nystrom Kernel Ridge,712.487411,711.855469,0.414589,=3000
