# (Kernel) Ridge Regression
Download the Spotify Tracks Dataset and perform ridge regression to predict the tracks’ popularity. Note that this dataset contains both numerical and categorical features. The student is thus required to follow these guidelines:
- first, train the model using only the numerical features,
- second, appropriately handle the categorical features (for example, with one-hot encoding or other techniques) and use them together with the numerical ones to train the model; in both cases, experiment with different training parameters,
- use 5-fold cross validation to compute your risk estimates, thoroughly discuss and compare the performance of the model

The student is required to implement from scratch (without using libraries, such as Scikit-learn) the code for the ridge regression, while it is not mandatory to do so for the implementation of the 5-fold cross-validation.

Optional: Instead of regular ridge regression, implement kernel ridge regression using a Gaussian kernel.


## TODOS
 - CV kernel on gamma and alpha on a small number of datapoints (train and test on more datapoints)
 - Retest and download plots (pick, except for kernel, the same sizes for train and test sets)
 - Fix markdown, comments and structure
 - Add README

# Initialization


Google Colab Cell

In [None]:
# Only when the notebook runs on Google Colab: clone the project repository,
# move its contents into the current working directory and remove the
# now-empty clone folder.
if "google.colab" in str(get_ipython()):
    !git clone https://github.com/lukebella/SpotifyRegression.git
    !mv SpotifyRegression/* .
    !rm -fr SpotifyRegression

Insert your Kaggle credentials for downloading the dataset:

In [None]:
import os

# Credentials for the Kaggle command-line tool, passed through environment
# variables. Replace the placeholders locally; do not commit real credentials.
os.environ['KAGGLE_USERNAME'] = "xxxxxx"
os.environ['KAGGLE_KEY'] = "xxxxxx"
# Download the dataset archive into ./data and extract it there
# (unzip -n never overwrites files that already exist).
!kaggle datasets download -p ./data -d maharshipandya/-spotify-tracks-dataset
!unzip -n ./data/-spotify-tracks-dataset.zip -d ./data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [None]:
# Load the CSV, drop the 'Unnamed: 0' column (presumably a saved row index),
# shuffle the rows with a fixed seed and discard rows with missing values.

dataset_file = "data/dataset.csv"

dataset_df = (
    pd.read_csv(dataset_file)
    .drop(columns='Unnamed: 0')
    .sample(frac=1, random_state=0)
    .dropna()
)

dataset_df

# Split the dataset

In [None]:
# Rescale loudness and tempo.
# NOTE(review): the statistics below are taken over the whole dataset, i.e.
# before the train/test split, and duration_ms is left on its original scale.

loudness_norm = dataset_df["loudness"]
tempo_norm = dataset_df["tempo"]

# Tempo: divide by the maximum, so the largest value maps to 1.
tempo_norm = tempo_norm/tempo_norm.max(axis=0)

# Loudness: min-max scaling, (x - min) / (max - min).
# The previous form used abs(min) and abs(max), which only equals min-max
# scaling when min <= 0 <= max; this form is correct for any value range and
# yields the same numbers in that case.
min_loud = loudness_norm.min(axis=0)
max_loud = loudness_norm.max(axis=0)

loudness_norm = (loudness_norm - min_loud)/(max_loud - min_loud)
dataset_df["loudness"] = loudness_norm
dataset_df["tempo"] = tempo_norm

In [None]:
# One-hot encode track_genre on a frame holding only track_id and track_genre.
track_genre_df = pd.get_dummies(dataset_df[["track_id", "track_genre"]], columns=['track_genre'], dtype=int)

# Collapse rows that share a track_id by summing their genre indicators, so a
# track listed under several genres ends up with several columns set.
# NOTE(review): a track listed twice under the same genre gets a count of 2.
# `.sum()` replaces `.agg(np.sum)`, which pandas 2.x deprecates for groupby.
genre_columns = [c for c in track_genre_df.columns if c.startswith("track_genre_")]
track_genre_df = track_genre_df.groupby("track_id", as_index=False)[genre_columns].sum()

# All remaining features: drop the free-text columns and track_genre, one-hot
# encode the categorical columns and keep the first row of every track_id.
partial_df = pd.get_dummies(
    dataset_df.drop(columns=["artists", "album_name", "track_name", 'track_genre']),
    columns=['explicit', 'key', 'mode', 'time_signature'],
    dtype=int,
).drop_duplicates(subset=['track_id'])

# Join both parts on track_id; the id itself is not a feature, so drop it.
categorical_df = pd.merge(partial_df, track_genre_df, on=['track_id'], how='inner').drop(columns="track_id")

# explicit_False is the complement of explicit_True, so it carries no extra
# information: drop it and keep a single binary "explicit" column.
categorical_df = categorical_df.drop(columns=["explicit_False"])
categorical_df = categorical_df.rename(columns={"explicit_True": "explicit"})

categorical_df


In [None]:
# Boolean row mask for the train/test split: a row goes to the training set
# when its uniform random draw is below 0.7. The global NumPy seed is fixed
# so the split is reproducible.

np.random.seed(0)
mask = np.random.rand(categorical_df.shape[0]) < 0.7

In [None]:
# Train/test split of the table holding all (numerical + one-hot) features

train_cat_df = categorical_df.loc[mask]
test_cat_df = categorical_df.loc[~mask]

In [None]:
# Numerical-only view (target column included), split with the same mask
numeric_columns = [
    "popularity", "duration_ms", "danceability", "energy", "loudness",
    "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo",
]
numerical_df = categorical_df[numeric_columns]

train_num_df = numerical_df.loc[mask]
test_num_df = numerical_df.loc[~mask]

# Defining functions

In [None]:
def ridge_regression(alpha, train_set):
    """Fit a ridge regression (no intercept term) and return its coefficients.

    Solves the regularised normal equations ``(alpha * I + X^T X) w = X^T y``,
    where ``X`` holds every column of ``train_set`` except ``popularity`` and
    ``y`` is the ``popularity`` column.

    Args:
        alpha: Regularisation strength multiplying the identity matrix.
        train_set: DataFrame with a ``popularity`` column (the target) and
            numeric feature columns.

    Returns:
        A DataFrame with a single column ``"Values"`` holding the
        coefficients, indexed by the feature column names.

    Raises:
        numpy.linalg.LinAlgError: If the regularised system is singular
            (possible when ``alpha`` is 0).
    """
    features = train_set.drop(columns='popularity')
    x = features.to_numpy(dtype=float)
    y = train_set[["popularity"]].to_numpy(dtype=float)

    # Solve the linear system rather than forming an explicit inverse: it is
    # numerically more stable and avoids the (n_features x n_samples)
    # intermediate produced by inv(...) @ X^T.
    regularised_gram = alpha * np.identity(x.shape[1]) + x.T @ x
    w = np.linalg.solve(regularised_gram, x.T @ y)

    return pd.DataFrame(w, columns=["Values"], index=features.columns)

In [None]:
def predict(w, x):
    """Predict the popularity of a single track with the hyperplane ``w``.

    Args:
        w: Coefficient DataFrame as returned by ``ridge_regression`` (one
            column, indexed by feature name).
        x: Series describing one track; its ``popularity`` entry is dropped
            and the remaining labels must match the index of ``w``.

    Returns:
        The dot product between the coefficients and the features of ``x``.
        The value is returned as is (no clipping is applied).
    """
    features = x.drop(labels='popularity')
    # DataFrame.dot aligns on labels and returns a one-element Series indexed
    # by the coefficient column name, so select it by position explicitly
    # (plain [0] on a label index relies on a deprecated positional fallback).
    return w.transpose().dot(features).iloc[0]

In [None]:
def avg_square_loss(w, test_set):
    """Return the mean squared error of the hyperplane ``w`` on ``test_set``.

    Args:
        w: Coefficients, one per feature column of ``test_set`` and in the
            same order (e.g. the DataFrame returned by ``ridge_regression``).
        test_set: DataFrame with a ``popularity`` column (the target) and the
            feature columns.

    Returns:
        The average of the squared differences between predictions and the
        ``popularity`` values.
    """
    features = test_set.drop(columns='popularity').to_numpy(dtype=float)
    targets = test_set['popularity'].to_numpy(dtype=float)
    coefficients = np.asarray(w, dtype=float).reshape(-1)

    # Everything is done on plain arrays: reducing a DataFrame with np.sum and
    # indexing the result with .values[0] depends on deprecated pandas
    # reduction behaviour.
    residuals = features @ coefficients - targets
    return np.mean(residuals ** 2)

# Ridge Regression

## Ridge Regression using only numerical features

In [None]:
# Fit the scratch ridge model on the numerical features only (alpha = 0.5)
result_numeric = ridge_regression(alpha=0.5, train_set=train_num_df)
result_numeric

In [None]:
# Sanity check on one training row (position 4): prediction vs. true value
predicted_y = predict(w=result_numeric, x=train_num_df.iloc[4])
print(f"Predicted y: \t{predicted_y}\nReal y: \t{train_num_df.iloc[4]['popularity']}")

In [None]:
# Test-set mean squared error of the numerical-features hyperplane
print("Average square loss: ", avg_square_loss(w=result_numeric, test_set=test_num_df))

In [None]:
# Sweep alpha over 100 log-spaced values from 0.5e10 down to 0.5e-2 and record
# the train and test MSE of the scratch model on the numerical features.
alphas = 0.5 * np.power(10.0, np.linspace(10, -2, 100))

num_models = [ridge_regression(a, train_num_df) for a in alphas]
num_train_losses = [avg_square_loss(model, train_num_df) for model in num_models]
num_test_losses = [avg_square_loss(model, test_num_df) for model in num_models]

In [None]:
# Train/test MSE of the scratch model on numerical features, against alpha.
# The curves are losses, so the legend says "MSE" (it used to say "accuracy").
plt.title('MSE on numerical features')
for curve, curve_label in ((num_train_losses, 'Training MSE'), (num_test_losses, 'Test MSE')):
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean Squared Error')
plt.show()

## Scikitlearn Ridge regression on numerical features


In [None]:
# scikit-learn Ridge on the numerical features, over the same alpha grid.
# NOTE(review): Ridge fits an intercept by default while ridge_regression
# above has no intercept term, so the two models are not identical.

alphas = 10**np.linspace(10, -2, 100)*0.5

# Feature/target frames do not depend on alpha: build them once instead of
# re-dropping the target column three times per iteration.
sk_num_x_train = train_num_df.drop(columns='popularity')
sk_num_y_train = train_num_df['popularity']
sk_num_x_test = test_num_df.drop(columns='popularity')
sk_num_y_test = test_num_df['popularity']

sk_num_train_losses = []
sk_num_test_losses = []

for a in alphas:
    clf = Ridge(alpha = a)
    clf.fit(sk_num_x_train, sk_num_y_train)
    sk_num_train_losses.append(mean_squared_error(sk_num_y_train, clf.predict(sk_num_x_train)))
    sk_num_test_losses.append(mean_squared_error(sk_num_y_test, clf.predict(sk_num_x_test)))

In [None]:
# Train/test MSE of scikit-learn Ridge on numerical features, against alpha.
# The curves are losses, so the legend says "MSE" (it used to say "accuracy").
plt.title('ScikitLearn: MSE on numerical features')
for curve, curve_label in ((sk_num_train_losses, 'Training MSE'), (sk_num_test_losses, 'Test MSE')):
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean Squared Error')
plt.show()

## Ridge regression considering all features

In [None]:
# Fit the scratch ridge model on all features, i.e. numerical plus one-hot
# encoded categorical columns (alpha = 0.005)
result_categoric = ridge_regression(alpha=0.005, train_set=train_cat_df)
result_categoric

In [None]:
# Sanity check on the first training row: prediction vs. true value
predicted_y = predict(w=result_categoric, x=train_cat_df.iloc[0])
print(f"Predicted y: \t{predicted_y}\nReal y: \t{train_cat_df.iloc[0]['popularity']}")

In [None]:
# Test-set mean squared error of the all-features hyperplane
print("Average square loss: ", avg_square_loss(w=result_categoric, test_set=test_cat_df))


In [None]:
# Same alpha sweep as for the numerical features, now on all features:
# record the train and test MSE of the scratch model for every alpha.
alphas = 0.5 * np.power(10.0, np.linspace(10, -2, 100))

cat_models = [ridge_regression(a, train_cat_df) for a in alphas]
cat_train_losses = [avg_square_loss(model, train_cat_df) for model in cat_models]
cat_test_losses = [avg_square_loss(model, test_cat_df) for model in cat_models]

In [None]:
# Train/test MSE of the scratch model on all features, against alpha.
# The curves are losses, so the legend says "MSE" (it used to say "accuracy").
plt.title('MSE on all features')
for curve, curve_label in ((cat_train_losses, 'Training MSE'), (cat_test_losses, 'Test MSE')):
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean Squared Error')
plt.show()

## ScikitLearn Ridge regression on all features

In [None]:
# scikit-learn Ridge on all features, over the same alpha grid.
# NOTE(review): Ridge fits an intercept by default while ridge_regression
# above has no intercept term, so the two models are not identical.

alphas = 10**np.linspace(10, -2, 100)*0.5

# Feature/target frames do not depend on alpha: build them once instead of
# re-dropping the target column three times per iteration.
sk_cat_x_train = train_cat_df.drop(columns='popularity')
sk_cat_y_train = train_cat_df['popularity']
sk_cat_x_test = test_cat_df.drop(columns='popularity')
sk_cat_y_test = test_cat_df['popularity']

sk_cat_train_losses = []
sk_cat_test_losses = []

for a in alphas:
    clf = Ridge(alpha = a)
    clf.fit(sk_cat_x_train, sk_cat_y_train)
    sk_cat_train_losses.append(mean_squared_error(sk_cat_y_train, clf.predict(sk_cat_x_train)))
    sk_cat_test_losses.append(mean_squared_error(sk_cat_y_test, clf.predict(sk_cat_x_test)))


In [None]:
# Train/test MSE of scikit-learn Ridge on all features, against alpha.
# The curves are losses, so the legend says "MSE" (it used to say "accuracy").
plt.title('ScikitLearn: MSE on all features')
for curve, curve_label in ((sk_cat_train_losses, 'Training MSE'), (sk_cat_test_losses, 'Test MSE')):
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean Squared Error')
plt.show()

## Numerical vs All features

In [None]:
# Scratch model: numerical-only vs. all features, train and test MSE.
# The curves are losses, so the legend says "MSE" (it used to say "accuracy").
plt.title('MSE: numerical features vs. all features')
comparison_curves = (
    (num_train_losses, 'Num training MSE'),
    (num_test_losses, 'Num test MSE'),
    (cat_train_losses, 'All training MSE'),
    (cat_test_losses, 'All test MSE'),
)
for curve, curve_label in comparison_curves:
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean squared error')
plt.show()

In [None]:
# All eight curves at once: {scikit-learn, scratch} x {numerical, all} x
# {train, test}. Labels name the plotted quantity (MSE, not accuracy).
plt.title('MSE: numerical features vs. all features x my vs. sk')
comparison_curves = (
    (sk_num_train_losses, 'SK Num training MSE'),
    (sk_num_test_losses, 'SK Num test MSE'),
    (sk_cat_train_losses, 'SK All training MSE'),
    (sk_cat_test_losses, 'SK All test MSE'),
    (num_train_losses, 'Num training MSE'),
    (num_test_losses, 'Num test MSE'),
    (cat_train_losses, 'All training MSE'),
    (cat_test_losses, 'All test MSE'),
)
for curve, curve_label in comparison_curves:
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
# NOTE(review): the legend is disabled, as in the original cell; without it
# the curves cannot be told apart.
# plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean squared error')
plt.show()

In [None]:
# Numerical features only: scikit-learn vs. scratch, train and test MSE.
# Labels name the plotted quantity (MSE, not accuracy).
plt.title('MSE: numerical features my vs. sk')
comparison_curves = (
    (sk_num_train_losses, 'SK Num training MSE'),
    (sk_num_test_losses, 'SK Num test MSE'),
    (num_train_losses, 'Num training MSE'),
    (num_test_losses, 'Num test MSE'),
)
for curve, curve_label in comparison_curves:
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
# NOTE(review): the legend is disabled, as in the original cell; without it
# the curves cannot be told apart.
# plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean squared error')
plt.show()

In [None]:
# All features: scikit-learn vs. scratch, train and test MSE.
# Labels name the plotted quantity (MSE, not accuracy).
plt.title('MSE: all features my vs. sk')
comparison_curves = (
    (sk_cat_train_losses, 'SK All training MSE'),
    (sk_cat_test_losses, 'SK All test MSE'),
    (cat_train_losses, 'All training MSE'),
    (cat_test_losses, 'All test MSE'),
)
for curve, curve_label in comparison_curves:
    plt.plot(alphas, curve, label=curve_label)
plt.xscale('log')
# NOTE(review): the legend is disabled, as in the original cell; without it
# the curves cannot be told apart.
# plt.legend()
plt.xlabel('Alpha')
plt.ylabel('Mean squared error')
plt.show()

In [None]:
# Side-by-side panels (scratch vs. scikit-learn) for the numerical features.
# Legend entries name the plotted quantity (MSE, not accuracy).
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('MSE on numerical features')
fig.set_size_inches(10, 5)

panels = (
    (ax1, "Scratch", num_train_losses, num_test_losses),
    (ax2, "ScikitLearn", sk_num_train_losses, sk_num_test_losses),
)
for ax, panel_title, train_curve, test_curve in panels:
    ax.plot(alphas, train_curve, label='Training MSE')
    ax.plot(alphas, test_curve, label='Test MSE')
    ax.set_xscale('log')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('Mean squared error')
    ax.legend()
    ax.set_title(panel_title)


In [None]:
# Side-by-side panels (scratch vs. scikit-learn) for all features.
# Legend entries name the plotted quantity (MSE, not accuracy).
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('MSE on all features')
fig.set_size_inches(10, 5)

panels = (
    (ax1, "Scratch", cat_train_losses, cat_test_losses),
    (ax2, "ScikitLearn", sk_cat_train_losses, sk_cat_test_losses),
)
for ax, panel_title, train_curve, test_curve in panels:
    ax.plot(alphas, train_curve, label='Training MSE')
    ax.plot(alphas, test_curve, label='Test MSE')
    ax.set_xscale('log')
    ax.set_xlabel('Alpha')
    ax.set_ylabel('Mean squared error')
    ax.legend()
    ax.set_title(panel_title)


# (Nested) Cross Validation

In [None]:
def cross_validation(k, dataset, alphas):
    """Estimate the risk of ridge regression with k-fold cross-validation.

    The rows of ``dataset`` are split, in their current order, into ``k``
    contiguous folds. For every fold the remaining rows form the training
    part; that part is split again into ``k - 1`` chunks, the first of which
    is a validation set used to pick the alpha with the lowest validation
    loss. A model is then refit on the whole training part with that alpha
    and scored on the held-out fold.

    Args:
        k: Number of folds; must be at least 3 so that the inner split leaves
            both a validation chunk and training rows.
        dataset: DataFrame with a ``popularity`` column and feature columns.
        alphas: Iterable of candidate regularisation strengths.

    Returns:
        A tuple ``(mean_loss, prediction, alpha)``: the mean of the ``k``
        held-out losses, and the coefficients and selected alpha of the
        LAST fold only.

    Raises:
        ValueError: If ``k`` is smaller than 3.
    """
    if k < 3:
        raise ValueError("k must be at least 3 for the nested validation split")

    # Concatenate every index chunk except the one at position `skip`
    def positions_except(chunks, skip):
        return np.concatenate([c for j, c in enumerate(chunks) if j != skip])

    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes.
    folds = np.array_split(np.arange(len(dataset)), k)

    losses = []

    for i in range(k):
        # In the i-th iteration, fold i is the test set and the rest is training
        test_cv = dataset.iloc[folds[i]]
        train_cv = dataset.iloc[positions_except(folds, i)]

        # Split the training part into a validation set (first chunk) and a
        # reduced training set (remaining chunks)
        inner_folds = np.array_split(np.arange(len(train_cv)), k - 1)
        dev_cv = train_cv.iloc[inner_folds[0]]
        nested_cv = train_cv.iloc[positions_except(inner_folds, 0)]

        # Pick the alpha with the lowest validation loss
        loss = float("inf")
        alpha = 0
        for a in alphas:
            predictor = ridge_regression(a, nested_cv)

            local_loss = avg_square_loss(predictor, dev_cv)
            if loss > local_loss:
                loss = local_loss
                alpha = a

        # Refit on the whole training part and score on the held-out fold
        prediction = ridge_regression(alpha, train_cv)
        losses.append(avg_square_loss(prediction, test_cv))

    # Average the held-out losses of the k predictors
    return np.mean(losses), prediction, alpha

In [None]:
#CV Ridge regression on numerical features

# TODO WIP ...

In [None]:
# 5-fold cross-validation of the scratch ridge model on the first 2000 rows
# of the all-features table, selecting alpha from the usual log-spaced grid.
K = 5
alphas = 0.5 * np.power(10.0, np.linspace(10, -2, 100))
loss_cv, pred_cv, a_cv = cross_validation(k=K, dataset=categorical_df.iloc[:2000], alphas=alphas)
print("Average loss with nested CV: ", loss_cv)
print("Best alpha with nested CV: ", a_cv)

# Recorded results from earlier runs:
#
# no shuffle
# Average loss with nested CV:  499.37945948919185
# Best alpha with nested CV:  201.85086292982749

# shuffle
# Average loss with nested CV:  369.03813448560055
# Best alpha with nested CV:  0.005

# shuffle + group by track_genre
# Average loss with nested CV:  275.4156551707504
# Best alpha with nested CV:  5.361336110051605


## ScikitLearn Ridge CV

In [None]:
# scikit-learn RidgeCV with 5-fold CV over the same alpha grid.
# NOTE(review): the model is fitted on every row of categorical_df and the
# MSE below is measured on the first 2000 of those same rows, so sk_loss_cv
# is an error on training data rather than a cross-validated estimate.
alphas = 0.5 * np.power(10.0, np.linspace(10, -2, 100))

sk_cv_features = categorical_df.drop(columns='popularity')
sk_cv_target = categorical_df['popularity']

clf = RidgeCV(alphas=alphas, cv=5)
clf.fit(sk_cv_features, sk_cv_target)

sk_loss_cv = mean_squared_error(sk_cv_target[:2000], clf.predict(sk_cv_features[:2000]))
sk_loss_cv

# Recorded results from earlier runs:
#
# no shuffle
# 392.4801619277324

# shuffle
# 367.9301797580014

# shuffle + group by track_genre
# 273.3852050852446


# Kernel Ridge Regression


Function for computing gamma from all pairs of sampled datapoints.

In [None]:
def avg_norm(dataset, n_samples):
    """Return the summed squared pairwise distances divided by ``n_samples``.

    ``n_samples`` rows are drawn from ``dataset`` (fixed seed) and the squared
    Euclidean distances of all unordered row pairs are summed, then divided
    by ``n_samples``. All columns of ``dataset`` take part in the distance.

    Note that the divisor is the number of rows, not the number of pairs, so
    the value is not a mean pairwise distance and grows with ``n_samples``.

    The pair sum equals ``n * sum_i ||d_i - mean||^2``; dividing by ``n``
    leaves the total squared deviation from the column means. This closed
    form replaces the quadratic Python loop and gives the same value up to
    floating-point rounding.

    Args:
        dataset: DataFrame with numeric columns.
        n_samples: Number of rows to sample; must be at least 1.

    Returns:
        The value described above, as a float.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1, or larger than the
            number of rows (raised by ``DataFrame.sample``).
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")
    d = dataset.sample(n=n_samples, random_state=0).to_numpy(dtype=float)
    centered = d - d.mean(axis=0)
    return np.sum(centered * centered)

In [None]:
# Gamma estimate from 4000 sampled rows of the all-features table
avg_gamma = avg_norm(dataset=categorical_df, n_samples=4000)
avg_gamma

### These kernel functions take the hyperparameter gamma as input.

In [None]:
def gaussian_kernel(gamma, v1, v2):
    """Gaussian kernel ``exp(-||v1 - v2||^2 / (2 * gamma))``.

    ``gamma`` takes the place of the squared bandwidth. scikit-learn's RBF
    kernel squares its ``length_scale`` in the same position, so the two
    agree when ``length_scale == sqrt(gamma)``.

    Args:
        gamma: Kernel width parameter (non-zero).
        v1: First vector (NumPy array).
        v2: Second vector, same shape as ``v1``.

    Returns:
        The kernel value between ``v1`` and ``v2``.
    """
    distance = np.linalg.norm(v1 - v2)
    exponent = -(distance * distance) / (2 * gamma)
    return np.exp(exponent)


def kernel_ridge_regression(dataset, alpha, gamma):
    """Fit kernel ridge regression with a Gaussian kernel.

    Builds the Gram matrix ``K[i, j] = exp(-||x_i - x_j||^2 / (2 * gamma))``
    (the same kernel as ``gaussian_kernel``) over the feature rows of
    ``dataset`` and solves ``(alpha * I + K) w = y``.

    The Gram matrix is computed with array operations instead of a Python
    loop over all row pairs; values agree up to floating-point rounding.

    Args:
        dataset: DataFrame with a ``popularity`` column (the target) and
            numeric feature columns.
        alpha: Regularisation strength added to the diagonal of ``K``.
        gamma: Kernel width parameter (non-zero).

    Returns:
        A DataFrame with a single column ``'weights'`` holding one dual
        coefficient per row of ``dataset``, in row order.

    Raises:
        numpy.linalg.LinAlgError: If ``alpha * I + K`` is singular.
    """
    y = dataset["popularity"].to_numpy(dtype=float)
    x = dataset.drop(columns='popularity').to_numpy(dtype=float)

    # Squared distances via ||a||^2 + ||b||^2 - 2 a.b for all pairs at once
    sq_norms = np.sum(x * x, axis=1)
    sq_dist = sq_norms[:, None] + sq_norms[None, :] - 2.0 * (x @ x.T)
    # Rounding can leave tiny negative values; distances to self are exactly 0
    np.maximum(sq_dist, 0.0, out=sq_dist)
    np.fill_diagonal(sq_dist, 0.0)

    kernel = np.exp(sq_dist / (-2.0 * gamma))

    # alpha * I + K, formed in place on the diagonal
    kernel[np.diag_indices_from(kernel)] += alpha
    w = np.linalg.solve(kernel, y)

    return pd.DataFrame(w, columns=['weights'])


def kernel_predict(w, dataset, x, gamma):
    """Predict the popularity of one track with a kernel ridge model.

    Computes ``sum_i w_i * exp(-||x - x_i||^2 / (2 * gamma))`` over the
    training rows ``x_i`` (the same kernel as ``gaussian_kernel``). The
    kernel values are computed for all training rows at once instead of one
    Python call per row.

    Args:
        w: DataFrame with a ``'weights'`` column, one weight per row of
            ``dataset`` (as returned by ``kernel_ridge_regression``).
        dataset: Training DataFrame the weights were fitted on, including its
            ``popularity`` column.
        x: Series describing one track, including a ``popularity`` entry; its
            remaining entries must be in the same order as the feature
            columns of ``dataset``.
        gamma: Kernel width parameter used for fitting.

    Returns:
        The predicted value, returned as is (no clipping is applied).
    """
    x_values = x.drop(labels='popularity').to_numpy(dtype=float)
    dataset_values = dataset.drop(columns='popularity').to_numpy(dtype=float)

    # Squared distance from x to every training row
    diffs = dataset_values - x_values
    sq_dist = np.sum(diffs * diffs, axis=1)
    kernel_values = np.exp(sq_dist / (-2.0 * gamma))

    return np.dot(w['weights'].to_numpy(dtype=float), kernel_values)


def kernel_avg_square_loss(w, train_set, test_set, gamma):
    """Return the mean squared error of a kernel ridge model on ``test_set``.

    Predictions are ``K @ w`` with
    ``K[t, i] = exp(-||test_t - train_i||^2 / (2 * gamma))``, the same kernel
    as ``gaussian_kernel``. Test rows are processed in batches so that only a
    ``batch x n_train`` block of the kernel matrix is held in memory.

    This replaces a per-row ``DataFrame.apply`` that copied the training
    frame for every test row, and a final ``[0]`` on a label-indexed Series
    that relied on a deprecated positional fallback.

    Args:
        w: DataFrame with a ``'weights'`` column, one weight per row of
            ``train_set``.
        train_set: Training DataFrame the weights were fitted on, including
            its ``popularity`` column.
        test_set: DataFrame to evaluate on, with the same columns in the same
            order as ``train_set``.
        gamma: Kernel width parameter used for fitting.

    Returns:
        The average squared difference between predictions and the
        ``popularity`` values of ``test_set``.
    """
    weights = w['weights'].to_numpy(dtype=float)
    train_x = train_set.drop(columns='popularity').to_numpy(dtype=float)
    test_x = test_set.drop(columns='popularity').to_numpy(dtype=float)
    targets = test_set['popularity'].to_numpy(dtype=float)

    train_sq_norms = np.sum(train_x * train_x, axis=1)
    squared_errors = np.empty(test_x.shape[0])
    batch_size = 1024

    for start in range(0, test_x.shape[0], batch_size):
        stop = start + batch_size
        chunk = test_x[start:stop]
        # Squared distances via ||a||^2 + ||b||^2 - 2 a.b
        chunk_sq_norms = np.sum(chunk * chunk, axis=1)
        sq_dist = chunk_sq_norms[:, None] + train_sq_norms[None, :] - 2.0 * (chunk @ train_x.T)
        # Rounding can leave tiny negative values
        np.maximum(sq_dist, 0.0, out=sq_dist)
        predictions = np.exp(sq_dist / (-2.0 * gamma)) @ weights
        squared_errors[start:stop] = (predictions - targets[start:stop]) ** 2

    return np.mean(squared_errors)


In [None]:
# train_set = train_cat_df[:5000]
# x = categorical_df.iloc[90]
# gamma = 10000000


# #1 test
# w = kernel_ridge_regression(train_set, 1, gamma)

# print(x['popularity'])
# print(kernel_predict(w, train_set, x, gamma))

# kernel_loss = kernel_avg_square_loss(w, train_set, test_cat_df[:1000], gamma)
# print("AVG Square loss: ", kernel_loss)
# print("AVG loss: ", kernel_loss**(1/2))


### Test Kernel only on numerical features

In [None]:
# Kernel ridge regression on the numerical features:
# fit on 2000 training rows, evaluate on 500 test rows (alpha = 1).
# NOTE(review): gamma is computed by avg_norm over every column of train_set,
# which includes the popularity target.

train_set = train_num_df.iloc[:2000]
x = numerical_df.iloc[90]

gamma = avg_norm(train_set, len(train_set))
w = kernel_ridge_regression(train_set, alpha=1, gamma=gamma)

# True value and prediction for one row
print(x['popularity'])
print(kernel_predict(w, train_set, x, gamma))

# Mean squared error and its square root
kernel_loss = kernel_avg_square_loss(w, train_set, test_num_df.iloc[:500], gamma)
print("AVG Square loss: ", kernel_loss)
print("AVG loss: ", kernel_loss ** 0.5)

### Test Kernel on all features

In [None]:
# Kernel ridge regression on all features:
# fit on 6000 training rows, evaluate on 1000 test rows (alpha = 1).
# NOTE(review): gamma is computed by avg_norm over every column of train_set,
# which includes the popularity target.

train_set = train_cat_df.iloc[:6000]
x = categorical_df.iloc[90]

gamma = avg_norm(train_set, len(train_set))
w = kernel_ridge_regression(train_set, alpha=1, gamma=gamma)

# True value and prediction for one row
print(x['popularity'])
print(kernel_predict(w, train_set, x, gamma))

# Mean squared error and its square root
kernel_loss = kernel_avg_square_loss(w, train_set, test_cat_df.iloc[:1000], gamma)
print("AVG Square loss: ", kernel_loss)
print("AVG loss: ", kernel_loss ** 0.5)

# Recorded results from earlier runs:
#
# 1000 train on all test set
# AVG Square loss:  556.4917928496213
# AVG loss:  23.590078271375475

# 6000 train 1000 test
# AVG Square loss:  502.54840986535186
# AVG loss:  22.41759152686461

## ScikitLearn Kernel Ridge regression

In [None]:
#Scikitlearn Kernel Ridge regression on numerical features

# TODO WIP ...

In [None]:
# scikit-learn kernel ridge regression on all features

train_set = train_cat_df[:6000]
test_set = test_cat_df[:1000]

# Gaussian (RBF) kernel. scikit-learn's RBF is exp(-d^2 / (2 * length_scale^2))
# while gaussian_kernel above is exp(-d^2 / (2 * gamma)), so the matching
# length scale is sqrt(gamma). The `gamma` keyword of KernelRidge is not
# passed: it is only used for string kernel names, not for callable kernels.
sk_kernel = RBF(length_scale=np.sqrt(avg_gamma))
clf = KernelRidge(kernel=sk_kernel, alpha=a_cv)
clf.fit(train_set.drop(columns='popularity'), train_set['popularity'])
sk_kernel_train_loss = mean_squared_error(train_set['popularity'], clf.predict(train_set.drop(columns='popularity')))
sk_kernel_test_loss = mean_squared_error(test_set['popularity'], clf.predict(test_set.drop(columns='popularity')))
print("Train ", sk_kernel_train_loss)
print("Test ", sk_kernel_test_loss)

# Recorded with the previous RBF(avg_gamma) configuration (train 6000, test 1000):
# Train  488.59196560321664
# Test  502.69249755574214

## Kernel Cross Validation

In [None]:
# Hyperparameter grids for the kernel cross-validation:
# alphas are the powers of ten from 1e-2 to 1e4, gammas are five evenly
# spaced values between 1 and twice the estimated gamma.
alphas = np.power(10.0, np.arange(-2, 5))
gammas = np.linspace(1, 2 * avg_gamma, 5)
print(alphas, gammas)


In [None]:
# train 5000

# alphas
# gammas

# re train 20000
# re test  4000


def kernel_cross_validation(k, dataset, alpha, gammas):
    """Estimate the risk of kernel ridge regression with k-fold CV.

    The rows of ``dataset`` are split, in their current order, into ``k``
    contiguous folds. For every fold the remaining rows form the training
    part; that part is split again into ``k - 1`` chunks, the first of which
    is a validation set used to pick the gamma with the lowest validation
    loss (``alpha`` is fixed). A model is then refit on the whole training
    part with that gamma and scored on the held-out fold.

    Args:
        k: Number of folds; must be at least 3 so that the inner split leaves
            both a validation chunk and training rows.
        dataset: DataFrame with a ``popularity`` column and feature columns.
        alpha: Regularisation strength used for every fit.
        gammas: Iterable of candidate kernel width parameters.

    Returns:
        A tuple ``(mean_loss, prediction, gamma)``: the mean of the ``k``
        held-out losses, and the dual weights and selected gamma of the
        LAST fold only.

    Raises:
        ValueError: If ``k`` is smaller than 3.
    """
    if k < 3:
        raise ValueError("k must be at least 3 for the nested validation split")

    # Concatenate every index chunk except the one at position `skip`
    def positions_except(chunks, skip):
        return np.concatenate([c for j, c in enumerate(chunks) if j != skip])

    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes.
    folds = np.array_split(np.arange(len(dataset)), k)

    losses = []

    for i in range(k):
        # In the i-th iteration, fold i is the test set and the rest is training
        test_cv = dataset.iloc[folds[i]]
        train_cv = dataset.iloc[positions_except(folds, i)]

        # Split the training part into a validation set (first chunk) and a
        # reduced training set (remaining chunks)
        inner_folds = np.array_split(np.arange(len(train_cv)), k - 1)
        dev_cv = train_cv.iloc[inner_folds[0]]
        nested_cv = train_cv.iloc[positions_except(inner_folds, 0)]

        # Pick the gamma with the lowest validation loss
        loss = float("inf")
        gamma = 0
        for g in gammas:
            predictor = kernel_ridge_regression(nested_cv, alpha, g)

            local_loss = kernel_avg_square_loss(predictor, nested_cv, dev_cv, g)
            if loss > local_loss:
                loss = local_loss
                gamma = g

        # Refit on the whole training part and score on the held-out fold
        prediction = kernel_ridge_regression(train_cv, alpha, gamma)
        losses.append(kernel_avg_square_loss(prediction, train_cv, test_cv, gamma))

    # Average the held-out losses of the k predictors
    return np.mean(losses), prediction, gamma

In [None]:
# Kernel cross-validation on a 500-row sample: for every alpha, run the
# 5-fold CV (which selects gamma) and keep its loss, weights and gamma.
K = 5
kcv_size = 500
kcv_df = categorical_df.sample(kcv_size, random_state=0)
kernel_cv_losses, kernel_cv_predictor, kernel_cv_gamma = [], [], []
for a in alphas:
    cv_loss, cv_predictor, cv_gamma = kernel_cross_validation(K, kcv_df, a, gammas)
    kernel_cv_losses.append(cv_loss)
    kernel_cv_predictor.append(cv_predictor)
    kernel_cv_gamma.append(cv_gamma)

In [None]:
# Show the per-alpha results and keep the (alpha, gamma) pair with the lowest
# cross-validated loss (first one in case of ties).
for reported in (alphas, kernel_cv_gamma, kernel_cv_losses):
    print(reported)
ind = min(range(len(kernel_cv_losses)), key=kernel_cv_losses.__getitem__)
best_alpha, best_gamma = alphas[ind], kernel_cv_gamma[ind]
best_gamma


In [None]:
# Bar chart of the cross-validated MSE per alpha. Each bar is annotated with
# its MSE (on top) and the gamma selected for that alpha (in the middle).
bar_alpha = [str(value) for value in alphas]
bar_name = [f'{value}' for value in kernel_cv_gamma]
bar_value = kernel_cv_losses
bar_colors = ['tab:green', 'tab:orange', 'tab:blue', 'tab:red', 'y', 'c', 'm']

bars = plt.bar(bar_alpha, bar_value, label=bar_name, color=bar_colors)
for bar in bars:
    center = bar.get_x() + bar.get_width() / 2.0
    height = bar.get_height()
    plt.text(center, height, f'{height:.3f}', ha='center', va='bottom')
    plt.text(center, height/2, f'{float(bar.get_label()):.1}', ha='center', va='bottom')

plt.title('Hyperparameter tuning in Kernel Ridge regression')
plt.xlabel('Alpha')
plt.ylabel('MSE')
# The legend call is left disabled, as in the original cell.
#plt.legend() #for report, comment this line and specify that the gamma value is the same

In [None]:
# Refit with the selected alpha and gamma on larger sets
# (6000 training rows, 1000 test rows) and report the test MSE.
train_cv = train_cat_df.iloc[:6000]
test_cv = test_cat_df.iloc[:1000]
prediction = kernel_ridge_regression(train_cv, alpha=best_alpha, gamma=best_gamma)
kernel_avg_square_loss(prediction, train_cv, test_cv, best_gamma)

# Recorded result from an earlier run:
# 502.50438422998735 with 6000 and 1000

# CV vs. Kernel Ridge regression

In [None]:
# Final comparison of the four recorded losses: scratch vs. scikit-learn,
# for linear ridge ("cv") and for kernel ridge.
bar_name = ['my cv', 'sk cv', 'my kernel', 'sk kernel']
bar_value = [loss_cv, sk_loss_cv, kernel_loss, sk_kernel_test_loss]
bar_colors = ['tab:green', 'tab:orange'] * 2

bars = plt.bar(bar_name, bar_value, label=bar_name, color=bar_colors)
for bar in bars:
    center = bar.get_x() + bar.get_width() / 2.0
    height = bar.get_height()
    plt.text(center, height, f'{height:.4f}', ha='center', va='bottom')
plt.title('CV vs. Kernel Ridge regression')
plt.ylabel('MSE')
plt.legend()