# (Kernel) Ridge Regression
Download the Spotify Tracks Dataset and perform ridge regression to predict the tracks’ popularity. Note that this dataset contains both numerical and categorical features. The student is thus required to follow these guidelines:
- first, train the model using only the numerical features,
- second, appropriately handle the categorical features (for example, with one-hot encoding or other techniques) and use them together with the numerical ones to train the model; in both cases, experiment with different training parameters,
- use 5-fold cross validation to compute your risk estimates, and thoroughly discuss and compare the performance of the model.

The student is required to implement from scratch (without using libraries, such as Scikit-learn) the code for the ridge regression, while it is not mandatory to do so for the implementation of the 5-fold cross-validation.

Optional: Instead of regular ridge regression, implement kernel ridge regression using a Gaussian kernel.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Relative path of the CSV file holding the tracks dataset.
dataset = "data/dataset.csv"

# Load the CSV, then discard the 'Unnamed: 0' column before any further use.
dataset_df = pd.read_csv(dataset)
dataset_df = dataset_df.drop(columns=["Unnamed: 0"])
dataset_df

In [None]:
# Seed NumPy's global RNG so the random split below is reproducible.
np.random.seed(0)

# Boolean row mask: a row goes to the training set when its uniform draw is < 0.7.
mask = np.random.rand(len(dataset_df)) < 0.7

train_df = dataset_df.loc[mask]
test_df = dataset_df.loc[~mask]

# Double brackets keep the target as a single-column DataFrame rather than a Series.
y_train_df = train_df[["popularity"]]
y_train_df

In [None]:
def ridge_regression(alpha, y, s):
    """Fit ridge regression in closed form and return the weight vector.

    Solves ``(alpha * I + S^T S) w = S^T y`` for ``w``. No intercept term is
    added: the model is purely linear in the columns of ``s``.

    Args:
        alpha: Regularisation strength multiplying the identity matrix.
        y: Targets, one per row of ``s`` (single-column DataFrame, Series or
            array).
        s: Design matrix with one row per sample and one column per feature.
            When it has a ``columns`` attribute (a DataFrame), those labels
            become the index of the result.

    Returns:
        A DataFrame with a single column named "Values" holding one
        coefficient per feature.

    Raises:
        numpy.linalg.LinAlgError: If ``alpha * I + S^T S`` is singular.
    """
    design = np.asarray(s, dtype=float)
    targets = np.asarray(y, dtype=float)
    n_features = design.shape[1]

    # Regularised Gram matrix (left-hand side of the normal equations).
    regularised_gram = alpha * np.identity(n_features) + design.T @ design

    # Solve the linear system directly instead of forming an explicit inverse:
    # it is cheaper and numerically better behaved.
    w = np.linalg.solve(regularised_gram, design.T @ targets)

    # Label the coefficients with the feature names when they are available.
    feature_labels = getattr(s, "columns", None)
    return pd.DataFrame(w, columns=["Values"], index=feature_labels)


def predict(w, x):
    """Return the linear prediction for ``x`` given the weights ``w``.

    ``w`` is transposed and then multiplied with ``x`` through its ``dot``
    method, so ``w`` must expose ``transpose`` and ``x`` must be compatible
    with the transposed weights.
    """
    weights_as_row = w.T
    return weights_as_row.dot(x)

In [None]:
# Numerical features

# Single list of the numerical columns, shared by the train and test selections.
numeric_columns = [
    "duration_ms",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

train_numeric_df = train_df[numeric_columns]
test_numeric_df = test_df[numeric_columns]


In [None]:
# Regularisation strength used for the single-fit experiments.
alpha = 0.9

# Fit on the numerical features only; the trailing expression shows the weights.
result_numeric = ridge_regression(alpha=alpha, y=y_train_df, s=train_numeric_df)
result_numeric

In [None]:
# Prediction for the first training row using the numeric-only weights.
predict(w=result_numeric, x=train_numeric_df.iloc[0])

In [None]:
def square_avg_loss(w, test_df, y):
    """Return the mean squared error of the linear predictor ``w``.

    Predictions are computed for all rows at once as ``X @ w`` and compared
    with ``y``.

    Args:
        w: Weights, one per column of ``test_df`` (single-column DataFrame,
            Series or array).
        test_df: Feature matrix with one row per sample.
        y: True targets, one per row of ``test_df`` (single-column DataFrame,
            Series or array).

    Returns:
        The average of the squared differences between predictions and
        targets.

    Raises:
        ValueError: If the shapes of ``w``, ``test_df`` and ``y`` do not match.
    """
    features = np.asarray(test_df, dtype=float)
    # Flatten weights and targets so that single-column DataFrames, Series and
    # plain arrays are all handled the same way.
    weights = np.asarray(w, dtype=float).reshape(-1)
    targets = np.asarray(y, dtype=float).reshape(-1)

    residuals = features @ weights - targets
    return np.mean(residuals ** 2)

# Held-out targets, kept as a single-column DataFrame.
y_test_df = test_df[["popularity"]]

# Test loss of the numeric-only model.
print(
    "Loss: ",
    square_avg_loss(w=result_numeric, test_df=test_numeric_df, y=y_test_df),
)


#TODO for each alpha, save the computed loss in order to draw a graph 

In [None]:
# Columns removed before encoding, including the target `popularity`.
dropped_columns = ["popularity", "track_id", "artists", "album_name", "track_name"]
# Columns replaced by one-hot indicator columns.
one_hot_columns = ['explicit', 'key', 'mode', 'time_signature', 'track_genre']

# Every remaining column is kept as-is next to the integer indicator columns.
categorical_df = pd.get_dummies(
    dataset_df.drop(columns=dropped_columns),
    columns=one_hot_columns,
    dtype=int,
)

# Reuse the same random mask as the numeric split so the rows line up.
train_cat_df = categorical_df.loc[mask]
test_cat_df = categorical_df.loc[~mask]

train_cat_df


In [None]:
# Fit on the one-hot encoded feature table with the same alpha as before.
result = ridge_regression(alpha=alpha, y=y_train_df, s=train_cat_df)
result


In [None]:
# NOTE(review): the value returned by this predict call is neither stored nor printed.
predict(w=result, x=train_cat_df.iloc[0])

# Test loss of the model trained on the encoded feature table.
print(
    "Loss: ",
    square_avg_loss(w=result, test_df=test_cat_df, y=y_test_df),
)

#TODO try to avoid some features for seeing whether or not the loss decreases

# Cross Validation

In [None]:

def get_training_set(dataset_array, i):
    """Concatenate every element of ``dataset_array`` except the one at position ``i``.

    Used to build the training portion of a split when part ``i`` is held out.
    """
    kept_parts = [
        part for position, part in enumerate(dataset_array) if position != i
    ]
    return pd.concat(kept_parts)


# Number of cross-validation folds.
K = 5
# 100 candidate penalties: 0.5 * 10**e for e evenly spaced from 10 down to -2.
alphas = 0.5 * (10 ** np.linspace(10, -2, 100))

# Copy of the encoded feature table with the target inserted as the first column.
cv_df = categorical_df.copy()
cv_df.insert(
    loc=0,
    column="popularity",
    value=dataset_df["popularity"],
    allow_duplicates=True,
)
cv_df


In [None]:
# Cross-validation procedure implemented below:
#   1. Split the dataset into K parts.
#   2. In the i-th iteration, part i is the test set and the remaining parts
#      form the training set.
#        a. Split the training set into a smaller training set and a
#           validation (dev) set.
#        b. Pick the alpha in `alphas` with the lowest validation loss.
#   3. Refit with that alpha on the whole training set and record its test loss.
#   4. Average the K test losses.


def _split_rows(frame, n_parts):
    """Split ``frame`` into ``n_parts`` contiguous row blocks of near-equal size."""
    # Split the row positions rather than the DataFrame itself: calling
    # np.array_split directly on a DataFrame relies on DataFrame.swapaxes,
    # which recent pandas versions deprecate. The resulting blocks are the same.
    position_chunks = np.array_split(np.arange(len(frame)), n_parts)
    return [frame.iloc[chunk] for chunk in position_chunks]


# NOTE(review): folds are contiguous blocks in the stored row order; if the CSV
# rows are grouped (e.g. sorted by some column), consider shuffling first.
dataset_array = _split_rows(cv_df, K)
# dataset_df.shape --> 114000 rows

losses = []
print(len(alphas))
for i in range(K):
    test_cv = dataset_array[i]
    train_cv = get_training_set(dataset_array, i)

    # The first block of the training data is the dev set; the rest is used to
    # fit the candidate models.
    train_cv_array = _split_rows(train_cv, K - 1)
    dev_set = train_cv_array[0]
    nested_cv = get_training_set(train_cv_array, 0)

    # Feature/target views do not depend on alpha, so build them once per fold
    # instead of once per candidate.
    nested_features = nested_cv.drop(columns='popularity')
    nested_targets = nested_cv[['popularity']]
    dev_features = dev_set.drop(columns='popularity')
    dev_targets = dev_set[['popularity']]

    # Track the best validation loss seen so far and the alpha that produced it.
    loss = float("inf")
    a = 0
    for alpha in alphas:
        # hyperparameter, y_training, training_set
        predictor = ridge_regression(alpha, nested_targets, nested_features)
        # result, test_set, y_test
        local_loss = square_avg_loss(predictor, dev_features, dev_targets)
        if loss > local_loss:
            loss = local_loss
            a = alpha

    # Refit on the full training portion with the selected alpha and score it
    # on the held-out fold.
    prediction = ridge_regression(a, train_cv[['popularity']], train_cv.drop(columns='popularity'))
    losses.append(square_avg_loss(prediction, test_cv.drop(columns='popularity'), test_cv[['popularity']]))


# Cross-validated risk estimate: average test loss over the K folds.
np.mean(losses)