In [18]:
# Data processing
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns

# Similarity
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
%%capture
# Execute the item-item collaborative filtering notebook so that the names it
# defines become available here; %%capture keeps that notebook's printed
# output out of this one.
# NOTE(review): get_predicted_ratings, called in the RMSE cell further down, is
# not defined or imported anywhere in this notebook — presumably it is defined
# by the notebook executed here; confirm.
%run item_item_collaborative_filtering.ipynb

In [4]:
# Load the pickled matrix that the rest of this notebook splits and evaluates.
# The evaluation loop further down treats its index as song IDs and its
# columns as user IDs (presumably an item x user ratings matrix — confirm).
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
matrix = pd.read_pickle("../data/item_matrix.pkl")

In [19]:
# Hold out 20% of the columns of `matrix` as a test set. train_test_split
# samples along the first axis, so the matrix is transposed first to put its
# columns on the rows; random_state pins the shuffle so the split is repeatable.
transposed_matrix = matrix.transpose()
train_transposed, test_transposed = train_test_split(transposed_matrix, test_size=0.2, random_state=42)

# Transpose the train and test sets back to the original format
train_matrix = train_transposed.transpose()
test_matrix = test_transposed.transpose()

# Sanity checks: both parts keep every row, and together they cover every column.
assert train_matrix.shape[0] == test_matrix.shape[0] == matrix.shape[0]
assert train_matrix.shape[1] + test_matrix.shape[1] == matrix.shape[1]

# Report the split sizes; this is the output recorded under this cell, which
# the cell could not reproduce on a fresh run without these two lines.
print("Train matrix shape:", train_matrix.shape)
print("Test matrix shape:", test_matrix.shape)

Train matrix shape: (56, 43170)
Test matrix shape: (56, 10793)


In [21]:
# RMSE evaluation: configuration and accumulators
# N is handed to get_predicted_ratings as its second argument in the loop below
# (presumably the number of neighbours to use — confirm).
N = 10

# Parallel lists: entry i of each one refers to the same (song, user) rating.
actual_ratings_list, predicted_ratings_list = [], []

# Function to calculate RMSE
def calculate_rmse(actual_ratings, predicted_ratings):
    """Return the root-mean-square error between actual and predicted ratings.

    Parameters
    ----------
    actual_ratings, predicted_ratings : array-like
        Observed and predicted values. Lists, tuples and arrays of any numeric
        dtype are accepted. Values are compared positionally and the usual
        NumPy broadcasting rules apply (so a scalar prediction is allowed).

    Returns
    -------
    numpy.float64
        ``sqrt(mean((actual - predicted) ** 2))``. NaN if there is nothing to
        average, or if any compared value is NaN.

    Raises
    ------
    ValueError
        If the two inputs have shapes that cannot be broadcast together.
    """
    # Work in float so that plain Python lists are supported and integer
    # inputs (unsigned ones in particular) cannot wrap around when subtracted
    # or squared.
    actual = np.asarray(actual_ratings, dtype=float)
    predicted = np.asarray(predicted_ratings, dtype=float)

    squared_errors = (actual - predicted) ** 2
    if squared_errors.size == 0:
        # np.mean of an empty array is NaN plus a RuntimeWarning; return the
        # same value without the warning noise.
        return np.float64(np.nan)
    return np.sqrt(np.mean(squared_errors))

# Collect an (actual, predicted) pair for every rating present in the test matrix.
# NOTE(review): predictions are requested against test_matrix itself, and the
# train_matrix produced by the split is not used in this cell — confirm that
# get_predicted_ratings does not get to see the rating it is asked to predict.
# Ratings equal to 0 are still predicted here; they are filtered out after the loop.
for userID, user_ratings in test_matrix.items():
    # Walk each user's column once and drop the missing (NaN) ratings up front,
    # instead of doing a scalar .loc lookup for every (song, user) cell.
    for songID, actual_rating in user_ratings.dropna().items():
        actual_ratings_list.append(actual_rating)

        # calculate the predicted rating
        predicted_rating = get_predicted_ratings(songID, N, userID, test_matrix)
        predicted_ratings_list.append(predicted_rating)

# Turn the accumulated Python lists into NumPy arrays so both can be indexed
# with the same set of positions
actual_ratings_array = np.array(actual_ratings_list)
predicted_ratings_array = np.array(predicted_ratings_list)

# Positions whose actual rating is non-zero (missing ratings were already
# skipped while the lists were being filled)
non_zero_indices = (actual_ratings_array != 0).nonzero()[0]
actual_ratings_non_zero, predicted_ratings_non_zero = (
    actual_ratings_array[non_zero_indices],
    predicted_ratings_array[non_zero_indices],
)

# Score the remaining pairs and report the result
rmse = calculate_rmse(actual_ratings_non_zero, predicted_ratings_non_zero)
print("RMSE:", rmse)

RMSE: 3.686965051756149
