<a href="https://colab.research.google.com/github/mattfehr/anime_recommender/blob/main/anime_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error


In [2]:
#get data for users ratings
#-1 means unrated
rating_df = pd.read_csv('rating.csv')
rating_df.head(5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [3]:
#remove possible duplicate rows
rating_df = rating_df.drop_duplicates()
rating_df

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [4]:
#replace no ratings with NaNs and drop it
rating_df = rating_df.replace(-1, np.NaN)
rating_df = rating_df.dropna(subset=['rating'])
rating_df

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10.0
81,1,11617,10.0
83,1,11757,10.0
101,1,15451,10.0
153,2,11771,10.0
...,...,...,...
7813732,73515,16512,7.0
7813733,73515,17187,9.0
7813734,73515,22145,10.0
7813735,73516,790,9.0


In [5]:
#only include users who have rated at least 100 shows

#get number of ratings from each user
user_counts = rating_df['user_id'].value_counts()

#get users with at least 100 counts
active_users = user_counts[user_counts >= 100].index

#filter out ratings not from qualified users
rating_df = rating_df[rating_df['user_id'].isin(active_users)]
rating_df


Unnamed: 0,user_id,anime_id,rating
302,5,6,8.0
303,5,15,6.0
304,5,17,6.0
305,5,18,6.0
306,5,20,6.0
...,...,...,...
7813730,73515,13659,8.0
7813731,73515,14345,7.0
7813732,73515,16512,7.0
7813733,73515,17187,9.0


In [6]:
#get number of unique users and anime
print(rating_df.nunique())

user_id     19949
anime_id     9890
rating         10
dtype: int64


In [7]:
#create user ratings matrix
pivot_matrix = rating_df.pivot(index = 'user_id', columns ='anime_id', values = 'rating')
actual_data = pivot_matrix

#normalize the matrix with centered cosine similarity (Pearson Correlation)
user_means = pivot_matrix.mean(axis=1)
pivot_matrix = pivot_matrix.sub(user_means, axis=0)
pivot_matrix = pivot_matrix.fillna(0)
pivot_matrix

anime_id,1,5,6,7,8,15,16,17,18,19,...,34238,34239,34240,34252,34283,34324,34325,34349,34367,34475
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.000000,0.000000,3.644880,0.000000,0.0,1.64488,0.0,1.64488,1.64488,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.000000,0.000000,0.098280,0.000000,0.0,0.00000,0.0,0.00000,0.00000,3.098280,...,0.0,0.0,1.09828,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73503,2.581301,0.581301,2.581301,0.000000,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73504,2.171429,2.171429,1.171429,-0.828571,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73507,1.185328,0.185328,1.185328,0.000000,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73510,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.0,0.00000,0.00000,0.000000,...,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#find sparsity and convert to sparse matrix
total_elements = pivot_matrix.size
print(f"The total number of elements is: {total_elements}")
zero_elements = (pivot_matrix == 0).sum().sum()
print(f"The number of zero/NaN elements is: {zero_elements}")
sparsity = zero_elements/total_elements
print(f"The sparsity is: {sparsity*100}%")

M = pivot_matrix.to_numpy()
M = csr_matrix(pivot_matrix)

The total number of elements is: 197295610
The number of zero/NaN elements is: 192552702
The sparsity is: 97.59603976996752%


In [9]:
#encode user_id and anime_id

#user
#user_ids = rating_df["user_id"].unique()
user_ids = pivot_matrix.index
user_to_encode = {}
encode_to_user = {}
for i, x in enumerate(user_ids):
  user_to_encode[x] = i
  encode_to_user[i] = x
rating_df["user"] = rating_df["user_id"].map(user_to_encode)

#anime
#anime_ids = rating_df["anime_id"].unique()
anime_ids = pivot_matrix.columns
anime_to_encode = {}
encode_to_anime = {}
for i, x in enumerate(anime_ids):
  anime_to_encode[x] = i
  encode_to_anime[i] = x
rating_df["anime"] = rating_df["anime_id"].map(anime_to_encode)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df["user"] = rating_df["user_id"].map(user_to_encode)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df["anime"] = rating_df["anime_id"].map(anime_to_encode)


In [10]:
#decompose matrix
U, E, Vt = svds(M, k=100)
print(U.shape)
print(E.shape)
print(Vt.shape)

#change E to diagnonal matrix
E = np.diag(E)

#combine E and Vt for R = Q * Pt
Pt = np.dot(E, Vt)

(19949, 100)
(100,)
(100, 9890)


In [11]:
#reconstruct matrix with predictions
Q = U
R = np.dot(Q, Pt)
print(R.shape)

(19949, 9890)


In [12]:
#convert Reconstructed matrix into dataframe
R_df = pd.DataFrame(R, index=user_ids, columns=anime_ids)
R_df

anime_id,1,5,6,7,8,15,16,17,18,19,...,34238,34239,34240,34252,34283,34324,34325,34349,34367,34475
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,2.633714,0.891234,1.643608,0.087473,-0.048404,0.755997,0.054666,0.154617,0.564479,1.065556,...,0.018492,-0.001503,0.065403,0.000654,0.015399,-0.010882,0.018667,0.000419,-0.000461,0.010802
7,0.315011,0.093621,0.194380,0.037005,-0.025490,0.044542,-0.054575,-0.003945,-0.001697,-0.026398,...,0.003160,0.000084,0.039998,-0.000025,0.002071,-0.004462,-0.007103,-0.000548,-0.004701,0.000731
11,-0.304484,-0.144681,-0.100836,-0.015213,-0.005762,-0.060689,-0.001875,-0.009697,-0.007124,0.173228,...,-0.005868,0.000268,-0.005981,-0.000004,0.002624,0.001992,0.005389,-0.000193,-0.000297,0.002516
14,-0.159989,-0.051825,0.054112,0.050052,0.025368,-0.003166,0.074415,0.006894,0.048567,-0.025472,...,-0.005169,-0.000048,0.020264,0.000126,0.000278,0.005256,-0.001370,0.000250,0.005225,0.000680
17,0.335725,0.235946,-0.116124,-0.117191,0.012260,0.022814,-0.405215,-0.019718,0.000633,1.080254,...,0.001803,0.000060,0.163531,-0.000169,0.005111,-0.016081,0.000211,0.000092,-0.006726,-0.004224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73503,2.309704,0.919954,0.790791,-0.053644,-0.003021,-0.088001,0.217369,-0.003448,0.041833,0.677072,...,0.002133,-0.000555,0.120941,-0.000012,-0.001254,-0.002223,-0.001159,-0.001756,0.000768,-0.003474
73504,2.387911,0.896922,0.981058,-0.065503,0.000856,-0.031293,0.192481,-0.004050,-0.019974,-0.067745,...,-0.001000,0.000177,0.013898,0.000034,-0.002548,0.004148,-0.001304,0.000414,0.002457,-0.000508
73507,1.352706,0.555130,0.528728,-0.049609,0.008131,0.018777,0.408766,0.013487,0.090806,0.527186,...,-0.005044,-0.000344,0.027698,-0.000050,-0.000880,0.005694,0.004169,-0.000583,0.002889,0.002847
73510,-0.076108,-0.013979,0.070889,-0.016473,-0.002477,0.025561,-0.026252,0.000581,0.005667,-0.046344,...,-0.003076,-0.000115,0.019758,-0.000046,-0.001330,0.002035,0.000612,-0.000018,-0.000534,-0.000639


In [13]:
#find seen shows
def find_seen_shows(data, user_id):
  seen = set()
  for anime_id, rating in data.loc[user_id].items():
    if rating != 0:
      seen.add(anime_id)
  return seen



In [14]:
#function to get top recommended shows
def find_top_shows(data, R, user_id, top_n=10):

  #sort the predictions of the user from the reconstructed matrix
  sorted_predictions = R.loc[user_id].sort_values(ascending=False)

  #get seen shows
  seen = find_seen_shows(data, user_id)

  #filter out the seen shows
  recommendations = sorted_predictions[~sorted_predictions.index.isin(seen)]
  return recommendations.head(top_n)

print(find_top_shows(pivot_matrix, R_df, 5))

anime_id
11061    3.560439
30276    2.779196
1        2.633714
6880     2.426551
28977    2.051999
3002     1.626260
4155     1.593880
13601    1.557419
7472     1.533825
33       1.485479
Name: 5, dtype: float64


In [15]:
#calculate RMSE

# Ensure the DataFrames are aligned
common_index = pivot_matrix.index.intersection(R_df.index)
common_columns = pivot_matrix.columns.intersection(R_df.columns)

In [16]:
# Align the DataFrames
aligned_actual = pivot_matrix.loc[common_index, common_columns]
aligned_predicted = R_df.loc[common_index, common_columns]

In [17]:
# Convert to NumPy arrays
actual_ratings = aligned_actual.to_numpy()
predicted_ratings = aligned_predicted.to_numpy()

# Create a mask for non-zero entries (i.e., where actual ratings are present)
mask = actual_ratings > 0

# Apply mask to actual and predicted ratings
actual_ratings = actual_ratings[mask]
predicted_ratings = predicted_ratings[mask]

In [18]:
# Calculate RMSE
rmse = mean_squared_error(actual_ratings, predicted_ratings, squared=False)
print(f"RMSE: {rmse}")

RMSE: 0.8786403215271984
