# Collaborative filtering con surprise

Fuente: https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

In [1]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
from scipy import spatial
import surprise
import surprise.model_selection

from sklearn.metrics.pairwise import cosine_similarity

In [46]:
BASE_PATH = "dataset/"
movies = pd.read_csv(f"{BASE_PATH}/movies_metadata.csv", low_memory=False, dtype={'id':str, 'original_title':str})
user_ratings = pd.read_csv(f"{BASE_PATH}/ratings.csv", dtype={'userId': str, 'movieId': str, 'rating': float,'timestamp': int})
id_links = pd.read_csv(f"{BASE_PATH}/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})

In [47]:
def get_movie_name(movie_tmdb_id: str) -> str:
    return movies[movies["id"] == movie_tmdb_id]["original_title"][0]

def get_movie_id(movie_name: str) -> str:
    return movies[movies["original_title"] == movie_name]["id"].iloc[0]

In [48]:
def decorate_with_titles(df: pd.DataFrame):
    df_with_titles = pd.merge(df, movies[["id", "original_title"]], left_on="tmdbId", right_on="id", how="left")
    return df_with_titles.drop('id', axis=1) # 1 = columns
    

In [49]:
# Extend user ratings with additional information

# movie ids
user_ratings_ids = pd.merge(user_ratings, id_links[["movieId", "tmdbId"]], left_on='movieId', right_on='movieId', how='left')

In [50]:
# movie names
user_ratings_titles = decorate_with_titles(user_ratings_ids)

In [51]:
user_ratings_small = user_ratings_titles[:len(user_ratings_titles) //128]

In [68]:
user_ratings_small_sup = user_ratings_small[['userId', 'tmdbId', 'rating']].dropna()
user_ratings_sup = user_ratings_titles[['userId', 'tmdbId', 'rating']].dropna()

In [53]:
user_ratings_small_sup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203289 entries, 0 to 203388
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  203289 non-null  object 
 1   tmdbId  203289 non-null  object 
 2   rating  203289 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.2+ MB


In [69]:
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(user_ratings_sup, reader)

In [55]:
#svd = surprise.SVD()
#surprise.model_selection.cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

In [70]:
svd = surprise.SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f8e4aa729a0>

In [71]:
def predict_ratings_for_movies(user_id: str, movie_ids: pd.Series) -> pd.DataFrame:
    rows = []
    for _, movie_id in movie_ids.items():
        pred = svd.predict(user_id, movie_id)
        rows.append({"tmdbId": movie_id, "est_rating": pred.est})

    return decorate_with_titles(pd.DataFrame.from_records(rows))

In [75]:
all_movie_ids = movies["id"]
df = predict_ratings_for_movies("2", all_movie_ids)

In [77]:
df.sort_values(by="est_rating", ascending=False).head(15)

Unnamed: 0,tmdbId,est_rating,original_title
14761,20453,4.552063,3 Idiots
18501,77338,4.454476,Intouchables
43379,420714,4.420508,Planet Earth II
15111,141714,4.405043,Wild China
9292,40096,4.395886,O Auto da Compadecida
16988,367647,4.375887,Connections
34834,282758,4.372871,Doctor Who: The Runaway Bride
21502,122906,4.321955,About Time
41599,381284,4.31268,Hidden Figures
33473,359364,4.311578,Human


In [78]:
user_ratings_small[user_ratings_small['userId'] == "2"].sort_values(by="rating", ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp,tmdbId,original_title
46,2,1356,5.0,867039288,199,Star Trek: First Contact
35,2,339,5.0,867041296,2064,While You Were Sleeping
38,2,628,4.0,867039325,1592,Primal Fear
31,2,64,4.0,867039612,19760,Two If by Sea
32,2,79,4.0,867039325,9623,The Juror
45,2,1233,4.0,867039820,387,Das Boot
34,2,260,4.0,867039249,11,Star Wars
36,2,377,4.0,867041121,1637,Speed
37,2,605,4.0,867039973,7300,One Fine Day
44,2,1210,4.0,867039325,1892,Return of the Jedi


In [32]:
algo = surprise.NormalPredictor()
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x7f8e18422760>