# Collaborative filtering con surprise

Fuente: https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

In [2]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
from scipy import spatial
import surprise
import surprise.model_selection

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
BASE_PATH = "dataset/"
movies = pd.read_csv(f"{BASE_PATH}/movies_metadata.csv", low_memory=False, dtype={'id':str, 'original_title':str})
user_ratings = pd.read_csv(f"{BASE_PATH}/ratings.csv", dtype={'userId': str, 'movieId': str, 'rating': float,'timestamp': int})
id_links = pd.read_csv(f"{BASE_PATH}/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})

In [4]:
def get_movie_name(movie_tmdb_id: str) -> str:
    return movies[movies["id"] == movie_tmdb_id]["original_title"][0]

def get_movie_id(movie_name: str) -> str:
    return movies[movies["original_title"] == movie_name]["id"].iloc[0]

In [5]:
def decorate_with_titles(df: pd.DataFrame):
    df_with_titles = pd.merge(df, movies[["id", "original_title"]], left_on="tmdbId", right_on="id", how="left")
    return df_with_titles.drop('id', axis=1) # 1 = columns
    

In [6]:
# Extend user ratings with additional information

# movie ids
user_ratings_ids = pd.merge(user_ratings, id_links[["movieId", "tmdbId"]], left_on='movieId', right_on='movieId', how='left')

In [7]:
# movie names
user_ratings_titles = decorate_with_titles(user_ratings_ids)

In [8]:
user_ratings_small = user_ratings_titles[:len(user_ratings_titles) //128]

In [9]:
user_ratings_small_sup = user_ratings_small[['userId', 'tmdbId', 'rating']].dropna()
user_ratings_sup = user_ratings_titles[['userId', 'tmdbId', 'rating']].dropna()

In [10]:
user_ratings_small_sup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203289 entries, 0 to 203388
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userId  203289 non-null  object 
 1   tmdbId  203289 non-null  object 
 2   rating  203289 non-null  float64
dtypes: float64(1), object(2)
memory usage: 6.2+ MB


In [11]:
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(user_ratings_small_sup, reader)

In [12]:
#svd = surprise.SVD()
#surprise.model_selection.cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

In [13]:
svd = surprise.SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd298acf790>

In [31]:
def predict_ratings_for_movies(model, user_id: str, movie_ids: pd.Series) -> pd.DataFrame:
    rows = []
    movies_rated_by_user = user_ratings_small[user_ratings_small['userId'] == user_id]["tmdbId"]
    movie_ids = remove_rated_movies(movie_ids, movies_rated_by_user)

    for _, movie_id in movie_ids.items():
        pred = model.predict(user_id, movie_id)
        rows.append({"tmdbId": movie_id, "est_rating": pred.est})

    return decorate_with_titles(pd.DataFrame.from_records(rows)).sort_values(by="est_rating", ascending=False).head(10)

In [26]:
def remove_rated_movies(movie_ids: pd.Series, rated_movies: pd.Series) -> pd.Series:
    # https://stackoverflow.com/questions/69774160/how-to-delete-values-from-one-pandas-series-that-are-common-to-another
    return movie_ids[~np.isin(movie_ids, rated_movies)]

In [32]:
all_movie_ids = movies["id"]
predict_ratings_for_movies(svd, "2", all_movie_ids)

Unnamed: 0,tmdbId,est_rating,original_title
863,3078,4.470912,It Happened One Night
7207,553,4.258784,Dogville
4769,269,4.249035,À bout de souffle
5467,129,4.234181,千と千尋の神隠し
3366,10774,4.221673,Network
1124,975,4.209552,Paths of Glory
1158,175,4.202627,Le Grand Bleu
18479,77338,4.193433,Intouchables
1159,11645,4.187255,乱
2594,247,4.166274,The Killing


## Evaluación

In [73]:
# Cuantos users tienen cierta cantidad de ratings?
#user_ratings_ids_sin_friki = user_ratings_ids[user_ratings_ids["userId"] != "45811"]
#user_ratings_ids.groupby(["userId"]).count().reset_index().groupby(["rating"]).count().sort_values("userId")
#user_ratings_small.groupby(["userId"]).count().reset_index().groupby(["rating"]).count().sort_values("userId")

Unnamed: 0_level_0,userId,movieId,timestamp,tmdbId,original_title
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2814,1,1,1,1,1
404,1,1,1,1,1
282,1,1,1,1,1
401,1,1,1,1,1
400,1,1,1,1,1
...,...,...,...,...,...
6,54,54,54,54,54
17,59,59,59,59,59
5,65,65,65,65,65
16,78,78,78,78,78


In [46]:
user_ratings_ids[user_ratings_ids["userId"] == "45811"]

Unnamed: 0,userId,movieId,rating,timestamp,tmdbId
4439700,45811,1,4.0,1450166644,862
4439701,45811,2,3.0,1473241811,8844
4439702,45811,6,4.0,1450164123,949
4439703,45811,9,2.5,1482568533,9091
4439704,45811,10,3.0,1485704866,710
...,...,...,...,...,...
4457971,45811,175941,3.0,1501022661,81313
4457972,45811,175945,2.0,1501022673,190817
4457973,45811,175951,1.5,1501022675,120831
4457974,45811,175967,2.5,1501022665,43656


Para usar MARK y MAPK, tenemos que pasar dos listas de la misma longitud que representan los ratings reales (*actual*) y
los ratings predichos (*predicted*).

Para separar el dataset en train y test, vamos a sacar 10 películas de cada usuario. Luego vamos a predecir 10 para cada uno y
aplicar MAR y MAP para evaluar cada modelo.

Previamente vamos a filtrar el dataset de todos los usuarios que no tengan al menos 15 películas, para tener suficientes para entrenar.

In [96]:
# 1. Sacar los que tienen menos de 15
user_rating_count = user_ratings_small.groupby(["userId"]).count()
users_to_remove = user_rating_count[user_rating_count["movieId"] < 15].reset_index()["userId"]
filtered_user_ratings = user_ratings_small[~user_ratings_small["userId"].isin(users_to_remove)]

In [101]:
# 2. Sacar 10 de cada uno para test
user_ids = filtered_user_ratings["userId"].unique()
train = filtered_user_ratings
test = pd.DataFrame(columns=filtered_user_ratings.columns)

for user_id in user_ids:
    # TODO: aca tal vez estamos agarrando películas con mal rating para test, así
    # premiando a los modelos que recomienden pelis que no les gustaron al user
    movies_of_user = filtered_user_ratings[filtered_user_ratings["userId"] == user_id].head(10)
    train.drop(movies_of_user.index, inplace=True)
    test = pd.concat([test, movies_of_user])

In [118]:
user_id = "1"
predicted_movies = predict_ratings_for_movies(svd, user_id, all_movie_ids)["tmdbId"]
#ctual_movies = test[test["userId" == user_id]]["tmdbId"]
predict_ratings_for_movies(svd, user_id, all_movie_ids)["tmdbId"]
test[test["userId"]== user_id]

Unnamed: 0,userId,movieId,rating,timestamp,tmdbId,original_title
10,1,4878,5.0,1425941434,141,Donnie Darko
11,1,5577,5.0,1425941397,9685,Igby Goes Down
12,1,33794,4.0,1425942005,272,Batman Begins
13,1,54503,3.5,1425941313,8363,Superbad
14,1,58559,4.0,1425942007,155,The Dark Knight
15,1,59315,5.0,1425941502,1726,Iron Man
16,1,68358,5.0,1425941464,13475,Star Trek
17,1,69844,5.0,1425942139,767,Harry Potter and the Half-Blood Prince
18,1,73017,5.0,1425942699,10528,Sherlock Holmes
19,1,81834,5.0,1425942133,12444,Harry Potter and the Deathly Hallows: Part 1


In [124]:
#import ml_metrics
import recmetrics.metrics
from tqdm import tqdm

In [125]:
rows = []

for user_id in tqdm(train["userId"]):

    predicted_movies = predict_ratings_for_movies(svd, user_id, all_movie_ids)["tmdbId"]
    actual_movies = test[test["userId"] == user_id]["tmdbId"]
    
    mark = recmetrics.metrics.mark(predicted_movies, actual_movies)
    #mapk = mar(
    
    rows.append({"userId": user_id, "mar_k": mark, "map_k": None})

df = pd.DataFrame.from_records(rows)

  0%|                                                                                          | 96/183329 [00:21<11:27:55,  4.44it/s]


KeyboardInterrupt: 

In [122]:
df = pd.DataFrame.from_records(rows)
df["mar_k"].mean()

0.19543246018038987

In [82]:
# test is in
df = pd.DataFrame({
    'ids': [1, 2, 3, 4],
    'values': ["a", "b", "c", "d"]
})

to_remove = pd.Series([1, 2])
df[~df["ids"].isin(to_remove)]

Unnamed: 0,ids,values
2,3,c
3,4,d
