# Collaborative filtering con surprise

Fuente: https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
from scipy import spatial
import surprise
import surprise.model_selection

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
BASE_PATH = "dataset"
movies = pd.read_csv(f"{BASE_PATH}/movies_metadata.csv", low_memory=False, dtype={'id':str, 'original_title':str})
user_ratings = pd.read_csv(f"{BASE_PATH}/ratings.csv", dtype={'userId': str, 'movieId': str, 'rating': float,'timestamp': int})
id_links = pd.read_csv(f"{BASE_PATH}/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})

In [4]:
def get_movie_name(movie_tmdb_id: str) -> str:
    return movies[movies["id"] == movie_tmdb_id]["original_title"][0]

def get_movie_id(movie_name: str) -> str:
    return movies[movies["original_title"] == movie_name]["id"].iloc[0]

In [5]:
def decorate_with_titles(df: pd.DataFrame):
    df_with_titles = pd.merge(df, movies[["id", "original_title"]], left_on="tmdbId", right_on="id", how="left")
    return df_with_titles.drop('id', axis=1) # 1 = columns
    

In [6]:
# Extend user ratings with additional information

# movie ids
user_ratings_ids = pd.merge(user_ratings, id_links[["movieId", "tmdbId"]], left_on='movieId', right_on='movieId', how='left')

In [7]:
# movie names
user_ratings_titles = decorate_with_titles(user_ratings_ids)

In [83]:
#user_ratings_small = user_ratings_titles[:len(user_ratings_titles) //128]
user_ratings_small = user_ratings_titles[:len(user_ratings_titles) // 2**7]
len(user_ratings_small)

203389

In [84]:
def predict_ratings_for_movies(model, user_id: str, movie_ids: pd.Series) -> pd.DataFrame:
    rows = []
    movies_rated_by_user = user_ratings_small[user_ratings_small['userId'] == user_id]["tmdbId"]
    movie_ids = remove_rated_movies(movie_ids, movies_rated_by_user)

    for _, movie_id in movie_ids.items():
        pred = model.predict(user_id, movie_id)
        rows.append({"tmdbId": movie_id, "est_rating": pred.est})

    return decorate_with_titles(pd.DataFrame.from_records(rows)).sort_values(by="est_rating", ascending=False).head(10)

In [85]:
def remove_rated_movies(movie_ids: pd.Series, rated_movies: pd.Series) -> pd.Series:
    # https://stackoverflow.com/questions/69774160/how-to-delete-values-from-one-pandas-series-that-are-common-to-another
    return movie_ids[~np.isin(movie_ids, rated_movies)]

## Evaluación

In [86]:
# Cuantos users tienen cierta cantidad de ratings?
#user_ratings_ids_sin_friki = user_ratings_ids[user_ratings_ids["userId"] != "45811"]
#user_ratings_ids.groupby(["userId"]).count().reset_index().groupby(["rating"]).count().sort_values("userId")
#user_ratings_small.groupby(["userId"]).count().reset_index().groupby(["rating"]).count().sort_values("userId")

Unnamed: 0_level_0,userId,movieId,timestamp,tmdbId
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18276,1,1,1,1
1801,1,1,1,1
1271,1,1,1,1
1809,1,1,1,1
1810,1,1,1,1
...,...,...,...,...
10,6795,6795,6795,6795
17,7012,7012,7012,7012
5,7324,7324,7324,7324
16,10105,10105,10105,10105


In [13]:
user_ratings_ids[user_ratings_ids["userId"] == "45811"]

Unnamed: 0,userId,movieId,rating,timestamp,tmdbId
4439700,45811,1,4.0,1450166644,862
4439701,45811,2,3.0,1473241811,8844
4439702,45811,6,4.0,1450164123,949
4439703,45811,9,2.5,1482568533,9091
4439704,45811,10,3.0,1485704866,710
...,...,...,...,...,...
4457971,45811,175941,3.0,1501022661,81313
4457972,45811,175945,2.0,1501022673,190817
4457973,45811,175951,1.5,1501022675,120831
4457974,45811,175967,2.5,1501022665,43656


Para usar MARK y MAPK, tenemos que pasar dos listas de la misma longitud que representan los ratings reales (*actual*) y
los ratings predichos (*predicted*).

Para separar el dataset en train y test, vamos a sacar 10 películas de cada usuario. Luego vamos a predecir 10 para cada uno y
aplicar MARK y MAPK para evaluar cada modelo.

Previamente vamos a filtrar el dataset de todos los usuarios que no tengan al menos 15 películas, para tener suficientes para entrenar.

In [87]:
# 1. Sacar los que tienen menos de 15
user_rating_count = user_ratings_small.groupby(["userId"]).count()
users_to_remove = user_rating_count[user_rating_count["movieId"] < 15].reset_index()["userId"]
filtered_user_ratings = user_ratings_small[~user_ratings_small["userId"].isin(users_to_remove)]

In [88]:
# 2. Sacar 10 de cada uno para test
user_ids = filtered_user_ratings["userId"].unique()
train = filtered_user_ratings
test = pd.DataFrame(columns=filtered_user_ratings.columns)

for user_id in user_ids:
    movies_of_user = filtered_user_ratings[filtered_user_ratings["userId"] == user_id].sample(n=10)
    train.drop(movies_of_user.index, inplace=True)
    test = pd.concat([test, movies_of_user])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [89]:
# 3. Entrenamos el modelo con la data de train
# user_ratings_small_sup = user_ratings_small[['userId', 'tmdbId', 'rating']].dropna()
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(train[['userId', 'tmdbId', 'rating']], reader)

In [90]:
svd = surprise.SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9f4a1ab1f0>

In [91]:
import average_precision
import recmetrics.metrics
from tqdm import tqdm

In [95]:

predicted = []
actual = []
user_ids = train["userId"].unique()
all_movie_ids = movies["id"]
for user_id in tqdm(user_ids):
    predicted_movies = predict_ratings_for_movies(svd, user_id, all_movie_ids)["tmdbId"]
    predicted.append(predicted_movies)

    actual_movies = test[test["userId"] == user_id]["tmdbId"]
    actual.append(actual_movies)

print("Finished!")

100%|█████████████████████| 1663/1663 [07:57<00:00,  3.48it/s]

Finished!





In [96]:
predicted2 = list(map(list, predicted))
actual2 = list(map(list, actual))

In [97]:
mark = recmetrics.metrics.mark(predicted2, actual2)
mapk = average_precision.mapk(predicted2, actual2)

In [98]:
mark, mapk

(0.0, 0.0)

In [92]:
def split_into_chunks(elems: list, chunks: int):
    chunk_size = len(elems)//chunks
    rem = len(elems)%chunks
    chunks_split = [ elems[chunk_size*i:chunk_size*(i+1)] for i in range(0, chunks)]

    # Agregamos el resto al último
    chunks_split[chunks-1].extend(user_ids[len(user_ids) - rem:])

    return chunks_split

In [93]:
%%time
from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce


def predict(i: int, return_dict, users: List[str]):
    actual = []
    predicted = []
    print(f"[{i}]: start")
    for user_id in tqdm(users, position=i, desc=f" proc #{i}"):
        predicted_movies = predict_ratings_for_movies(svd, user_id, all_movie_ids)["tmdbId"]
        actual_movies = test[test["userId"] == user_id]["tmdbId"]
        
        predicted.append(list(predicted_movies))
        actual.append(list(actual_movies))
        
    return_dict[i] = {
        "pred": predicted,
        "actual": actual,
    }

    print(f"[{i}]: finish")

users_split = split_into_chunks(list(train["userId"].unique()), 8)

procs = []
manager = Manager()
return_dict = manager.dict()
for i, chunk in enumerate(users_split):
    p = Process(target=predict, args=(i, return_dict, chunk))
    p.start()
    procs.append(p)

for p in procs:
    p.join()

predicted = []
actual = []    

for return_value in return_dict.values():
    predicted.extend(return_value["pred"])
    actual.extend(return_value["actual"])
print("Finished!")

[0]: start
[1]: start
[2]: start
[3]: start
[4]: start
[5]: start[6]: start

[7]: start
[5]: finish
[0]: finish
[6]: finish
[1]: finish
[3]: finish
[7]: finish
[4]: finish
[2]: finish
Finished!
CPU times: user 59.6 ms, sys: 129 ms, total: 189 ms
Wall time: 1min 46s


In [94]:
mark = recmetrics.metrics.mark(predicted, actual, k=10)
mapk = average_precision.mapk(predicted, actual, k=10)
mark, mapk

(0.0, 0.0)

In [None]:
# test is in
df = pd.DataFrame({
    'ids': [1, 2, 3, 4],
    'values': ["a", "b", "c", "d"]
})

to_remove = pd.Series([1, 2])
df[~df["ids"].isin(to_remove)]

Unnamed: 0,ids,values
2,3,c
3,4,d


In [None]:
# TODO: Tal vez da 0,0 porque los tipos de ids de actual y predicted son int y string.