# Collaborative filtering con surprise

Fuente: https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import math
import scipy.stats as stats
from scipy import spatial
import surprise
import surprise.model_selection

from sklearn.metrics.pairwise import cosine_similarity

from tqdm.notebook import tqdm

import time

In [3]:
# Directory that contains the dataset CSV files.
BASE_PATH = "dataset"

# Movie metadata. "id" is read as text here and cast to int in a later cell,
# after the rows with non-numeric ids have been removed.
movies = pd.read_csv(
    f"{BASE_PATH}/movies_metadata.csv",
    dtype={'id': str, 'original_title': str},
    low_memory=False,
)

# One row per (userId, movieId) rating.
user_ratings = pd.read_csv(
    f"{BASE_PATH}/ratings.csv",
    dtype={'userId': int, 'movieId': int, 'rating': float, 'timestamp': int},
)

# Lookup table used later to translate movieId into tmdbId.
id_links = pd.read_csv(
    f"{BASE_PATH}/links.csv",
    dtype={'movieId': str, 'tmdbId': str},
)

In [4]:
# Remove the problematic ids and cast the movie ids to int so that every
# later comparison/join is faster.
malformed_ids = ["1997-08-20", "2012-09-29", "2014-01-01"]
movies = movies[~movies["id"].isin(malformed_ids)]
movies = movies.astype({"id": int})

# dropna first because tmdbId is sometimes NaN, which cannot be cast to int.
id_links = id_links.dropna()
id_links = id_links.astype({"tmdbId": int, "movieId": int})

In [5]:
# Remove duplicated movies; some of them, such as id 69234, appear twice.
len_before = len(movies)
# drop=True discards the old row labels instead of inserting them into the
# frame as a stray "index" column.
movies = movies.drop_duplicates(subset=["id"]).reset_index(drop=True)
print(f"before: {len_before}, after: {len(movies)}, diff: {len_before - len(movies)}")

before: 45463, after: 45433, diff: 30


In [6]:
# Extend user ratings with the tmdbId of each rated movie.
# Ratings whose movieId has no row in id_links come out of the left join with
# a missing tmdbId, so they are dropped. The cast back to int is needed
# because those missing values turn the tmdbId column into float.
user_ratings = (
    pd.merge(
        user_ratings,
        id_links[["movieId", "tmdbId"]],
        left_on='movieId',
        right_on='movieId',
        how='left',
    )
    .dropna(axis=0, subset=["tmdbId"])
    .astype({"tmdbId": int, "movieId": int})
)

In [7]:
# Drop the ratings of movies we have no metadata for: some movies, such as
# "253768", are present in ratings but not in movies.
len_before = len(user_ratings)
rated_movies = user_ratings["tmdbId"]
# One membership test against the metadata ids is enough. The same boolean
# mask selects both the id series and the ratings rows, which avoids a second
# isin() against a ratings-sized series and yields exactly the same rows.
has_metadata = rated_movies.isin(movies["id"])
rated_movies_with_metadata = rated_movies[has_metadata]
user_ratings = user_ratings[has_metadata]
print(f"before: {len_before} after: {len(user_ratings)}")

before: 26010786 after: 25981582


In [8]:
# Optional down-sampling for quick experiments: uncomment the next line to
# work with 1/64 of the ratings instead of the full set.
#user_ratings_small = user_ratings.sample(len(user_ratings) // 2**6)
user_ratings_small = user_ratings
print(f"{len(user_ratings_small)} of {len(user_ratings)}")

25981582 of 25981582


In [9]:
# Decoro user_ratings con el indice de movies para poder hacer los drops más rápido
#movie_id_to_movies_index = pd.Series(movies.index, index=movies['id'])
#user_ratings_small['movie_index'] = movie_id_to_movies_index[user_ratings_small['tmdbId']].values

In [10]:
def decorate_with_titles(df: pd.DataFrame):
    """Return ``df`` with an extra ``original_title`` column.

    ``df`` must have a ``tmdbId`` column; it is left-joined against the
    module-level ``movies`` frame on ``movies["id"]``, so rows without a
    matching movie are kept with a missing title.
    """
    titles = movies[["id", "original_title"]]
    merged = df.merge(titles, how="left", left_on="tmdbId", right_on="id")
    # "id" only served as the join key on the movies side.
    return merged.drop(columns="id")
# movie names
#user_ratings_titles = decorate_with_titles(user_ratings_ids)

In [11]:
import time

def timestep(start: float, name: str) -> float:
    """Print the seconds elapsed since ``start`` and return the current time.

    Args:
        start: Reference timestamp, as returned by ``time.time()``.
        name: Label printed in front of the elapsed time.

    Returns:
        The timestamp taken by this call, so calls can be chained as
        ``start = timestep(start, "label")``.
    """
    current = time.time()
    elapsed = current - start
    print(f"{name}: {elapsed}")
    return current

In [13]:
def predict_ratings_for_movies(model, user_id: int, train, top_n: int = 10) -> pd.DataFrame:
    """Recommend the ``top_n`` unseen movies with the highest estimated rating.

    Every movie in the module-level ``movies`` frame that ``user_id`` has not
    rated in ``train`` is scored with ``model.predict``.

    Args:
        model: Fitted model exposing ``predict(user_id, movie_id)`` whose
            result has an ``est`` attribute.
        user_id: User to recommend for.
        train: Ratings frame with ``userId`` and ``tmdbId`` columns; movies the
            user rated here are excluded from the candidates.
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with columns ``tmdbId`` and ``est_rating``, sorted by
        ``est_rating`` descending, with at most ``top_n`` rows. It is empty
        when there is no candidate movie left.
    """
    movies_rated_by_user = train.loc[train["userId"] == user_id, "tmdbId"]
    candidate_ids = remove_rated_movies(movies["id"], movies_rated_by_user)

    rows = [
        {"tmdbId": movie_id, "est_rating": model.predict(user_id, movie_id).est}
        for movie_id in candidate_ids
    ]

    # Naming the columns explicitly keeps "est_rating" present even when there
    # are no candidates; otherwise sort_values would raise KeyError.
    df = pd.DataFrame(rows, columns=["tmdbId", "est_rating"])

    # Titles are not attached here on purpose: decorate_with_titles is slow
    # and callers only need the ids.
    return df.sort_values(by="est_rating", ascending=False).head(top_n)

In [14]:
def predict_ratings_for_movies_eff(model, user_id: int, train) -> pd.DataFrame:
    """Batch variant of ``predict_ratings_for_movies`` based on ``model.test``.

    Instead of calling ``predict`` once per movie, a dummy Surprise dataset
    holding every unseen movie for ``user_id`` is built and scored in one
    ``test`` call. Timing information for each stage is printed.

    Args:
        model: Fitted Surprise algorithm; its ``test`` method is used.
        user_id: User to recommend for.
        train: Ratings frame with ``userId`` and ``tmdbId`` columns; movies the
            user rated here are excluded from the candidates.

    Returns:
        DataFrame with columns ``tmdbId`` and ``est_rating`` holding the 10
        candidates with the highest estimated rating, best first.

    Note:
        Relies on the module-level ``movies`` frame and ``reader`` object.
    """
    movies_rated_by_user = train[train['userId'] == user_id]["tmdbId"]
    movie_ids = remove_rated_movies(movies["id"], movies_rated_by_user)

    start = time.time()
    # Dummy dataset to hand to .test(): the rating value is a placeholder, only
    # the (user, movie) pairs matter for the estimates.
    fake_df = pd.DataFrame(columns=['rating', 'tmdbId', 'userId'])
    fake_df['tmdbId'] = movie_ids
    fake_df['userId'] = user_id
    fake_df['rating'] = 0.0

    fake_trainset = surprise.Dataset.load_from_df(fake_df[['userId', 'tmdbId', 'rating']], reader)
    testset = fake_trainset.build_full_trainset().build_testset()
    start = timestep(start, "test set")
    # Use the model that was passed in; the previous code ignored the
    # parameter and always scored with the global `svd`.
    result = model.test(testset)
    start = timestep(start, "test")

    rows = [{"tmdbId": pred.iid, "est_rating": pred.est} for pred in result]
    timestep(start, "rows")

    df = pd.DataFrame.from_records(rows)
    top_10_movies = df.sort_values(by="est_rating", ascending=False).head(10)

    return top_10_movies

In [15]:
def remove_rated_movies(movie_ids: pd.Series, rated_movies: pd.Series) -> pd.Series:
    """Return the entries of ``movie_ids`` that do not appear in ``rated_movies``.

    The original index labels and order of ``movie_ids`` are preserved.
    """
    # https://stackoverflow.com/questions/69774160/how-to-delete-values-from-one-pandas-series-that-are-common-to-another
    not_rated = np.isin(movie_ids, rated_movies, invert=True)
    return movie_ids[not_rated]

In [16]:
# TODO (optimizacion): para sacar los ids rateados mas rapido, se puede decorar user ratings
# con el indice de la pelicula en movies para así hacer drop en remove_rated_movies.

## Evaluación

Para usar MARK y MAPK (MAR@K y MAP@K), tenemos que pasar dos listas de la misma longitud, con un elemento por usuario: las películas que el usuario realmente calificó (*actual*) y
las películas recomendadas por el modelo (*predicted*).

Para separar el dataset en train y test, vamos a sacar 10 películas de cada usuario. Luego vamos a predecir 10 para cada uno y
aplicar MARK y MAPK para evaluar cada modelo.

Previamente vamos a descartar del dataset a todos los usuarios que tengan menos de 15 películas calificadas, para que a cada uno le queden suficientes ratings para entrenar luego de separar las 10 de test.

In [17]:
# 1. Drop the movies we have no metadata for: some movies, such as "253768",
# are present in ratings but not in movies.
rated_movies = user_ratings_small["tmdbId"]
# A single isin() against the metadata ids produces the mask for both the id
# series and the ratings rows; a second isin() against a ratings-sized series
# would select exactly the same rows at a much higher cost.
has_metadata = rated_movies.isin(movies["id"])
rated_movies_with_metadata = rated_movies[has_metadata]
metadata_filtered_user_ratings = user_ratings_small[has_metadata]

print(f"total: {len(user_ratings_small)}, after filter: {len(metadata_filtered_user_ratings)}")

total: 25981582, after filter: 25981582


In [18]:
# 2. Drop the users that have fewer than 15 ratings
user_rating_count = metadata_filtered_user_ratings.groupby(["userId"]).count()
has_too_few = user_rating_count["movieId"] < 15
users_to_remove = user_rating_count[has_too_few].reset_index()["userId"]

# Keep only the rows that belong to users who were not flagged above.
keep_row = ~metadata_filtered_user_ratings["userId"].isin(users_to_remove)
filtered_user_ratings = metadata_filtered_user_ratings[keep_row]

print(f"total: {len(metadata_filtered_user_ratings)}, after filter: {len(filtered_user_ratings)}")
print(f"(diff = {len(metadata_filtered_user_ratings) - len(filtered_user_ratings)})")

total: 25981582, after filter: 25556154
(diff = 425428)


In [19]:
def split_into_chunks(elems: list, chunks: int):
    """Split ``elems`` into ``chunks`` consecutive lists of equal size.

    Each chunk gets ``len(elems) // chunks`` elements; whatever is left over
    is appended to the last chunk.
    """
    size, leftover = divmod(len(elems), chunks)

    parts = []
    for idx in range(chunks):
        parts.append(elems[idx * size:(idx + 1) * size])

    # The remainder goes to the last chunk.
    parts[chunks - 1].extend(elems[len(elems) - leftover:])

    return parts

In [None]:
%%time
# 3. Sacar 10 de cada uno para test
NUM_PROCS = 6

from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def sample_test_ratings(procnum: int, return_dict, users: List[int]):
    """Worker: sample 10 ratings of each user in ``users`` for the test split.

    Args:
        procnum: Worker number; used as the key in ``return_dict``, as the
            progress-bar position and in the log messages.
        return_dict: Shared mapping where the sampled frame is stored under
            ``procnum``.
        users: Ids of the users handled by this worker.

    The rows are taken from the module-level ``filtered_user_ratings`` frame
    and keep their original index labels, so the caller can drop them from the
    training data by index.
    """
    print(f"[{procnum}] start")

    # Restrict to this worker's users and group once, instead of scanning the
    # whole ratings frame again for every single user.
    own_ratings = filtered_user_ratings[filtered_user_ratings["userId"].isin(users)]
    ratings_by_user = own_ratings.groupby("userId")

    # Collect the samples and concatenate once at the end; growing a DataFrame
    # with concat inside the loop copies all previous rows on every iteration.
    samples = [
        ratings_by_user.get_group(user_id).sample(n=10)
        for user_id in tqdm(users, position=procnum, desc=f" proc #{procnum}")
    ]

    if samples:
        test = pd.concat(samples)
    else:
        # No users assigned to this worker: return an empty frame with the
        # expected columns.
        test = pd.DataFrame(columns=filtered_user_ratings.columns)

    return_dict[procnum] = test
    print(f"[{procnum}] finish")

# Unique user ids, split into one chunk per worker process.
user_ids = list(filtered_user_ratings["userId"].unique())
users_split = split_into_chunks(user_ids, NUM_PROCS)

procs = []
# Manager-backed dict that each worker uses to hand its sampled frame back.
manager = Manager()
return_dict = manager.dict()
for i, chunk in enumerate(users_split):
    p = Process(target=sample_test_ratings, args=(i, return_dict, chunk))
    p.start()
    procs.append(p)

# Wait for every worker before reading return_dict.
for p in procs:
    p.join()

# Stack the per-worker samples into a single test frame.
test = pd.DataFrame(columns=filtered_user_ratings.columns)
for return_value in return_dict.values():
    test = pd.concat([test, return_value])
    
# Train is every rating that was not sampled for test; rows are matched by
# index label.
train = filtered_user_ratings.drop(test.index)

print("Finished!")

[0] start[1] start

[2] start
[3] start
[4] start[5] start



HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #4'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #5'), FloatProgress(value=0.0, max=35413.0), HTML(value='')))

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# Persist the train/test split (index included) so that later sessions can
# load it instead of re-sampling.
train.to_csv("dfs/collab2-train.csv")
test.to_csv("dfs/collab2-test.csv")

In [20]:
# Load the saved train/test split; index_col=0 turns the first CSV column
# back into the frame index.
train = pd.read_csv("dfs/collab2-train.csv", index_col=0, dtype={"tmdbId": int})
test = pd.read_csv("dfs/collab2-test.csv", index_col=0, dtype={"tmdbId": int})

  mask |= (ar1 == a)


In [21]:
# 3. Train the model with the train data
# The rating scale is read from the data instead of being hard-coded as
# (1, 5): Surprise clips its estimates to this range, so it has to cover every
# rating value that actually occurs in train.
rating_scale = (train["rating"].min(), train["rating"].max())
reader = surprise.Reader(rating_scale=rating_scale)
data = surprise.Dataset.load_from_df(train[['userId', 'tmdbId', 'rating']], reader)

In [22]:
%%time
# Use every rating in `data` for training (no held-out fold here; the test
# split was separated beforehand).
trainset = data.build_full_trainset()

# SVD with the library's default hyper-parameters.
svd = surprise.SVD()
svd.fit(trainset)

print("Fit done")

Fit done
CPU times: user 18min 54s, sys: 1min 12s, total: 20min 7s
Wall time: 21min 41s


In [None]:
# Smoke test: recommendations for user 1 using the per-movie predict loop.
predict_ratings_for_movies(svd, 1, train)

In [None]:
# Smoke test: same recommendations for user 1 using the batch (.test) variant.
predict_ratings_for_movies_eff(svd, 1, train)

In [23]:
# Remove users that were already processed, so an interrupted run can resume.
import result_io
processed_users = result_io.read_processed_users()

# De-duplicate first: the membership test then runs once per user instead of
# once per rating row. Order of first appearance in train is preserved.
all_users = train["userId"].unique()
users = list(all_users[~np.isin(all_users, processed_users)])

print(f"Already processed {len(processed_users)}/{len(users) + len(processed_users)} users")

Already processed 195888/212468 users


In [24]:
%%time
from multiprocess import Process, Manager, Lock
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce


def predict(i: int, users: List[int]):
    """Worker: compute and store the recommendations of every user in ``users``.

    For each user, the movie ids recommended by the module-level ``svd`` model
    and the movie ids held out for that user in ``test`` are written through
    ``result_io.write_col_result``.

    Args:
        i: Worker number, used for the progress-bar position and log messages.
        users: Ids of the users handled by this worker.
    """
    print(f"[{i}]: start")

    progress = tqdm(users, position=i, desc=f" proc #{i}")
    for user_id in progress:
        recommendations = predict_ratings_for_movies(svd, user_id, train)
        held_out = test.loc[test["userId"] == user_id, "tmdbId"]

        predicted_movies = list(recommendations["tmdbId"])
        actual_movies = list(held_out)
        result_io.write_col_result(user_id, predicted_movies, actual_movies)

    print(f"[{i}]: finish")

# One chunk of pending users per worker process (6 workers).
users_split = split_into_chunks(users, 6)

procs = []
for i, chunk in enumerate(users_split):
    p = Process(target=predict, args=(i, chunk))
    p.start()
    procs.append(p)

# Block until every worker is done; results are written by the workers
# themselves through result_io, so nothing is collected here.
for p in procs:
    p.join()

print("Finished!")


[0]: start[1]: start

[3]: start[2]: start
[4]: start

[5]: start


HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=2763.0), HTML(value='')))

HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=2763.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=2763.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=2763.0), HTML(value='')))

HBox(children=(HTML(value=' proc #4'), FloatProgress(value=0.0, max=2763.0), HTML(value='')))

HBox(children=(HTML(value=' proc #5'), FloatProgress(value=0.0, max=2765.0), HTML(value='')))


[3]: finish





[0]: finish





[1]: finish





[4]: finish





[2]: finish





[5]: finish
Finished!
CPU times: user 21.3 s, sys: 12.7 s, total: 34 s
Wall time: 39min 11s




In [None]:
# Single-process version of the evaluation: builds, for every user in train,
# the list of recommended movie ids and the list of held-out movie ids.
predicted, actual = [], []

users = list(train["userId"].unique())
for user_id in tqdm(users):
    recommended_ids = predict_ratings_for_movies(svd, user_id, train)["tmdbId"]
    held_out_ids = test.loc[test["userId"] == user_id, "tmdbId"]

    predicted.append(list(recommended_ids))
    actual.append(list(held_out_ids))

In [25]:
# Load the stored results; presumably the per-user lists written by the
# prediction workers through result_io.write_col_result — TODO confirm.
predicted, actual = result_io.read_col_results()

In [26]:
import average_precision
import recmetrics.metrics

In [27]:
# Reload the stored results and score them with both metrics at k=10.
predicted, actual = result_io.read_col_results()
mark = recmetrics.metrics.mark(actual, predicted, k=10)
mapk = average_precision.mapk(actual, predicted, k=10)
mark, mapk
# Presumably the output of an earlier run, kept for comparison — TODO confirm or remove:
# (0.0027199076980248984, 0.0027199076980248984)

(0.0069481047528800605, 0.0069481047528800605)

In [None]:
# Scratch check: a negated Series.isin mask filters rows out by id.
df = pd.DataFrame({"ids": [1, 2, 3, 4], "values": list("abcd")})
to_remove = pd.Series([1, 2])

# Rows whose id is NOT listed in to_remove.
df.loc[~df["ids"].isin(to_remove)]

In [None]:
# TODO: Tal vez da 0,0 porque los tipos de ids de actual y predicted son int y string.