## Evaluating the systems
Tomamos los usuarios que tengan x+y ratings de películas.
Removemos x películas de sus ratings y pedimos las recomendaciones a partir de las y películas restantes.\
Del total de recomendaciones nos quedamos con el TOP z, ordenando por cantidad de apariciones y por el promedio del cosine_similarity.\
Calculamos recall y precision, variando x, y, z.

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
import result_io

In [2]:
# Root folder of the dataset; the CSV paths in this notebook are built from it.
BASE_PATH = "dataset"
# low_memory=False parses the file in a single pass so column dtypes are inferred consistently.
movies = pd.read_csv(BASE_PATH + "/movies_metadata.csv", low_memory=False)

In [3]:
# Drop duplicated movies: some ids (e.g. 69234) appear twice in the metadata.
# drop=True discards the pre-deduplication row labels instead of storing them
# as an extra "index" column, so `movies` keeps only its real metadata columns
# and gets a clean 0..n-1 index.
len_before = len(movies)
movies = movies.drop_duplicates(subset=["id"]).reset_index(drop=True)
print(f"before: {len_before}, after: {len(movies)}, diff: {len_before - len(movies)}")

before: 45466, after: 45436, diff: 30


In [4]:
# Lookup table: movie id (as stored in movies['id']) -> row label of `movies`.
# Later cells use the looked-up value to address rows of the similarity matrix.
real_shit_indices = pd.Series(
    data=movies.index,
    index=movies['id'],
).drop_duplicates()

In [5]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Movies without an overview get an empty string so the vectorizer can process them
movies['overview'] = movies['overview'].fillna('')

# TF-IDF vectorizer that ignores English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Fit on every overview and build the TF-IDF matrix (one row per movie)
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# Display the shape of the resulting matrix
tfidf_matrix.shape

(45436, 75827)

In [111]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows; with the second argument
# omitted, linear_kernel compares the matrix against itself. TfidfVectorizer
# L2-normalises rows by default, so the dot product is the cosine similarity.
# NOTE: the result is a dense (n_movies x n_movies) array, which is very
# memory hungry for the ~45k movies loaded above.
cosine_sim = linear_kernel(tfidf_matrix)

In [6]:
def get_recommendations_and_similarities(data, movie_id, cosine_sim, top_n=10, indices=None):
    """Return the `top_n` movies most similar to `movie_id`.

    Args:
        data: movies DataFrame; rows are addressed positionally and must
            provide the 'title' and 'id' columns.
        movie_id: id of the query movie, as used in the index of `indices`.
        cosine_sim: square similarity matrix whose row/column i refers to
            row i of `data`.
        top_n: number of recommendations to return (default 10).
        indices: Series mapping movie id -> row position. Defaults to the
            notebook-level `real_shit_indices`.

    Returns:
        A list of dicts with keys 'index', 'title', 'similarity' and 'tmdbId',
        ordered by decreasing similarity (ties keep ascending row order).
        'index' and 'similarity' are plain Python numbers so the result can be
        dumped to JSON.

    Raises:
        KeyError: if `movie_id` is not present in `indices`.
    """
    if indices is None:
        indices = real_shit_indices

    # Get the movie row position from its id
    idx = indices[movie_id]

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable sort on the negated scores: descending similarity, ties in row order
    ranking = np.argsort(-scores, kind="stable")

    # Remove the query movie by position rather than assuming it is ranked
    # first: with an all-zero row (empty overview) or a tied duplicate overview
    # it is not, and slicing [1:11] would recommend the movie to itself.
    ranking = ranking[ranking != idx][:top_n]

    rows = []
    for position in ranking:
        position = int(position)
        movie = data.iloc[position]
        rows.append({
            'index': position,
            'title': movie['title'],
            'similarity': float(scores[position]),
            'tmdbId': movie['id'],
        })

    return rows

In [5]:
# Load the user ratings; movieId is read as a string so it can be joined
# with the (string) movieId column of links.csv.
user_ratings = pd.read_csv(
    f"{BASE_PATH}/ratings.csv",
    dtype={
        'userId': int,
        'movieId': str,
        'rating': float,
        'timestamp': int,
    },
)

In [6]:
# Peek at the first two ratings
user_ratings.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [7]:
# Load the id mapping between the ratings' movieId and the IMDB / TMDB ids.
# Every id is kept as a string.
id_links = pd.read_csv(
    f"{BASE_PATH}/links.csv",
    dtype={
        'movieId': str,
        'imdbId': str,
        'tmdbId': str,
    },
)

In [8]:
# Peek at the first two id mappings
id_links.head(n=2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844


In [9]:
# Attach imdbId / tmdbId to every rating; the left join keeps ratings whose
# movieId has no entry in links.csv.
# NOTE: this rebinds `user_ratings`, so running the cell twice merges the id
# columns in a second time.
user_ratings = user_ratings.merge(id_links, how='left', on='movieId')

In [10]:
# Check that the id columns were attached
user_ratings.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197
1,1,147,4.5,1425942435,112461,10474


In [13]:
# Precompute the recommendations of every movie once, keyed by movie id
movie_ids = movies["id"]

recoms_by_movie = {
    movie_id: get_recommendations_and_similarities(movies, movie_id, cosine_sim)
    for movie_id in tqdm(movie_ids)
}

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=45436.0), HTML(value='')))




NameError: name 'cosine_sim' is not defined

In [146]:
import result_io
import json

def write_recoms_by_movie(recoms: dict, path: str = "results/recoms-by-movie.json"):
    """Serialise the precomputed recommendations to a JSON file.

    Args:
        recoms: mapping to store; it must be JSON-serialisable.
        path: destination file. Defaults to the location this notebook has
            always written to.
    """
    import os  # local import: `os` is not imported at notebook level

    # Create the destination folder on first use instead of failing in open()
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    with open(path, 'w') as f:
        # Stream straight into the file rather than building the whole JSON
        # string in memory first
        json.dump(recoms, f)

#json_str = json.dumps(recoms_by_movie)
#write_recoms_by_movie(recoms_by_movie)
#recoms_by_movie = result_io.read_recoms_by_movie()

In [23]:
def timestep(start: float, name: str) -> float:
    """Print the seconds elapsed since `start`, labelled with `name`.

    Returns the current timestamp so it can be passed as `start` of the
    next measurement.
    """
    current = time.time()
    elapsed = current - start
    print(f"{name}: {elapsed}")
    return current

In [151]:
def get_cached_recommendations_and_similarities(movie_id):
    """Return the precomputed recommendations of `movie_id` as a DataFrame.

    The records are read from the notebook-level `recoms_by_movie` cache;
    a `KeyError` is raised when the movie has no cached entry.
    """
    return pd.DataFrame.from_records(recoms_by_movie[movie_id])

In [161]:
def _normalize_tmdb_id(raw_id):
    """Return `raw_id` as the canonical string key ('123'), or None if missing/non-numeric."""
    if pd.isna(raw_id):
        return None
    try:
        # Accepts '123', 123 and 123.0; '123.0' would not match any lookup key
        return str(int(float(raw_id)))
    except (TypeError, ValueError):
        return None


def get_user_recommendations(user, user_ratings_train=user_ratings, top_n=10):
    """Recommend movies to `user` based on the movies they have rated.

    WARNING: the ratings do not use the same id as movies_metadata. The links
    file relates the ratings' movieId to the TMDB and IMDB ids (the former
    appears to be the one used by movies_metadata), so the 'tmdbId' column is
    used for every lookup.

    Args:
        user: userId whose ratings drive the recommendation.
        user_ratings_train: ratings DataFrame with at least the 'userId',
            'tmdbId' and 'rating' columns.
        top_n: maximum number of recommendations to return (default 10).

    Returns:
        DataFrame with columns 'index', 'title', 'tmdbId' and
        'mean_similarity', sorted by decreasing 'mean_similarity' and excluding
        movies the user already rated. It is empty when none of the user's
        ratings can be matched to cached recommendations.
    """
    MAX_RATING = 5.0  # ratings are normalised by this value
    result_columns = ['index', 'title', 'tmdbId', 'mean_similarity']

    movies_and_ratings = user_ratings_train[user_ratings_train['userId'] == user][['tmdbId', 'rating']]

    weighted = []
    rated_keys = []
    for raw_id, rating in zip(movies_and_ratings['tmdbId'], movies_and_ratings['rating']):
        movie_key = _normalize_tmdb_id(raw_id)
        if movie_key is None:
            # Rating without a usable TMDB id (e.g. no entry in the links file)
            continue
        rated_keys.append(movie_key)

        try:
            recommendations = get_cached_recommendations_and_similarities(movie_key)
        except KeyError:
            # Rated movie without metadata: nothing cached for it, skip it
            continue
        if recommendations.empty:
            continue

        # Weight the similarity by the rating and normalise it (/ MAX_RATING)
        weighted.append(
            recommendations.assign(similarity=recommendations['similarity'] * rating / MAX_RATING)
        )

    if not weighted:
        return pd.DataFrame(columns=result_columns)

    # Single concat instead of growing a DataFrame inside the loop
    # TODO: maybe use something other than mean that rewards appearing more than once.
    out = (
        pd.concat(weighted)
        .groupby(['index', 'title', 'tmdbId'])
        .agg({'similarity': 'mean'})
        .rename(columns={'similarity': 'mean_similarity'})
        .reset_index()
        .sort_values(by='mean_similarity', ascending=False)
    )

    # Row positions (in the movies frame) of the movies the user already rated
    already_rated = real_shit_indices[real_shit_indices.index.isin(rated_keys)]

    return out[~out['index'].isin(already_rated)][0:top_n]

In [162]:
# Smoke test: recommendations for user 2 using the default ratings frame
get_user_recommendations(user=2)

Unnamed: 0,index,title,tmdbId,mean_similarity
10,1154,The Empire Strikes Back,1891,0.319849
16,2279,Star Trek: Insurrection,200,0.233646
145,30536,Star Trek Beyond,188927,0.212369
19,2658,Better Than Chocolate,18212,0.210309
144,30413,The Star Wars Holiday Special,74849,0.192172
43,6792,Father of the Bride,11846,0.188924
129,26538,Star Wars: The Force Awakens,140607,0.188655
4,324,Star Trek: Generations,193,0.183676
95,18223,Getting to Know You,125506,0.179393
118,23172,The Amazing Spider-Man 2,102382,0.177994


In [11]:
# Optional subsampling for quick experiments: set SUBSAMPLE_DIVISOR to e.g.
# 2**11 to keep only the first 1/2048 of the ratings. None (the default) uses
# all of them, which is what this cell previously ended up doing after
# overwriting its own subsample.
SUBSAMPLE_DIVISOR = None

if SUBSAMPLE_DIVISOR:
    user_ratings_small = user_ratings[:len(user_ratings) // SUBSAMPLE_DIVISOR]
else:
    user_ratings_small = user_ratings

len(user_ratings_small)

In [12]:
# 1. Drop the ratings of movies we have no metadata for:
# some movies such as "253768" are in ratings but not in movies.
rated_movies = user_ratings_small["tmdbId"]

# The membership mask is computed once and reused for both selections; the
# previous version ran a second isin() against the multi-million row Series.
has_metadata = rated_movies.isin(movies["id"])
rated_movies_with_metadata = rated_movies[has_metadata]
metadata_filtered_user_ratings = user_ratings_small[has_metadata]

print(f"total: {len(user_ratings_small)}, after filter: {len(metadata_filtered_user_ratings)}")

total: 26024289, after filter: 25981582


In [13]:
# 2. Drop the users that have fewer than 15 ratings
user_rating_count = metadata_filtered_user_ratings.groupby(["userId"]).count()
has_too_few = user_rating_count["movieId"] < 15
users_to_remove = user_rating_count[has_too_few].reset_index()["userId"]

is_kept = ~metadata_filtered_user_ratings["userId"].isin(users_to_remove)
filtered_user_ratings = metadata_filtered_user_ratings[is_kept]

print(f"total: {len(metadata_filtered_user_ratings)}, after filter: {len(filtered_user_ratings)}")
print(f"(diff = {len(metadata_filtered_user_ratings) - len(filtered_user_ratings)})")

total: 25981582, after filter: 25556154
(diff = 425428)


In [None]:
# 3. Hold out 10 ratings of every user for test
#
# Shuffle the userId column once with a fixed seed and keep the first 10 rows
# of each user: that is a uniform random sample per user, is reproducible, and
# avoids scanning the whole frame (plus a DataFrame concat) once per user.
# Unlike `.sample(n=10)`, a user with fewer than 10 ratings would contribute
# all of them instead of raising.
TEST_RATINGS_PER_USER = 10
TEST_SPLIT_SEED = 42

user_ids = filtered_user_ratings["userId"].unique()

shuffled_users = filtered_user_ratings["userId"].sample(frac=1, random_state=TEST_SPLIT_SEED)
test_index = shuffled_users.groupby(shuffled_users).head(TEST_RATINGS_PER_USER).index

test = filtered_user_ratings.loc[test_index].sort_index()
train = filtered_user_ratings.drop(test.index)

In [14]:
def split_into_chunks(elems: list, chunks: int):
    """Split `elems` into `chunks` consecutive lists of equal size.

    Args:
        elems: any sliceable sequence (list, tuple, range, ...). It is not
            modified.
        chunks: number of chunks to produce; must be at least 1.

    Returns:
        A list with exactly `chunks` lists. Each holds
        `len(elems) // chunks` elements, and the leftover elements are
        appended to the last one.

    Raises:
        ValueError: if `chunks` is smaller than 1.
    """
    if chunks < 1:
        raise ValueError(f"chunks must be >= 1, got {chunks}")

    chunk_size, rem = divmod(len(elems), chunks)

    # list(...) so every chunk is an independent list whatever the input type
    chunks_split = [
        list(elems[chunk_size * i:chunk_size * (i + 1)])
        for i in range(chunks)
    ]

    # Append the remainder to the last chunk
    if rem:
        chunks_split[-1].extend(elems[len(elems) - rem:])

    return chunks_split

In [15]:
%%time
# 3. Sacar 10 de cada uno para test

from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def sample_test_ratings(procnum: int, return_dict, users: List[int], seed=None):
    """Worker: sample 10 ratings of each user in `users` for the test set.

    Args:
        procnum: worker number; used for logging, for the progress bar
            position and as the key under which the result is stored.
        return_dict: shared mapping; the sampled DataFrame is stored in
            `return_dict[procnum]`.
        users: userIds handled by this worker.
        seed: optional base seed for reproducible sampling. Each worker uses
            `seed + procnum`; with None a fresh random state is created.
    """
    print(f"[{procnum}] start")

    # One membership filter for the whole chunk instead of one scan of the
    # full ratings frame per user
    chunk_ratings = filtered_user_ratings[filtered_user_ratings["userId"].isin(users)]
    by_user = chunk_ratings.groupby("userId")

    # Per-worker random state: the seed is offset by procnum so workers do not
    # draw the same sequence
    rng = np.random.RandomState(None if seed is None else seed + procnum)

    sampled = [
        user_rows.sample(n=10, random_state=rng)
        for _, user_rows in tqdm(by_user, total=by_user.ngroups, position=procnum, desc=f" proc #{procnum}")
    ]

    # Single concat instead of growing a DataFrame inside the loop
    if sampled:
        test = pd.concat(sampled)
    else:
        test = pd.DataFrame(columns=filtered_user_ratings.columns)

    return_dict[procnum] = test
    print(f"[{procnum}] finish")

user_ids = list(filtered_user_ratings["userId"].unique())
users_split = split_into_chunks(user_ids, 6)

# One worker process per chunk; results come back through the managed dict
manager = Manager()
return_dict = manager.dict()
procs = []
for i, chunk in enumerate(users_split):
    p = Process(target=sample_test_ratings, args=(i, return_dict, chunk))
    p.start()
    procs.append(p)

for p in procs:
    p.join()

# A worker that crashed or was interrupted leaves no entry behind; fail loudly
# instead of silently building an incomplete test set.
failed = [i for i, p in enumerate(procs) if p.exitcode != 0 or i not in return_dict]
if failed:
    raise RuntimeError(f"sample_test_ratings workers failed: {failed}")

# Merge the partial results in worker order so the outcome does not depend on
# which process finished first
test = pd.concat([return_dict[i] for i in range(len(procs))])

train = filtered_user_ratings.drop(test.index)

print("Finished!")

[0] start[1] start
[2] start

[3] start
[4] start
[5] start


HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #4'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #5'), FloatProgress(value=0.0, max=35413.0), HTML(value='')))









Process Process-5:
Process Process-6:
Process Process-7:
Process Process-4:
Process Process-3:
Process Process-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/anaconda3/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/anaconda3/lib/python3.9/site-

KeyboardInterrupt: 

In [34]:
# Persist the split so later sessions can reuse it without re-sampling
train.to_csv(path_or_buf="dfs/content-train.csv")
test.to_csv(path_or_buf="dfs/content-test.csv")

In [64]:
# Reload the persisted split. The first CSV column holds the original row
# index, and tmdbId is read as a string so it matches the movie ids.
train = pd.read_csv(
    "dfs/content-train.csv",
    dtype={"tmdbId": str},
    index_col=0,
)
test = pd.read_csv(
    "dfs/content-test.csv",
    dtype={"tmdbId": str},
    index_col=0,
)

  mask |= (ar1 == a)


In [65]:
#train[train["userId"] == 67848]
#|135804||203267|
#get_user_recommendations(1, train)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23431474 entries, 3 to 26024288
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
 4   imdbId     int64  
 5   tmdbId     object 
dtypes: float64(1), int64(4), object(1)
memory usage: 1.2+ GB


In [163]:
%%time
from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def predict(i: int, return_dict, users: List[int]):
    """Worker: collect predicted and actual movies for every user in `users`.

    Args:
        i: worker number; used for logging, for the progress bar position and
            as the key under which the result is stored.
        return_dict: shared mapping; `return_dict[i]` receives a dict with the
            aligned lists "pred" and "actual" (one entry per user, in the
            order of `users`).
        users: userIds handled by this worker.
    """
    print(f"[{i}]: start")

    # Group this worker's ratings once. Filtering the full `train` / `test`
    # frames for every single user means one full scan per user.
    train_by_user = train[train["userId"].isin(users)].groupby("userId")
    test_chunk = test[test["userId"].isin(users)]
    actual_by_user = {
        user_id: list(tmdb_ids)
        for user_id, tmdb_ids in test_chunk.groupby("userId")["tmdbId"]
    }

    actual = []
    predicted = []
    for user_id in tqdm(users, position=i, desc=f" proc #{i}"):
        try:
            user_train = train_by_user.get_group(user_id)
        except KeyError:
            # No training ratings for this user: nothing to recommend from
            predicted_movies = []
        else:
            # Only the user's own rows are passed on, so the per-user filter
            # inside get_user_recommendations works on a tiny frame
            predicted_movies = get_user_recommendations(user_id, user_train)["tmdbId"]

        predicted.append(list(predicted_movies))
        actual.append(actual_by_user.get(user_id, []))

    return_dict[i] = {
        "pred": predicted,
        "actual": actual,
    }

    print(f"[{i}]: finish")

users_split = split_into_chunks(list(train["userId"].unique()), 4)

# One worker process per chunk; results come back through the managed dict
manager = Manager()
return_dict = manager.dict()
procs = []
for i, chunk in enumerate(users_split):
    p = Process(target=predict, args=(i, return_dict, chunk))
    p.start()
    procs.append(p)

for p in procs:
    p.join()

# A worker that crashed or was interrupted leaves no entry behind; fail loudly
# instead of computing the metrics on a silently truncated set of users.
failed = [i for i, p in enumerate(procs) if p.exitcode != 0 or i not in return_dict]
if failed:
    raise RuntimeError(f"predict workers failed: {failed}")

predicted = []
actual = []

# Merge in worker order so the result does not depend on which process
# finished first
for i in range(len(procs)):
    return_value = return_dict[i]
    predicted.extend(return_value["pred"])
    actual.extend(return_value["actual"])
print("Finished!")

[0]: start
[1]: start
[2]: start
[3]: start


HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))

HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))


[3]: finish

[1]: finish

[2]: finish

[0]: finish
Finished!
CPU times: user 1min 53s, sys: 1min 2s, total: 2min 55s
Wall time: 2h 32min 37s


In [164]:
import result_io
# Hand the predicted and actual movie lists to the project's result_io module.
# NOTE(review): result_io is not shown here; presumably this stores both lists
# under the NAME_CONTENT label so other notebooks can compare models. Confirm.
result_io.write_results(result_io.NAME_CONTENT, predicted, actual)

TODO: se está rompiendo porque hay ratings para películas que no aparecen en movies. Filtrar user ratings para que no los tenga.
Si sacamos muchas, hay algo raro; si son pocas, debe ser un error del dataset.

In [166]:
import average_precision
import recmetrics.metrics

# Score the top-10 recommendations of every user against their held-out movies.
# NOTE(review): presumably `mark` is mean average recall@k and `mapk` is mean
# average precision@k; the recorded output shows identical values for both, so
# verify that the two implementations really measure different things.
mark = recmetrics.metrics.mark(actual, predicted, k=10)
mapk = average_precision.mapk(actual, predicted, k=10)
# Display both scores as the cell output
mark, mapk

(0.005652072984435974, 0.005652072984435974)