## Evaluating the systems
Tomamos los usuarios que tengan x+y ratings de películas.
Removemos las x películas de sus ratings, y pedimos las recomendaciones para las y películas.\
Del total de recomendaciones nos quedamos con el TOP z, ordenando por aparición, y el promedio del cosine_similarity.\
Calculamos recall y precisión, variamos x, y, z.

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
import result_io
import math
import scipy.stats as stats
from scipy import spatial
from tqdm.notebook import tqdm
from multiprocess import Process, Manager
from typing import List

import time

In [2]:
BASE_PATH = "dataset"

# Movie metadata. low_memory=False parses the file in one pass so pandas does
# not infer mixed column types chunk by chunk.
movies = pd.read_csv(f"{BASE_PATH}/movies_metadata.csv", low_memory=False)

# Movie identifiers are read as strings in both files so they can be joined
# (and later compared) without numeric coercion.
ratings_dtypes = {'userId': int, 'movieId': str, 'rating': float, 'timestamp': int}
links_dtypes = {'movieId': str, 'tmdbId': str}
user_ratings = pd.read_csv(f"{BASE_PATH}/ratings.csv", dtype=ratings_dtypes)
id_links = pd.read_csv(f"{BASE_PATH}/links.csv", dtype=links_dtypes)

# Left join: every rating is kept and gains the link columns for its movieId.
user_ratings = user_ratings.merge(id_links, on='movieId', how='left')

In [3]:
# Drop duplicated movies; some ids (e.g. 69234) appear twice in the metadata.
len_before = len(movies)
# drop=True discards the old row labels instead of storing them in a new
# "index" column. Without it, re-running this cell keeps inserting index
# columns ("index", then "level_0") and finally raises on the name clash.
movies = movies.drop_duplicates(subset=["id"]).reset_index(drop=True)
print(f"before: {len_before}, after: {len(movies)}, diff: {len_before - len(movies)}")

before: 45466, after: 45436, diff: 30


In [30]:
def timestep(start: float, name: str) -> float:
    """Print the time elapsed since ``start`` and return the current time.

    Args:
        start: Reference timestamp, as returned by ``time.time()``.
        name: Label printed in front of the elapsed time.

    Returns:
        The timestamp taken by this call, so it can be passed as ``start``
        to the next call.
    """
    current = time.time()
    print(f"{name}: {current - start}")
    return current

In [6]:
def get_random_recommendations(n: int = 10):
    """Return ``n`` randomly sampled movies, used as the random baseline.

    Args:
        n: Number of movies to draw. Defaults to 10, the size that was
            previously hard-coded.

    Returns:
        A DataFrame holding the ``id`` and ``title`` columns of the rows
        sampled from the module-level ``movies`` frame.
    """
    sampled_movies = movies.sample(n=n)

    return sampled_movies[['id', 'title']]

In [7]:
# Smoke check: display one batch of random recommendations.
get_random_recommendations()

Unnamed: 0,id,title
27334,36634,The People Against O'Hara
22101,77403,Disco Godfather
5139,116904,Bar Girls
37709,337758,Honey Night
18384,60938,Time Without Pity
38856,382088,The Division: Agent Origins
31898,69599,Long Arm of the Law
15556,43605,Treasure Island
12132,13066,I Want Someone to Eat Cheese With
11551,13668,Catch and Release


In [22]:
# 1. Drop ratings of movies we have no metadata for.
# Some movies, e.g. "253768", appear in ratings but not in movies.
rated_movies = user_ratings["tmdbId"]
# Build the membership mask once and reuse it for both selections; testing the
# ratings a second time against the already-filtered ids selects the same rows.
has_metadata = rated_movies.isin(movies["id"])
rated_movies_with_metadata = rated_movies[has_metadata]
metadata_filtered_user_ratings = user_ratings[has_metadata]

print(f"total: {len(user_ratings)}, after filter: {len(metadata_filtered_user_ratings)}")

total: 26024289, after filter: 25981582


In [23]:
# 2. Drop users that have fewer than 15 ratings.
user_rating_count = metadata_filtered_user_ratings.groupby(["userId"]).count()
has_too_few_ratings = user_rating_count["movieId"] < 15
users_to_remove = user_rating_count[has_too_few_ratings].reset_index()["userId"]

# Keep only the ratings whose user is not in the removal list.
belongs_to_removed_user = metadata_filtered_user_ratings["userId"].isin(users_to_remove)
filtered_user_ratings = metadata_filtered_user_ratings[~belongs_to_removed_user]

print(f"total: {len(metadata_filtered_user_ratings)}, after filter: {len(filtered_user_ratings)}")
print(f"(diff = {len(metadata_filtered_user_ratings) - len(filtered_user_ratings)})")

total: 25981582, after filter: 25556154
(diff = 425428)


In [12]:
def split_into_chunks(elems: list, chunks: int):
    """Split ``elems`` into ``chunks`` contiguous lists of near-equal size.

    Order is preserved and concatenating the result gives back ``elems``.
    Chunk sizes differ by at most one: the first ``len(elems) % chunks``
    chunks receive one extra element each, so no single chunk (and therefore
    no single worker) ends up with the whole remainder.

    Args:
        elems: Sequence to split; any sliceable sequence is accepted.
        chunks: Number of chunks to produce; must be positive.

    Returns:
        A list with exactly ``chunks`` lists. Some of them are empty when
        ``len(elems) < chunks``.

    Raises:
        ValueError: If ``chunks`` is not positive.
    """
    if chunks <= 0:
        raise ValueError(f"chunks must be a positive integer, got {chunks}")

    chunk_size, rem = divmod(len(elems), chunks)

    chunks_split = []
    start = 0
    for i in range(chunks):
        # The first `rem` chunks take one leftover element each.
        end = start + chunk_size + (1 if i < rem else 0)
        chunks_split.append(list(elems[start:end]))
        start = end

    return chunks_split

In [25]:
%%time
# 3. Sacar 10 de cada uno para test


# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def sample_test_ratings(procnum: int, return_dict, users: List[int], n_samples: int = 10):
    """Sample ``n_samples`` random ratings for every user in ``users``.

    Written to run as a worker process: the sampled rows, which keep their
    index labels from ``filtered_user_ratings``, are stored in
    ``return_dict[procnum]`` rather than returned.

    Args:
        procnum: Worker number; used as the key in ``return_dict`` and as the
            progress-bar position.
        return_dict: Mapping that receives this worker's result.
        users: Ids of the users handled by this worker.
        n_samples: Ratings to sample per user. Defaults to 10, the value that
            was previously hard-coded.

    Raises:
        KeyError: If a user has no rows in ``filtered_user_ratings``.
        ValueError: If a user has fewer than ``n_samples`` ratings
            (raised by ``DataFrame.sample``).
    """
    print(f"[{procnum}] start")

    # Narrow the ratings to this worker's users and group them once, instead
    # of scanning the whole ratings frame again for every single user.
    own_ratings = filtered_user_ratings[filtered_user_ratings["userId"].isin(users)]
    ratings_by_user = own_ratings.groupby("userId")

    sampled = []
    for user_id in tqdm(users, position=procnum, desc=f" proc #{procnum}"):
        sampled.append(ratings_by_user.get_group(user_id).sample(n=n_samples))

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all previously collected rows on every iteration.
    if sampled:
        test = pd.concat(sampled)
    else:
        test = pd.DataFrame(columns=filtered_user_ratings.columns)

    return_dict[procnum] = test
    print(f"[{procnum}] finish")

user_ids = list(filtered_user_ratings["userId"].unique())
users_split = split_into_chunks(user_ids, 6)

# One worker process per chunk; each worker stores its sampled rows in the
# shared dict under its own process number.
procs = []
manager = Manager()
return_dict = manager.dict()
for i, chunk in enumerate(users_split):
    p = Process(target=sample_test_ratings, args=(i, return_dict, chunk))
    p.start()
    procs.append(p)

for p in procs:
    p.join()

# A worker that crashed never stored its result. Continuing would silently
# leave the test rows of its users inside the training set, so fail loudly.
missing_workers = [idx for idx in range(len(procs)) if idx not in return_dict]
if missing_workers:
    raise RuntimeError(f"workers {missing_workers} finished without a result")

# Concatenate in worker order so the row order of `test` does not depend on
# which process happened to finish first.
test = pd.concat([return_dict[idx] for idx in range(len(procs))])

# Everything that was not sampled for test stays in train.
train = filtered_user_ratings.drop(test.index)

print("Finished!")

[0] start[1] start
[2] start

[3] start
[4] start
[5] start


HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #4'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #5'), FloatProgress(value=0.0, max=35413.0), HTML(value='')))







[4] finish
[2] finish
[3] finish
[5] finish
[1] finish
[0] finish
Finished!
CPU times: user 4min 4s, sys: 2min 25s, total: 6min 29s
Wall time: 58min 10s


In [28]:
# Persist both halves of the split so they can be reloaded from disk.
for split_frame, csv_path in ((train, "dfs/random-train.csv"), (test, "dfs/random-test.csv")):
    split_frame.to_csv(csv_path)

In [9]:
# Reload the persisted split. The first CSV column is the saved index, and
# tmdbId is read as a string, as it was when the split was created.
split_read_options = {"index_col": 0, "dtype": {"tmdbId": str}}
train = pd.read_csv("dfs/random-train.csv", **split_read_options)
test = pd.read_csv("dfs/random-test.csv", **split_read_options)

In [None]:
# Remove users that were already processed
import result_io
processed_users = result_io.read_processed_users_for(result_io.NAME_RANDOM)

# De-duplicate first so the membership test runs once per user instead of
# once per rating row; the resulting ids and their order are unchanged.
train_user_ids = train["userId"].unique()
users = list(train_user_ids[~np.isin(train_user_ids, processed_users)])

print(f"Already processed {len(processed_users)}/{len(users) + len(processed_users)} users")

In [13]:
%%time
from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def predict(i: int, users: List[int]):
    """Write random-baseline predictions for every user in ``users``.

    Written to run as a worker process. For each user, a fresh batch of
    random movie ids is paired with that user's held-out ``tmdbId`` values
    from ``test`` and handed to ``result_io.write_results_new_format``.

    Args:
        i: Worker number; used in log lines and as the progress-bar position.
        users: Ids of the users handled by this worker.
    """
    print(f"[{i}]: start")

    # Index the held-out movies by user once, instead of scanning the whole
    # test frame again for every user.
    own_test = test[test["userId"].isin(users)]
    actual_by_user = {}
    for uid, tmdb_id in zip(own_test["userId"], own_test["tmdbId"]):
        actual_by_user.setdefault(uid, []).append(tmdb_id)

    for user_id in tqdm(users, position=i, desc=f" proc #{i}"):
        # list(DataFrame) yields the column labels ("id", "title"), not the
        # rows, so the movie ids must be taken from the "id" column.
        predicted_movies = list(get_random_recommendations()["id"])
        # Users without held-out rows get an empty list.
        actual_movies = actual_by_user.get(user_id, [])

        result_io.write_results_new_format(result_io.NAME_RANDOM, user_id, predicted_movies, actual_movies)

    print(f"[{i}]: finish")

users_split = split_into_chunks(users, 6)

# One worker per chunk: create them all, start them all, then wait for all.
procs = [
    Process(target=predict, args=(worker_idx, worker_users))
    for worker_idx, worker_users in enumerate(users_split)
]
for proc in procs:
    proc.start()

for proc in procs:
    proc.join()

print("Finished!")

[0]: start
[1]: start
[2]: start
[3]: start


HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))

HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=53117.0), HTML(value='')))





[2]: finish
[0]: finish




[3]: finish
[1]: finish




Finished!
CPU times: user 16.2 s, sys: 10.3 s, total: 26.5 s
Wall time: 5min 25s
