## Content sopa

Tomamos los usuarios que tengan x+y ratings de películas.
Removemos las x películas de sus ratings, y pedimos las recomendaciones para las y películas.\
Del total de recomendaciones nos quedamos con el TOP z, ordenando por cantidad de apariciones y por el promedio del cosine_similarity.\
Calculamos recall y precision, variando x, y, z.

In [87]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import time
import result_io

In [2]:
# Directory holding the raw CSV files read by this notebook.
BASE_PATH = "dataset"
# low_memory=False parses the file in one pass so column dtypes are inferred
# from the whole column rather than chunk by chunk.
movies = pd.read_csv(f"{BASE_PATH}/movies_metadata.csv", low_memory=False)
credits = pd.read_csv(f'{BASE_PATH}/credits.csv')
keywords = pd.read_csv(f'{BASE_PATH}/keywords.csv')

In [3]:
# Remove rows with bad IDs.
# The rows are dropped by index label; presumably their `id` values are not
# numeric and would break the int cast done later - TODO confirm.
# NOTE: this cell is not idempotent - re-running it raises KeyError because
# the labels are no longer present in `movies`.
movies = movies.drop([19730, 29503, 35587])

In [4]:
# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies['id'] = movies['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
# Both merges use the pandas default (inner join), so a movie with no matching
# row in `credits` or `keywords` is dropped, and repeated ids on either side
# produce repeated rows (these are de-duplicated afterwards).
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

In [5]:
# Drop duplicated movies; some ids (e.g. 69234) appear twice.
len_before = len(movies)
# reset_index() without drop=True keeps the previous index as an 'index'
# column and gives `movies` a fresh 0..n-1 index; later cells use that
# positional index to address rows of the similarity matrix.
movies = movies.drop_duplicates(subset=["id"]).reset_index()
print(f"before: {len_before}, after: {len(movies)}, diff: {len_before - len(movies)}")

before: 46628, after: 45432, diff: 1196


In [6]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

# Each of these columns holds a Python literal serialized as text.
# literal_eval raises on non-string values (e.g. NaN), so this assumes every
# cell in these columns is a string - TODO confirm.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movies[feature] = movies[feature].apply(literal_eval)

In [7]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: iterable of crew entries; each entry is read through its 'job'
            and 'name' keys.

    Returns:
        The 'name' of the first entry with job 'Director', or ``np.nan`` when
        there is none.
    """
    # Lazy generator: entries after the first match are never inspected.
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

def get_list(x):
    """Return at most the first three 'name' values of a list of entries.

    Args:
        x: a list of entries, each read through its 'name' key; any other
            type is treated as missing/malformed data.

    Returns:
        A list with up to three names, or an empty list when ``x`` is not a
        list.
    """
    # Missing or malformed data: nothing to extract.
    if not isinstance(x, list):
        return []

    # Names are collected for every entry before truncating, so every entry
    # must provide a 'name' key even if it falls beyond the third position.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [8]:
# Define new director, cast, genres and keywords features that are in a suitable form.
# The director is taken from the parsed 'crew' list (NaN when none is found).
movies['director'] = movies['crew'].apply(get_director)

# Each of these columns is replaced, in place, by a list of at most three names.
features = ['cast', 'keywords', 'genres']
for feature in features:
    movies[feature] = movies[feature].apply(get_list)
# Print the new features of the first 3 films
movies[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [9]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Lower-case a name (or each name in a list) and remove its spaces.

    Args:
        x: a list of strings, a single string, or anything else (e.g. NaN
            for a missing director).

    Returns:
        A list of normalized strings for list input, a normalized string for
        string input, and ``''`` for any other input.
    """
    def _normalize(text):
        # "Tom Hanks" -> "tomhanks"
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return [_normalize(item) for item in x]
    if isinstance(x, str):
        return _normalize(x)
    # Neither list nor string (e.g. missing director): empty string.
    return ''

In [10]:
# Apply clean_data function to your features.
# Each column is overwritten in place with its lower-cased, space-free version.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movies[feature] = movies[feature].apply(clean_data)

In [11]:
def create_soup(x):
    """Build the space-separated "soup" string for one movie row.

    Args:
        x: mapping/row with 'keywords', 'cast' and 'genres' (lists of
            strings) and 'director' (a string).

    Returns:
        Keywords, cast, director and genres joined by single spaces, in that
        order.
    """
    sections = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(sections)
# Create a new soup feature

In [12]:
# Build the 'soup' text feature row by row (axis=1 passes each row to create_soup).
movies['soup'] = movies.apply(create_soup, axis=1)
movies[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [13]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Token counts per movie soup; English stop words are excluded from the vocabulary.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

In [14]:
# (number of movies, vocabulary size)
count_matrix.shape

(45432, 73880)

In [15]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this is an all-pairs movie x movie similarity matrix. With the
# default dense output and the ~45k movies reported above, it needs on the
# order of 16 GB of RAM as float64 - confirm the target machine can hold it.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [16]:
# Lookup from movie id (the 'id' column of `movies`) to the row position in
# `movies`, which is also the row/column position in `cosine_sim`.
real_shit_indices = pd.Series(movies.index, index=movies['id'])

In [17]:
def get_recommendations_and_similarities(data, movie_id, top_n=10):
    """Return the ``top_n`` movies most similar to ``movie_id``.

    Reads the module-level ``real_shit_indices`` (movie id -> row position)
    and ``cosine_sim`` (pairwise similarity matrix).

    Args:
        data: DataFrame whose row positions match ``cosine_sim`` and which has
            'title' and 'id' columns.
        movie_id: id of the query movie, as used to index ``real_shit_indices``.
        top_n: number of recommendations to return (default 10).

    Returns:
        A list of dicts with keys 'index', 'title', 'similarity' and 'tmdbId',
        ordered by decreasing similarity. The query movie is never included.

    Raises:
        KeyError: if ``movie_id`` is not present in ``real_shit_indices``.
    """
    # Get the movie index from dataframe
    idx = real_shit_indices[movie_id]

    # Pairwise similarity scores of every movie against the query movie
    scores = np.asarray(cosine_sim[idx])

    # Stable descending sort: ties keep their original (row) order.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query movie explicitly instead of assuming it sorts first:
    # when other movies tie with it (identical soups, or an all-zero row)
    # skipping position 0 could drop a different movie and recommend the
    # query movie to itself.
    ranked = ranked[ranked != idx][:top_n]

    rows = []
    for index in ranked:
        record = data.iloc[index]
        rows.append({
            # Native Python scalars keep the rows JSON-serializable.
            'index': int(index),
            'title': record['title'],
            'similarity': float(scores[index]),
            'tmdbId': int(record['id']),
        })

    return rows

In [36]:
# Load the ratings and the id-mapping table. Movie ids are read as strings so
# that the join key has the same dtype on both sides.
user_ratings = pd.read_csv(f"{BASE_PATH}/ratings.csv", dtype={'userId': int, 'movieId': str, 'rating': float,'timestamp': int})
id_links = pd.read_csv(f"{BASE_PATH}/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})
# Left join: every rating is kept; ratings without a row in links get NaN ids.
user_ratings = pd.merge(user_ratings, id_links, left_on='movieId', right_on='movieId', how='left')

In [44]:
# Discard ratings that have no tmdbId after the join with links.
user_ratings = user_ratings.dropna(subset=["tmdbId"])

In [52]:
# tmdbId was read as a string; cast it to int so it is comparable with movies['id'].
user_ratings = user_ratings.astype({"tmdbId": int})

In [102]:
def decorate_with_titles(df: pd.DataFrame):
    """Return ``df`` with a 'title' column looked up from ``movies``.

    Args:
        df: DataFrame with a 'tmdbId' column.

    Returns:
        A new DataFrame: ``df`` left-joined with the module-level ``movies``
        on ``tmdbId == id``, without the helper 'id' column. Rows with no
        matching movie get a missing title.
    """
    title_lookup = movies[["id", "title"]]
    with_titles = df.merge(title_lookup, left_on="tmdbId", right_on="id", how="left")
    return with_titles.drop(columns='id')

In [19]:
# Precompute the recommendations of every movie once, keyed by movie id, so
# later per-user lookups do not have to sort the similarity matrix again.
movie_ids = movies["id"]
recoms_by_movie = {}

for movie_id in tqdm(movie_ids):
    recoms_by_movie[movie_id] = get_recommendations_and_similarities(movies, movie_id)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=45432.0), HTML(value='')))




In [20]:
import result_io
import json

def write_recoms_by_movie(recoms: dict):
    """Serialize ``recoms`` as JSON into results/recoms-by-movie-sopa.json.

    Args:
        recoms: dictionary to dump; it must be JSON-serializable.

    The target file is opened in write mode, so existing content is replaced.
    """
    with open("results/recoms-by-movie-sopa.json", 'w') as out_file:
        out_file.write(json.dumps(recoms))

#json_str = json.dumps(recoms_by_movie)
#write_recoms_by_movie(recoms_by_movie)
#recoms_by_movie = result_io.read_recoms_by_movie()

In [None]:
def timestep(start: float, name: str) -> float:
    """Print the seconds elapsed since ``start`` and return the current time.

    Args:
        start: reference timestamp, as returned by ``time.time()``.
        name: label printed in front of the elapsed time.

    Returns:
        The current ``time.time()`` value, suitable as the next ``start``.
    """
    current = time.time()
    elapsed = current - start
    print(f"{name}: {elapsed}")
    return current

In [32]:
def get_cached_recommendations_and_similarities(movie_id):
    """Return the precomputed recommendations of a movie as a DataFrame.

    Args:
        movie_id: movie id (anything ``int()`` accepts); it is converted to
            int before looking it up in the module-level ``recoms_by_movie``.

    Returns:
        A DataFrame built from the cached list of recommendation records.

    Raises:
        KeyError: if the movie has no entry in ``recoms_by_movie``.
    """
    return pd.DataFrame.from_records(recoms_by_movie[int(movie_id)])

In [53]:
def get_user_recommendations(user, user_ratings_train=user_ratings, top_n=10):
    """Recommend movies to a user from the movies they already rated.

    For every movie rated by ``user`` the cached similar movies are fetched,
    each similarity is weighted by ``rating / 5.0`` (5.0 presumably being the
    maximum rating), and the weighted similarities are averaged per
    recommended movie. Movies the user already rated are removed.

    WARNING: the ratings table does not use the same id as movies_metadata.
    The links file relates the ratings movieId to the TMDB and IMDB ids (the
    former appears to be the one used by movies_metadata), hence 'tmdbId'.

    Args:
        user: value matched against the 'userId' column.
        user_ratings_train: DataFrame with 'userId', 'tmdbId' and 'rating'
            columns. The default is bound when the function is defined.
        top_n: maximum number of recommendations returned (default 10).

    Returns:
        DataFrame with columns 'index', 'title', 'tmdbId' and
        'mean_similarity', sorted by decreasing 'mean_similarity'. It is
        empty when the user has no ratings.

    Raises:
        KeyError: if a rated movie has no cached recommendations.
    """
    is_user = user_ratings_train['userId'] == user
    movies_and_ratings = user_ratings_train[is_user][['tmdbId', 'rating']]

    # Collect the weighted recommendations and concatenate once at the end;
    # growing a DataFrame with concat inside the loop is quadratic.
    weighted = []
    for tmdb_id, rating in zip(movies_and_ratings['tmdbId'], movies_and_ratings['rating']):
        # The id may arrive as a float (e.g. 123.0); normalize it to '123'.
        movieID = str(int(tmdb_id))

        recommendations = get_cached_recommendations_and_similarities(movieID)

        # Weight the similarity by the rating and normalize it (/ 5.0)
        recommendations['similarity'] = recommendations['similarity'] * rating / 5.0
        weighted.append(recommendations)

    if not weighted:
        # No ratings for this user: nothing to aggregate.
        return pd.DataFrame(columns=['index', 'title', 'tmdbId', 'mean_similarity'])

    # TODO: maybe use something other than the mean, rewarding movies that
    # are recommended more than once.
    out = (
        pd.concat(weighted)
        .groupby(['index', 'title', 'tmdbId'])
        .agg({'similarity': 'mean'})
        .rename(columns={'similarity': 'mean_similarity'})
        .reset_index()
        .sort_values(by='mean_similarity', ascending=False)
    )

    # Row positions (in `movies`) of the movies the user already rated. The
    # column is named explicitly instead of relying on the implicit name 0.
    rated = pd.merge(
        movies_and_ratings,
        real_shit_indices.to_frame(name='movie_index'),
        left_on='tmdbId',
        right_on='id',
        how='left',
    )
    to_remove = rated['movie_index'].to_list()

    return out[~out['index'].isin(to_remove)].head(top_n)

In [54]:
# Smoke test: recommendations for user 2 using the default ratings table.
get_user_recommendations(2)

Unnamed: 0,index,title,tmdbId,mean_similarity
26,2279,Star Trek: Insurrection,200,0.75
54,5805,Star Trek: Nemesis,201,0.63901
5,324,Star Trek: Generations,193,0.63901
12,1154,The Empire Strikes Back,1891,0.56
170,42175,The Big Sick,416477,0.5
127,23456,Mission: Impossible - Rogue Nation,177677,0.48
7,442,The Favor,50463,0.478091
15,1323,Star Trek VI: The Undiscovered Country,174,0.456435
16,1324,Star Trek V: The Final Frontier,172,0.416667
25,2262,A View to a Kill,707,0.4


In [88]:
# Alternative for quick experiments: keep only the first 1/64 of the ratings.
#user_ratings_small = user_ratings[:len(user_ratings) // 2**6]
# Full run: despite its name, `user_ratings_small` is the complete table here.
user_ratings_small = user_ratings
len(user_ratings_small)

26010786

In [89]:
# 1. Remove ratings of movies for which we have no metadata.
# Some movies (e.g. "253768") appear in ratings but not in movies.
rated_movies = user_ratings_small["tmdbId"]
rated_movies_with_metadata = rated_movies[rated_movies.isin(movies["id"])]
# Keeps the ratings whose tmdbId is among the ids that do have metadata.
metadata_filtered_user_ratings = user_ratings_small[user_ratings_small["tmdbId"].isin(rated_movies_with_metadata)]

print(f"total: {len(user_ratings_small)}, after filter: {len(metadata_filtered_user_ratings)}")

total: 26010786, after filter: 25981578


In [90]:
# 2. Remove users that have fewer than 15 ratings.
# count() gives the number of non-null values per column for each user; the
# 'movieId' column is used as the rating count.
user_rating_count = metadata_filtered_user_ratings.groupby(["userId"]).count()
users_to_remove = user_rating_count[user_rating_count["movieId"] < 15].reset_index()["userId"]
filtered_user_ratings = metadata_filtered_user_ratings[~metadata_filtered_user_ratings["userId"].isin(users_to_remove)]

print(f"total: {len(metadata_filtered_user_ratings)}, after filter: {len(filtered_user_ratings)}")
print(f"(diff = {len(metadata_filtered_user_ratings) - len(filtered_user_ratings)})")

total: 25981578, after filter: 25556150
(diff = 425428)


In [91]:
def split_into_chunks(elems: list, chunks: int):
    """Split ``elems`` into ``chunks`` consecutive lists.

    Every chunk gets ``len(elems) // chunks`` elements; the remainder is
    appended to the last chunk.

    Args:
        elems: list to split.
        chunks: number of chunks to produce.

    Returns:
        A list with ``chunks`` lists, preserving the original order.
    """
    size, leftover = divmod(len(elems), chunks)

    parts = []
    for position in range(chunks):
        parts.append(elems[size * position:size * (position + 1)])

    # The elements that did not fit evenly go to the last chunk.
    parts[chunks - 1].extend(elems[len(elems) - leftover:])

    return parts

In [92]:
%%time
# 3. Sacar 10 de cada uno para test

from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def sample_test_ratings(procnum: int, return_dict, users: List[int], n_samples: int = 10, random_state=None):
    """Sample held-out ratings for each user of a chunk (worker function).

    Reads the module-level ``filtered_user_ratings`` and stores the sampled
    rows in ``return_dict[procnum]``.

    Args:
        procnum: worker number; used as result key, progress-bar position and
            log prefix.
        return_dict: shared mapping that receives the resulting DataFrame.
        users: user ids handled by this worker.
        n_samples: ratings sampled per user (default 10).
        random_state: forwarded to ``DataFrame.sample``; pass a value to make
            the split reproducible (default None, i.e. unseeded).

    Raises:
        ValueError: if a user has fewer than ``n_samples`` ratings.
    """
    print(f"[{procnum}] start")

    # Filter the ratings of this chunk once and walk them grouped by user,
    # instead of scanning the whole ratings table once per user.
    chunk_ratings = filtered_user_ratings[filtered_user_ratings["userId"].isin(users)]
    grouped = chunk_ratings.groupby("userId", sort=False)

    samples = []
    for _, movies_of_user in tqdm(grouped, total=grouped.ngroups, position=procnum, desc=f" proc #{procnum}"):
        samples.append(movies_of_user.sample(n=n_samples, random_state=random_state))

    # Single concatenation at the end; concat inside the loop is quadratic.
    if samples:
        test = pd.concat(samples)
    else:
        test = pd.DataFrame(columns=filtered_user_ratings.columns)

    return_dict[procnum] = test
    print(f"[{procnum}] finish")

# Split the users into 6 chunks, one per worker process.
user_ids = list(filtered_user_ratings["userId"].unique())
users_split = split_into_chunks(user_ids, 6)

procs = []
# A Manager dict is shared between processes; each worker stores its result
# under its own process number.
manager = Manager()
return_dict = manager.dict()
for i, chunk in enumerate(users_split):
    p = Process(target=sample_test_ratings, args=(i, return_dict, chunk))
    p.start()
    procs.append(p)

# Wait for every worker to finish before reading the results.
for p in procs:
    p.join()

# Gather the per-worker samples into a single test set.
test = pd.DataFrame(columns=filtered_user_ratings.columns)
for return_value in return_dict.values():
    test = pd.concat([test, return_value])
    
# The sampled rows keep their original index labels, so dropping those labels
# from the full table leaves the training set.
train = filtered_user_ratings.drop(test.index)

print("Finished!")

[0] start
[1] start[2] start

[3] start
[4] start
[5] start


HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #4'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #5'), FloatProgress(value=0.0, max=35413.0), HTML(value='')))





[2] finish


[0] finish
[3] finish
[5] finish
[1] finish
[4] finish
Finished!
CPU times: user 2min 41s, sys: 1min 20s, total: 4min 1s
Wall time: 40min 34s


In [93]:
# Persist the split (the index is written too) so the slow sampling step does
# not have to be repeated.
train.to_csv("dfs/content-sopa-train.csv")
test.to_csv("dfs/content-sopa-test.csv")

In [94]:
# Reload the persisted split; column 0 holds the index written by to_csv.
train = pd.read_csv("dfs/content-sopa-train.csv", index_col=0, dtype={"tmdbId": int})
test = pd.read_csv("dfs/content-sopa-test.csv", index_col=0, dtype={"tmdbId": int})

  mask |= (ar1 == a)


In [95]:
# Remove users that were already processed
import result_io
processed_users = result_io.read_processed_users_for(result_io.NAME_CONTENT_SOPA)

# Keep only the train users that are not in `processed_users`, without repeats,
# so an interrupted run can be resumed.
users = train["userId"]
users = users[~np.isin(users, processed_users)]
users = list(users.unique())

print(f"Already processed {len(processed_users)}/{len(users) + len(processed_users)} users")

No processed users
Already processed 0/212468 users


In [96]:
%%time
from multiprocess import Process, Manager
from typing import List
# https://stackoverflow.com/questions/10415028/how-can-i-recover-the-return-value-of-a-function-passed-to-multiprocessing-proce

def predict(i: int, users: List[int]):
    """Compute and persist the recommendations of a chunk of users (worker).

    For every user the recommendations are computed from the module-level
    ``train`` ratings, paired with the user's held-out movies from the
    module-level ``test`` ratings, and written through
    ``result_io.write_results_new_format``.

    Args:
        i: worker number; used as progress-bar position and log prefix.
        users: user ids handled by this worker.
    """
    print(f"[{i}]: start")

    # Held-out movies per user, built once for the whole chunk instead of
    # scanning `test` again for every user.
    actual_by_user = (
        test[test["userId"].isin(users)]
        .groupby("userId")["tmdbId"]
        .apply(list)
        .to_dict()
    )

    for user_id in tqdm(users, position=i, desc=f" proc #{i}"):
        predicted_movies = list(get_user_recommendations(user_id, train)["tmdbId"])
        # Users without held-out ratings get an empty list.
        actual_movies = actual_by_user.get(user_id, [])

        # NOTE(review): several worker processes call this writer at once;
        # confirm that result_io tolerates concurrent writes.
        result_io.write_results_new_format(result_io.NAME_CONTENT_SOPA, user_id, predicted_movies, actual_movies)

    print(f"[{i}]: finish")

# Split the pending users into 6 chunks, one per worker process.
users_split = split_into_chunks(users, 6)

procs = []
for i, chunk in enumerate(users_split):
    p = Process(target=predict, args=(i, chunk))
    p.start()
    procs.append(p)

# Workers return nothing here; this only waits for all of them to finish.
for p in procs:
    p.join()

print("Finished!")

[1]: start[0]: start

[2]: start
[3]: start
[4]: start
[5]: start


HBox(children=(HTML(value=' proc #0'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #2'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #1'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #3'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #4'), FloatProgress(value=0.0, max=35411.0), HTML(value='')))

HBox(children=(HTML(value=' proc #5'), FloatProgress(value=0.0, max=35413.0), HTML(value='')))


[5]: finish

[2]: finish

[4]: finish

[3]: finish

[0]: finish

[1]: finish
Finished!
CPU times: user 2min 10s, sys: 1min 11s, total: 3min 22s
Wall time: 1h 55min 6s


In [97]:
import result_io

In [98]:
import average_precision
import recmetrics.metrics

# NOTE(review): read_results_new_format is called unqualified here, but in
# this notebook it is only defined in a later cell; on a fresh kernel
# (Restart & Run All) this line raises NameError unless that cell runs first.
predicted, actual = read_results_new_format(result_io.NAME_CONTENT_SOPA)

# Presumably mean average recall and mean average precision at k=10 - confirm
# against the documentation of both libraries.
# NOTE(review): the recorded output shows both metrics with exactly the same
# value, which is worth double-checking.
mark = recmetrics.metrics.mark(actual, predicted, k=10)
mapk = average_precision.mapk(actual, predicted, k=10)
mark, mapk

(0.011037699682730932, 0.011037699682730932)

In [None]:
from typing import List, Tuple
def read_results_new_format(name: str) -> Tuple[List[List[str]], List[List[str]]]:
    """Read the stored results of a recommender.

    The file is located with ``result_io.filename(name)`` and holds one line
    per user in the form ``user|predicted|actual``; the last two fields are
    decoded with ``result_io.parse_list``.

    Args:
        name: recommender name passed to ``result_io.filename``.

    Returns:
        A ``(predicted, actual)`` tuple of parallel lists, one entry per
        user. Both lists are empty when the file has no results.

    Raises:
        IndexError: if a non-blank line has fewer than three '|' fields.
    """
    with open(result_io.filename(name), 'r') as f:
        content = f.read()

    predicted = []
    actual = []

    for line in content.rstrip().split('\n'):
        # An empty file yields one empty "line"; skip blank lines instead of
        # failing on the missing fields.
        if not line.strip():
            continue

        # user|predicted|actual
        fields = line.split('|')
        predicted.append(result_io.parse_list(fields[1]))
        actual.append(result_io.parse_list(fields[2]))

    return predicted, actual

## Corriendo para nuestros users

In [141]:
# Ratings used for the manual tests further down; it is passed to
# get_user_recommendations, so it must provide 'userId', 'tmdbId' and 'rating'.
df_users = pd.read_csv("users.csv")

In [158]:
import dataframe_image as dfi

# User ids looked up in `df_users` for the manual tests below.
user_id_manu = 300001
user_id_elias = 300002
user_id_mati = 300003

In [162]:
# Recommendations for this user from the df_users ratings, exported as a PNG.
df = get_user_recommendations(user_id_elias, df_users)
dfi.export(df, "graficos/tests/content-sopa-elias.png")

[0704/164608.363852:INFO:headless_shell.cc(660)] Written to file /var/folders/ww/8n507dkn503d0xnc7_dlmgy1lj2mm0/T/tmp_vyqwkg0/temp.png.


In [163]:
# Recommendations for this user from the df_users ratings, exported as a PNG.
df = get_user_recommendations(user_id_manu, df_users)
dfi.export(df, "graficos/tests/content-sopa-manu.png")

[0704/164610.723870:INFO:headless_shell.cc(660)] Written to file /var/folders/ww/8n507dkn503d0xnc7_dlmgy1lj2mm0/T/tmp2jkws8x5/temp.png.


In [164]:
# Recommendations for this user from the df_users ratings, exported as a PNG.
df = get_user_recommendations(user_id_mati, df_users)
dfi.export(df, "graficos/tests/content-sopa-mati.png")

[0704/164611.958676:INFO:headless_shell.cc(660)] Written to file /var/folders/ww/8n507dkn503d0xnc7_dlmgy1lj2mm0/T/tmpvunp2rcn/temp.png.
