## Evaluating the systems
Tomamos los usuarios que tengan x+y ratings de peliculas.
Removemos las x peliculas de sus ratings, y pedimos las recomendaciones para las y peliculas.\
Del total de recomendaciones nos quedamos con el TOP z, ordenando por aparicion, y el promedio del cosine_similarity.\
Calculamos recall y precision, variamos x,y,z

In [4]:
import pandas as pd
import numpy as np

In [5]:
movies = pd.read_csv("../archive/movies_metadata.csv", low_memory=False)

In [6]:
real_shit_indices = pd.Series(movies.index, index=movies['id']).drop_duplicates()

In [48]:
real_shit_indices

id
862           0
8844          1
15602         2
31357         3
11862         4
          ...  
439050    45461
111109    45462
67758     45463
227506    45464
461257    45465
Length: 45466, dtype: int64

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies['overview'] = movies['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(45466, 75827)

In [9]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [31]:
def get_recommendations_and_similarities(data, movie_id, cosine_sim):
    # Get the movie index from dataframe
    idx = real_shit_indices[movie_id]
    
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    rows = []
    
    for index, similarity in sim_scores:
        title = data.iloc[index]['title']
        # TODO ¿ tener movieId o Index (id fila) ?
        row = {'index':index, 'title':title, 'similarity':similarity}
        rows.append(row)

    # Return the top 10 most similar movies
    return pd.DataFrame.from_records(rows)

In [22]:
user_ratings = pd.read_csv("../archive/ratings.csv", dtype={'userId': int, 'movieId': str, 'rating': float,'timestamp': int})

In [23]:
user_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435


In [24]:
id_links = pd.read_csv("../archive/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})

In [26]:
id_links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844


In [28]:
user_ratings = pd.merge(user_ratings, id_links, left_on='movieId', right_on='movieId', how='left')

In [29]:
user_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197
1,1,147,4.5,1425942435,112461,10474


In [79]:
def get_user_recommendations(user):
    # WARNING!
    #
    # La matriz de ratings no usa el mismo ID que la matriz movies_metadata
    # En el archivo links se establece una relación entre el movieID de ratings y los IDs de TMBD e IMBD (que el primero parece ser el de movies_metadata)
    movies_and_ratings = user_ratings[user_ratings['userId'] == user][['tmdbId','rating']]
    out = pd.DataFrame(columns=['index', 'title', 'similarity'])
    
    for _, info in movies_and_ratings.iterrows():
        # real shit indices se indexa con string, y estos son numeros i.e. '123', y movieId es un float, tonse '123.0' pincha
        movieID = str(int(info.loc['tmdbId']))
        rating = info.loc['rating']
        # ID = id_links[id_links["movieId"] == movieID]["tmdbId"].iloc[0]
        recommendations = get_recommendations_and_similarities(movies, movieID, cosine_sim)
        # Pesa la similaridad * rating, y la normaliza ( /5.0)
        recommendations['similarity'] = recommendations['similarity'] * rating / 5.0
        out = pd.concat([out,recommendations])
        #out = out.append(recommendations, ignore_index=True)
    
    out = out.groupby(['index','title'])
    #out['aux'] = None
    #out = out.agg({'similarity':'mean', 'aux':'size'}).rename(columns={'similarity':'mean_similarity','aux':'count'}).reset_index()
    out = out.agg({'similarity':'mean'}).rename(columns={'similarity':'mean_similarity'}).reset_index()
    
    #out.sort_values(by='count', ascending=False, inplace=True)
    out.sort_values(by='mean_similarity', ascending=False, inplace=True)
    to_remove = pd.merge(movies_and_ratings, real_shit_indices.to_frame(), left_on='tmdbId', right_on='id', how='left')
    # Dado que le mergeamos la serie, queda la columna referenciable con el int 0 que son la lista de index del dataframe de movies
    to_remove = to_remove[0].to_list()
    
    out = out[~out['index'].isin(to_remove)][0:10]
    
    
    return out

In [80]:
get_user_recommendations(1)

Unnamed: 0,index,title,mean_similarity
58,10554,Harry Potter and the Goblet of Fire,0.452404
246,44030,The Godfather Trilogy: 1972-1990,0.430101
22,3166,They Might Be Giants,0.383196
66,11927,Harry Potter and the Order of the Phoenix,0.358699
100,17437,Harry Potter and the Deathly Hallows: Part 2,0.348044
17,2301,Young Sherlock Holmes,0.296491
226,39647,Sherlock Holmes and the Leading Lady,0.284516
146,26251,The Hound of the Baskervilles,0.26389
105,18252,The Dark Knight Rises,0.261311
190,32901,Alice in Wonderland,0.25993
