## Evaluating the systems
Tomamos los usuarios que tengan x+y ratings de peliculas.
Removemos las x peliculas de sus ratings, y pedimos las recomendaciones para las y peliculas.\
Del total de recomendaciones nos quedamos con el TOP z, ordenando por aparicion, y el promedio del cosine_similarity.\
Calculamos recall y precision, variamos x,y,z

In [2]:
import pandas as pd
import numpy as np

In [3]:
movies = pd.read_csv("../archive/movies_metadata.csv", low_memory=False)

In [4]:
real_shit_indices = pd.Series(movies.index, index=movies['id']).drop_duplicates()

In [5]:
real_shit_indices

id
862           0
8844          1
15602         2
31357         3
11862         4
          ...  
439050    45461
111109    45462
67758     45463
227506    45464
461257    45465
Length: 45466, dtype: int64

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movies['overview'] = movies['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(45466, 75827)

In [9]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [49]:
def get_recommendations_and_similarities(data, movie_id, cosine_sim):
    # Get the movie index from dataframe
    idx = real_shit_indices[movie_id]
    
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    rows = []
    
    for index, similarity in sim_scores:
        title = data.iloc[index]['title']
        # TODO ¿ tener movieId o Index (id fila) ?
        row = {'index':index, 'title':title, 'similarity':similarity}
        rows.append(row)

    # Return the top 10 most similar movies
    return pd.DataFrame.from_records(rows)

In [48]:
get_recommendations_and_similarities(movies,862,cosine_sim)

Unnamed: 0,index,title,similarity
0,27143,Good Sam,0.147815
1,1777,I Got the Hook Up,0.136935
2,22446,I Know That Voice,0.134279
3,23775,Blood in the Mobile,0.124231
4,27977,Party Girl,0.123438
5,44469,The Fruit Is Ripe,0.122564
6,19271,Hope Springs,0.120735
7,10674,Voices of a Distant Star,0.119158
8,39098,Alexander Hamilton,0.113965
9,25801,Implanted,0.107967


In [106]:
user_ratings = pd.read_csv("../archive/ratings.csv", dtype={'userId': int, 'movieId': str, 'rating': float,'timestamp': int})

In [95]:
id_links = pd.read_csv("../archive/links.csv", dtype={'movieId': str, 'imdbId': str, 'tmdbId': str})

In [96]:
id_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862


In [69]:
# usar df mas lindo bla
def condensamela(similarities):
    mapIndices = {}
    for sim in similarities:
        if sim[0] not in mapIndices:
            mapIndices[sim[0]] = (0,0)
        mapIndices[sim[0]] = (mapIndices[sim[0]][0]+1, mapIndices[sim[0]][1]+sim[1])
        
    return mapIndices

In [121]:
def get_user_recommendations(user):
    # WARNING!
    #
    # La matriz de ratings no usa el mismo ID que la matriz movies_metadata
    # En el archivo links se establece una relación entre el movieID de ratings y los IDs de TMBD e IMBD (que el primero parece ser el de movies_metadata)
    movies_and_ratings = user_ratings[user_ratings['userId']==user][['tmdbId','rating']]
    out = pd.DataFrame(columns=['index', 'title', 'similarity'])
    
    for _, info in movies_and_ratings.iterrows():
        # real shit indices se indexa con string, y estos son numeros i.e. '123', y movieId es un float, tonse '123.0' pincha
        movieID = str(int(info.loc['tmdbId']))
        rating = info.loc['rating']
        # ID = id_links[id_links["movieId"] == movieID]["tmdbId"].iloc[0]
        recommendations = get_recommendations_and_similarities(movies, movieID, cosine_sim)
        # Pesa la similaridad * rating, y la normaliza ( /5.0)
        recommendations['similarity'] = recommendations['similarity'] * rating / 5.0
        out = pd.concat([out,recommendations])
        #out = out.append(recommendations, ignore_index=True)
    
    return out
    # recommendations['appearances'] = ...
    recommendations.groupby(['index']).mean()
    
    # condensamela

In [101]:
id_links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862
1,2,113497,8844


In [108]:
user_ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110,1.0,1425941529,112573,197
1,1,147,4.5,1425942435,112461,10474
2,1,858,5.0,1425941523,68646,238


In [107]:
user_ratings = pd.merge(user_ratings, id_links, left_on='movieId', right_on='movieId', how='left')

In [92]:
movies.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [88]:
movie_ids = user_ratings['movieId'].drop_duplicates()
count = 0
for movie_id in movie_ids:
    if movies[movies['id']==movie_id].empty:
        count += 1
print(count)

45115


In [112]:
rec.sort_values(by='similarity')

Unnamed: 0,index,title,similarity
9,27428,One Step Ahead of My Shadow,0.011626
8,43571,Убить дракона,0.011712
7,4886,The Shipping News,0.011894
6,28584,Tom and Jerry: The Lost Dragon,0.013816
5,29229,Seven Dwarfs,0.015266
...,...,...,...
1,17437,Harry Potter and the Deathly Hallows: Part 2,0.428002
1,834,The Godfather,0.475954
0,1178,The Godfather: Part II,0.475954
0,44030,The Godfather Trilogy: 1972-1990,0.504059


In [119]:
pd.merge(user_ratings[user_ratings['userId']==1], movies[['id','title']], left_on='tmdbId', right_on='id',how='left').sort_values(by='rating', ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,id,title
26,1,112552,5.0,1425941336,2582802,244786,244786,Whiplash
10,1,4878,5.0,1425941434,246578,141,141,Donnie Darko
21,1,91542,5.0,1425942618,1515091,58574,58574,Sherlock Holmes: A Game of Shadows
19,1,81834,5.0,1425942133,926084,12444,12444,Harry Potter and the Deathly Hallows: Part 1
18,1,73017,5.0,1425942699,988045,10528,10528,Sherlock Holmes
17,1,69844,5.0,1425942139,417741,767,767,Harry Potter and the Half-Blood Prince
16,1,68358,5.0,1425941464,796366,13475,13475,Star Trek
15,1,59315,5.0,1425941502,371746,1726,1726,Iron Man
23,1,96821,5.0,1425941382,1659337,84892,84892,The Perks of Being a Wallflower
11,1,5577,5.0,1425941397,280760,9685,9685,Igby Goes Down


In [129]:
get_user_recommendations(1)

Unnamed: 0,index,title,similarity
0,43086,Shooters,0.032936
1,2107,The Skin Game,0.030836
2,1181,Henry V,0.028559
3,742,Stalingrad,0.028374
4,16116,Certified Copy,0.02831
...,...,...,...
5,5803,Drumline,0.146932
6,12323,God Save the King,0.145966
7,23885,R100,0.143064
8,2709,On the Ropes,0.142582


In [175]:
asd = get_user_recommendations(1)
asd['aux'] = None
asd = asd.groupby(['index','title'])

In [176]:
asd.agg({'similarity':'mean', 'aux':'size'}).rename(columns={'similarity':'mean_similarity','aux':'count'}).reset_index()

Unnamed: 0,index,title,mean_similarity,count
0,150,Batman Forever,0.176627,2
1,379,Bad Company,0.165888,1
2,585,Batman,0.158436,1
3,742,Stalingrad,0.028374,1
4,834,The Godfather,0.475954,1
...,...,...,...,...
245,43892,The In Crowd,0.171783,1
246,44030,The Godfather Trilogy: 1972-1990,0.430101,2
247,44350,Warning: This Drug May Kill You,0.145779,1
248,44980,Batman Beyond: The Movie,0.170928,1
