In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns

ratings = pd.read_csv("ratings.csv", encoding='latin-1', usecols=['userId', 'movieId', 'rating'])
#users = pd.read_csv("Usuario_0.csv", encoding='latin-1' ,usecols=['movieId', 'title', 'rating'])
movies = pd.read_csv("movies.csv", encoding='latin-1' ,usecols=['movieId', 'title', 'genres'])


In [5]:
movies.shape

(9742, 3)

In [6]:
movies.sample(5)


Unnamed: 0,movieId,title,genres
7339,78041,Killers (2010),Action|Comedy
7192,72554,Cell 211 (Celda 211) (2009),Action|Drama
3666,5048,Snow Dogs (2002),Adventure|Children|Comedy
5279,8712,My Favorite Wife (1940),Comedy|Romance
8734,127132,Zipper (2015),Drama|Thriller


In [7]:
def weighted_average_score(df, k=0.8):
    n_views = df.groupby('movieId', sort=False).movieId.count()
    ratings = df.groupby('movieId', sort=False).rating.mean()
    scores = ((1-k)*(n_views/n_views.max()) + 
              k*(ratings/ratings.max())).to_numpy().argsort()[::-1]
    df_deduped = df.groupby('movieId', sort=False).agg({'title':'first', 
                                                         'genres':'first', 
                                                         'rating':'mean'})
    return df_deduped.assign(views=n_views).iloc[scores]

In [8]:
df = movies.merge(ratings)
weighted_average_score(df).head(10)

Unnamed: 0_level_0,title,genres,rating,views
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164134,329
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251
2959,Fight Club (1999),Action|Crime|Drama|Thriller,4.272936,218
527,Schindler's List (1993),Drama|War,4.225,220
858,"Godfather, The (1972)",Crime|Drama,4.289062,192
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.21564,211


In [9]:
genre_popularity = (movies.genres.str.split('|')
                      .explode()
                      .value_counts()
                      .sort_values(ascending=False))
genre_popularity.head(10)

Drama        4361
Comedy       3756
Thriller     1894
Action       1828
Romance      1596
Adventure    1263
Crime        1199
Sci-Fi        980
Horror        978
Fantasy       779
Name: genres, dtype: int64

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [34]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(tfidf_matrix)

In [35]:
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])

print('Shape:', cosine_sim_df.shape)
cosine_sim_df.sample(10, axis=1).round(2)

Shape: (9742, 9742)


title,Dave (1993),"Squid and the Whale, The (2005)",Philomena (2013),Young Adult (2011),Baxter (1989),Motel Hell (1980),Dave Chappelle: Deep in the Heart of Texas (2017),Blow (2001),"Fast & Furious 6 (Fast and the Furious 6, The) (2013)","School of Flesh, The (Ãcole de la chair, L') (1998)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Toy Story (1995),0.03,0.04,0.04,0.04,0.00,0.03,0.08,0.00,0.00,0.00
Jumanji (1995),0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
Grumpier Old Men (1995),1.00,0.19,0.19,0.19,0.00,0.13,0.40,0.00,0.00,0.35
Waiting to Exhale (1995),0.59,0.51,0.51,0.51,0.06,0.07,0.24,0.08,0.00,0.58
Father of the Bride Part II (1995),0.40,0.46,0.46,0.46,0.00,0.31,1.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.04,0.05,0.05,0.05,0.00,0.03,0.11,0.00,0.02,0.00
No Game No Life: Zero (2017),0.07,0.08,0.08,0.08,0.00,0.05,0.17,0.00,0.00,0.00
Flint (2017),0.00,0.43,0.43,0.43,0.29,0.00,0.00,0.35,0.00,0.38
Bungo Stray Dogs: Dead Apple (2018),0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.05,0.00


In [36]:
def genre_recommendations(i, M, items, k=10):
    """
    Recommends movies based on a similarity dataframe

    Parameters
    ----------
    i : str
        Movie (index of the similarity dataframe)
    M : pd.DataFrame
        Similarity dataframe, symmetric, with movies as indices and columns
    items : pd.DataFrame
        Contains both the title and some other features used to define similarity
    k : int
        Amount of recommendations to return

    """
    ix = M.loc[:,i].to_numpy().argpartition(range(-1,-k,-1))
    closest = M.columns[ix[-1:-(k+2):-1]]
    closest = closest.drop(i, errors='ignore')
    return pd.DataFrame(closest).merge(items).head(k)

In [37]:
movies[movies.title.eq('Coco (2017)')]


Unnamed: 0,movieId,title,genres
9621,177765,Coco (2017),Adventure|Animation|Children


In [38]:
genre_recommendations('Coco (2017)', cosine_sim_df, movies[['title', 'genres']])


Unnamed: 0,title,genres
0,Ice Age: A Mammoth Christmas (2011),Adventure|Animation|Children
1,Return to Never Land (2002),Adventure|Animation|Children
2,Digimon: The Movie (2000),Adventure|Animation|Children
3,Karlson Returns (1970),Adventure|Animation|Children
4,Balto (1995),Adventure|Animation|Children
5,Adventures of Mowgli: The Kidnapping (1968),Adventure|Animation|Children
6,Dinosaur (2000),Adventure|Animation|Children
7,Brother Bear (2003),Adventure|Animation|Children
8,101 Dalmatians (One Hundred and One Dalmatians...,Adventure|Animation|Children
9,Pocahontas II: Journey to a New World (1998),Adventure|Animation|Children


In [27]:
#prueba
print(movies[movies.title.eq('Contact (1997)')])


      movieId           title        genres
1187     1584  Contact (1997)  Drama|Sci-Fi


In [28]:
#prueba
movies[movies.title.eq('Jungle Book, The (1967)')]


Unnamed: 0,movieId,title,genres
1543,2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical


In [29]:
#prueba
movies[movies.title.eq('Saving Private Ryan (1998)')]


Unnamed: 0,movieId,title,genres
1503,2028,Saving Private Ryan (1998),Action|Drama|War
