In [1]:
# import dependencies
import pandas as pd

In [2]:
# read ratings csv
ratings = pd.read_csv("../data/ml-100k/ratings.csv")

In [3]:
# only select ratings of "5"
ratings = ratings[ratings.rating == 5.0]

In [4]:
# count 5 star ratings for each movie
ratings = ratings.groupby(["movieId"]).count()

In [5]:
# sort by most 5 star ratings received and limit to 20 movies
ratings = ratings.sort_values(by=['rating'], ascending=False).head(30)

In [6]:
# read the movies csv and extract the year from the title
movies = pd.read_csv("../data/ml-100k/movies.csv")
movies['year'] = movies['title'].str.extract('.*\((.*)\).*')
print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  year  
0  Adventure|Animation|Children|Comedy|Fantasy  1995  
1                   Adventure|Children|Fantasy  1995  
2                               Comedy|Romance  1995  
3                         Comedy|Drama|Romance  1995  
4                                       Comedy  1995  


In [7]:
links = pd.read_csv("../data/ml-100k/links.csv")

In [8]:
# merge the ratings and movies df
df = pd.merge(ratings, movies, on="movieId", how="left")
df

Unnamed: 0,movieId,userId,rating,timestamp,title,genres,year
0,318,153,153,153,"Shawshank Redemption, The (1994)",Crime|Drama,1994
1,296,123,123,123,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1994
2,356,116,116,116,Forrest Gump (1994),Comedy|Drama|Romance|War,1994
3,2571,109,109,109,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1999
4,260,104,104,104,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977
5,527,92,92,92,Schindler's List (1993),Drama|War,1993
6,593,92,92,92,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,1991
7,858,88,88,88,"Godfather, The (1972)",Crime|Drama,1972
8,2959,81,81,81,Fight Club (1999),Action|Crime|Drama|Thriller,1999
9,110,80,80,80,Braveheart (1995),Action|Drama|War,1995


In [9]:
df = pd.merge(df, links, on="movieId", how="left")
df

Unnamed: 0,movieId,userId,rating,timestamp,title,genres,year,imdbId,tmdbId
0,318,153,153,153,"Shawshank Redemption, The (1994)",Crime|Drama,1994,111161,278.0
1,296,123,123,123,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1994,110912,680.0
2,356,116,116,116,Forrest Gump (1994),Comedy|Drama|Romance|War,1994,109830,13.0
3,2571,109,109,109,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1999,133093,603.0
4,260,104,104,104,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977,76759,11.0
5,527,92,92,92,Schindler's List (1993),Drama|War,1993,108052,424.0
6,593,92,92,92,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,1991,102926,274.0
7,858,88,88,88,"Godfather, The (1972)",Crime|Drama,1972,68646,238.0
8,2959,81,81,81,Fight Club (1999),Action|Crime|Drama|Thriller,1999,137523,550.0
9,110,80,80,80,Braveheart (1995),Action|Drama|War,1995,112573,197.0


In [10]:
# drop unnecessary columns
df.drop(['userId', 'timestamp', 'genres', 'tmdbId'],axis=1, inplace=True)
print(df)

    movieId  rating                                              title  year  \
0       318     153                   Shawshank Redemption, The (1994)  1994   
1       296     123                                Pulp Fiction (1994)  1994   
2       356     116                                Forrest Gump (1994)  1994   
3      2571     109                                 Matrix, The (1999)  1999   
4       260     104          Star Wars: Episode IV - A New Hope (1977)  1977   
5       527      92                            Schindler's List (1993)  1993   
6       593      92                   Silence of the Lambs, The (1991)  1991   
7       858      88                              Godfather, The (1972)  1972   
8      2959      81                                  Fight Club (1999)  1999   
9       110      80                                  Braveheart (1995)  1995   
10     1196      80  Star Wars: Episode V - The Empire Strikes Back...  1980   
11       50      71                     

In [11]:
df['imdbId'] = 'tt0' + df['imdbId'].astype(str)


In [12]:
df

Unnamed: 0,movieId,rating,title,year,imdbId
0,318,153,"Shawshank Redemption, The (1994)",1994,tt0111161
1,296,123,Pulp Fiction (1994),1994,tt0110912
2,356,116,Forrest Gump (1994),1994,tt0109830
3,2571,109,"Matrix, The (1999)",1999,tt0133093
4,260,104,Star Wars: Episode IV - A New Hope (1977),1977,tt076759
5,527,92,Schindler's List (1993),1993,tt0108052
6,593,92,"Silence of the Lambs, The (1991)",1991,tt0102926
7,858,88,"Godfather, The (1972)",1972,tt068646
8,2959,81,Fight Club (1999),1999,tt0137523
9,110,80,Braveheart (1995),1995,tt0112573
