In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the raw user ratings and preview the first rows.
ratings = pd.read_csv("ml-latest/ratings.csv")
print(ratings.head())

# Load the movie table (ids, titles, genre flags) and preview it.
movies = pd.read_csv("movies_FINAL.csv")
print(movies.head())

# Dataset size: total rating rows, plus the distinct movies and users
# that appear in the ratings table.
n_ratings = ratings.shape[0]
n_movies = ratings["movieId"].unique().size
n_users = ratings["userId"].unique().size

   userId  movieId  rating   timestamp
0       1      307     3.5  1256677221
1       1      481     3.5  1256677456
2       1     1091     1.5  1256677471
3       1     1257     4.5  1256677460
4       1     1449     4.5  1256677264
   movieId                        title  year    rating  Western  Film-Noir  \
0        1                    Toy Story  1995  3.886649        0          0   
1        2                      Jumanji  1995  3.246583        0          0   
2        3             Grumpier Old Men  1995  3.173981        0          0   
3        4            Waiting to Exhale  1995  2.874540        0          0   
4        5  Father of the Bride Part II  1995  3.077291        0          0   

   IMAX  Musical  Sci-Fi  Adventure  ...  Action  War  Mystery  Animation  \
0     0        0       0          1  ...       0    0        0          1   
1     0        0       0          1  ...       0    0        0          0   
2     0        0       0          0  ...       0    0       

In [3]:
# Headline statistics of the ratings table, one per output line.
print(
    f"Number of ratings: {n_ratings}",
    f"Number of unique movieId's: {n_movies}",
    f"Number of unique users: {n_users}",
    f"Average ratings per user: {round(n_ratings/n_users, 2)}",
    f"Average ratings per movie: {round(n_ratings/n_movies, 2)}",
    sep="\n",
)

Number of ratings: 27753444
Number of unique movieId's: 53889
Number of unique users: 283228
Average ratings per user: 97.99
Average ratings per movie: 515.01


In [4]:
# Number of ratings submitted by each user (non-null movieId entries per userId).
user_freq = (
    ratings.groupby('userId')['movieId']
    .count()
    .rename('n_ratings')
    .reset_index()
)
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,16
1,2,15
2,3,11
3,4,736
4,5,72


In [5]:
# Mean rating per movie, indexed by movieId.
mean_rating = ratings.groupby('movieId')[['rating']].mean()

# A notebook cell only displays its final expression, so each lookup below is
# printed explicitly; as bare expressions mid-cell they were silently discarded.

# Lowest rated movie
lowest_rated = mean_rating['rating'].idxmin()
print(movies.loc[movies['movieId'] == lowest_rated])

# Highest rated movie
highest_rated = mean_rating['rating'].idxmax()
print(movies.loc[movies['movieId'] == highest_rated])

# Ratings received by the highest rated movie (shows how many people rated it)
print(ratings[ratings['movieId'] == highest_rated])

# Ratings received by the lowest rated movie (shows how many people rated it)
print(ratings[ratings['movieId'] == lowest_rated])

# The movies above have very few ratings, so a raw mean is unreliable.
# Collect the per-movie rating count and mean as inputs for a Bayesian average.
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
# Drop the outer 'rating' level so the columns are simply 'count' and 'mean'.
movie_stats.columns = movie_stats.columns.droplevel()

In [6]:
from scipy.sparse import csr_matrix
  
def create_matrix(df):
    """Build a sparse movie-by-user rating matrix from a ratings table.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns,
            one row per rating.

    Returns:
        A tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)`` where

        * ``X`` is a ``scipy.sparse.csr_matrix`` of shape
          ``(n_movies, n_users)``; ``X[m, u]`` holds the rating given by the
          user at column ``u`` to the movie at row ``m``. If the same
          (movie, user) pair occurs more than once, scipy sums the entries.
        * ``user_mapper`` / ``movie_mapper`` map an id to its column / row
          index (indices follow the sorted order of the ids).
        * ``user_inv_mapper`` / ``movie_inv_mapper`` map an index back to
          the id.
    """
    # A single sorted-unique pass per id column gives both the distinct ids
    # and, through return_inverse, the index of every row's id. This replaces
    # repeated np.unique calls and a per-row Python dictionary lookup.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # Map Ids to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # Rows are movies and columns are users, so each row is one movie's
    # rating vector across all users.
    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [7]:
# Build the sparse movie-by-user rating matrix together with the
# id -> index and index -> id lookup tables from the full ratings table.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(ratings)

In [20]:


def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the k movies whose rating vectors are closest to `movie_id`.

    Relies on the notebook-level lookup tables `movie_mapper`
    (movieId -> row index of X) and `movie_inv_mapper` (row index -> movieId).

    Args:
        movie_id: id of the query movie; must be a key of `movie_mapper`.
        X: movie-by-user rating matrix (one row per movie), sparse or dense.
        k: number of similar movies to return. Fewer are returned when X
            has fewer than k other rows.
        metric: distance metric passed to sklearn's NearestNeighbors.
        show_distance: when False (default) return a list of movie ids,
            nearest first. When True return a list of
            ``(movie_id, distance)`` pairs, nearest first.

    Returns:
        list of movie ids, or list of (movie id, distance) tuples.

    Raises:
        KeyError: if `movie_id` is not in `movie_mapper`.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    # Indexing a dense array yields a 1-D vector, but kneighbors expects a
    # 2-D (n_queries, n_features) input.
    if isinstance(movie_vec, np.ndarray):
        movie_vec = movie_vec.reshape(1, -1)

    # The query movie is itself part of the fitted data, so ask for one extra
    # neighbour, but never for more rows than exist (sklearn would raise).
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances: kneighbors returns a (distances, indices) tuple
    # in that mode, which is unpacked here instead of being treated as a
    # single array.
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: when another
    # movie ties at distance 0 the query is not guaranteed to come first.
    ranked = [
        (int(ind), float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return [(movie_inv_mapper[ind], dist) for ind, dist in ranked]
    return [movie_inv_mapper[ind] for ind, _ in ranked]

In [21]:
# Lookup table from movieId to its title.
movie_titles = {
    movie_key: title
    for movie_key, title in zip(movies['movieId'], movies['title'])
}

In [32]:
# Query movie for this demo.
movie_id = 145
# Ids of the 10 movies with the closest rating vectors (default cosine metric).
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Since you watched {movie_title}\n")
# Print the title of each recommended movie, nearest first.
for i in similar_ids:
    print(movie_titles[i])

Since you watched Bad Boys

Demolition Man
Bad Boys II
Desperado
Die Hard: With a Vengeance
Crow, The
Speed
Die Hard 2
Con Air
Last Action Hero
Mortal Kombat


In [10]:
# Query movie for this demo.
movie_id = 1
# Ids of the 10 movies with the closest rating vectors (default cosine metric).
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Since you watched {movie_title}\n")
# Print the title of each recommended movie, nearest first.
for i in similar_ids:
    print(movie_titles[i])

Since you watched Toy Story

Star Wars: Episode IV - A New Hope
Independence Day (a.k.a. ID4)
Toy Story 2
Back to the Future
Jurassic Park
Forrest Gump
Lion King, The
Mission: Impossible
Star Wars: Episode VI - Return of the Jedi
Aladdin


In [13]:
# Query movie for this demo.
movie_id = 10
# Ids of the 10 movies with the closest rating vectors (default cosine metric).
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Since you watched {movie_title}\n")
# Print the title of each recommended movie, nearest first.
for i in similar_ids:
    print(movie_titles[i])

Since you watched GoldenEye

True Lies
Die Hard: With a Vengeance
Batman
Batman Forever
Speed
Stargate
Cliffhanger
Fugitive, The
Waterworld
Clear and Present Danger


In [17]:
# Query movie for this demo.
movie_id = 47
# Ids of the 10 movies with the closest rating vectors (default cosine metric).
similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]
print(f"Since you watched {movie_title}\n")
# Print the title of each recommended movie, nearest first.
for i in similar_ids:
    print(movie_titles[i])

Since you watched Seven (a.k.a. Se7en)

Pulp Fiction
Silence of the Lambs, The
Usual Suspects, The
Twelve Monkeys (a.k.a. 12 Monkeys)
Terminator 2: Judgment Day
Shawshank Redemption, The
Forrest Gump
Braveheart
Léon: The Professional (a.k.a. The Professional) (Léon)
Reservoir Dogs
