In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [43]:
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')

In [44]:
print(movies.shape)
movies.head()


(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
print(ratings.shape)
ratings.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Filtering users and movies to only include the first 200 

In [46]:
# include only the ratings where the user Id is less than or equal to 200
filtered_ratings=ratings[ratings['userId']<=200]
# # include only the first 200 unique movies Id
unique_movies_id=filtered_ratings['movieId'].unique()[:200]
# filter movies to inlude only the first 200 unique
filtered_movies=movies[movies['movieId'].isin(unique_movies_id)]

num_movies=len(filtered_movies)
print(num_movies)


200


In [47]:
# create a user movie matrix where each row represents a user and each column represents a movie
user_movie_matrix=filtered_ratings.pivot(index='userId',columns='movieId',values='rating').fillna(0)
user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
199,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# calculate the meman
ratings_mean=np.mean(user_movie_matrix.values,axis=1)
# subtract the mean to make all the numbers have the same range
user_movie_matrix_normalized=user_movie_matrix-ratings_mean.reshape(-1,1)

# Cosine similarity between movies


In [49]:
movie_similarity=cosine_similarity(user_movie_matrix_normalized.T)
# calculate the cosine similarity matrix 
movie_similarity_df=pd.DataFrame(movie_similarity,index=user_movie_matrix.columns,columns=user_movie_matrix.columns)
movie_similarity_df

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.303974,0.215887,-0.180968,0.211856,0.268571,0.202235,-0.052205,-0.012556,0.301707,...,-0.221655,-0.269973,-0.301167,-0.243752,-0.243752,-0.269973,-0.243752,-0.269973,-0.269973,-0.269973
2,0.303974,1.000000,0.207615,-0.053844,0.208813,0.200378,0.168735,-0.026375,-0.172464,0.283316,...,-0.190878,-0.231886,-0.258341,-0.209636,-0.209636,-0.231886,-0.209636,-0.231886,-0.231886,-0.231886
3,0.215887,0.207615,1.000000,0.061811,0.348455,0.274118,0.562782,0.320433,0.187679,0.169653,...,-0.102053,-0.123237,-0.136881,-0.111748,-0.111748,-0.123237,-0.111748,-0.123237,-0.123237,-0.123237
4,-0.180968,-0.053844,0.061811,1.000000,0.161000,0.004126,0.218147,0.288558,0.081098,0.000486,...,0.085503,0.108037,0.122701,0.095783,0.095783,0.108037,0.095783,0.108037,0.108037,0.108037
5,0.211856,0.208813,0.348455,0.161000,1.000000,0.183259,0.508052,0.326770,-0.080381,0.218866,...,-0.089787,-0.108270,-0.120169,-0.098247,-0.098247,-0.108270,-0.098247,-0.108270,-0.108270,-0.108270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,-0.269973,-0.231886,-0.123237,0.108037,-0.108270,-0.260729,-0.111419,0.114641,0.172837,-0.188850,...,0.994314,1.000000,0.997359,0.998280,0.998280,1.000000,0.998280,1.000000,1.000000,1.000000
193581,-0.243752,-0.209636,-0.111748,0.095783,-0.098247,-0.235566,-0.101084,0.101721,0.153977,-0.170733,...,0.998847,0.998280,0.991386,1.000000,1.000000,0.998280,1.000000,0.998280,0.998280,0.998280
193583,-0.269973,-0.231886,-0.123237,0.108037,-0.108270,-0.260729,-0.111419,0.114641,0.172837,-0.188850,...,0.994314,1.000000,0.997359,0.998280,0.998280,1.000000,0.998280,1.000000,1.000000,1.000000
193585,-0.269973,-0.231886,-0.123237,0.108037,-0.108270,-0.260729,-0.111419,0.114641,0.172837,-0.188850,...,0.994314,1.000000,0.997359,0.998280,0.998280,1.000000,0.998280,1.000000,1.000000,1.000000


In [50]:
# function to get the top movies similar to a given movie
def get_similar_movies(movie_id,top_similar_movies=10):
  similar_scores=movie_similarity_df[movie_id].sort_values(ascending=False)
  # returns the top similar movies excluding the movie itself
  return similar_scores.iloc[1:top_similar_movies+1]

# Toy story movie

In [51]:
toy_story_similar = get_similar_movies(1, 10)

# from all the movies the first 200 users have rated
toy_story_df = movies[movies['movieId'].isin(toy_story_similar.index)]
print("Top 10 movies similar to Toy Story:")
toy_story_df



Top 10 movies similar to Toy Story:


Unnamed: 0,movieId,title,genres
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
277,318,"Shawshank Redemption, The (1994)",Crime|Drama
325,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
546,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
615,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...


# First 200 movies only (Toy Story)

In [52]:
# from the first 200 movies only
filtered_toy_story_df = filtered_movies[filtered_movies['movieId'].isin(toy_story_similar.index)]
print("Top 10 movies similar to Toy Story (from the first 200 movies only):")
filtered_toy_story_df


Top 10 movies similar to Toy Story (from the first 200 movies only):


Unnamed: 0,movieId,title,genres
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
325,367,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
546,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
615,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller


# Waiting to exhale movie

In [53]:
waiting_to_exhale_similar=get_similar_movies(4,10)
print("Top 10 movies similar to 'Waiting to Exhale':")
# from the all the movies the first 200 users have rated
waiting_to_exhaled=pd.DataFrame(movies[movies['movieId'].isin(waiting_to_exhale_similar.index)])
waiting_to_exhaled


Top 10 movies similar to 'Waiting to Exhale':


Unnamed: 0,movieId,title,genres
100,113,Before and After (1996),Drama|Mystery
153,181,Mighty Morphin Power Rangers: The Movie (1995),Action|Children
232,270,Love Affair (1994),Drama|Romance
250,289,Only You (1994),Comedy|Romance
262,302,"Queen Margot (Reine Margot, La) (1994)",Drama|Romance
312,354,Cobb (1994),Drama
338,381,When a Man Loves a Woman (1994),Drama|Romance
387,445,Fatal Instinct (1993),Comedy
415,477,What's Love Got to Do with It? (1993),Drama|Musical
822,1082,"Candidate, The (1972)",Drama


# First 200 movies only (waiting to exhale)

In [54]:
# from the first 200 movies only
filtered_waiting_to_exhaled=pd.DataFrame(filtered_movies[filtered_movies['movieId'].isin(waiting_to_exhale_similar.index)])
filtered_waiting_to_exhaled

Unnamed: 0,movieId,title,genres


# Recommend 3 movies for user 200

In [55]:
user_id=200
# get the movies that the user has rated
user_ratings=user_movie_matrix.loc[user_id]
# get the movies that the user has not rated
user_unrated_movies=user_ratings[user_ratings==0].index

# dictionary to store the recommendation scores for each movie
recommendation_scores={}
# iterate over the movies that the user has not rated and calculate the recommendation score
for movie in user_unrated_movies:
  # get the similar movies to the current movie
  similar_movies=movie_similarity_df[movie].sort_values(ascending=False).iloc[1:]
  # ensure to only include movies that the user hasn't rated and are present in the user movie matrix
  common_movie_ids = similar_movies.index.intersection(user_movie_matrix.columns)
  similar_movies = similar_movies[common_movie_ids]
  # calculate the recommendation score
  score = sum(similar_movies * user_movie_matrix.loc[user_id, common_movie_ids].values)
  recommendation_scores[movie]=score
print(f"Recommendation scores: {recommendation_scores}")

# get the top 3 recommended movies for the user by sorting descendingly
recommendation_movies=sorted(recommendation_scores,key=recommendation_scores.get,reverse=True)[:3]
print(recommendation_movies)
recommended_movies_df=movies[movies['movieId'].isin(recommendation_movies)]
pd.DataFrame(recommended_movies_df)

print(f"Top 3 recommended movies for user {user_id}: ")
recommended_movies_df


Recommendation scores: {2: 165.474639173987, 3: 87.27167461504604, 4: -126.50126305191692, 6: 176.28804123419295, 7: 75.39779304969487, 8: -127.92706235099313, 9: -153.45310869066384, 11: 117.41370997194299, 12: -166.81569335491878, 13: -105.16758653851052, 14: -57.25127083234296, 15: -90.84475783138284, 16: 167.19266790412092, 17: 78.0126985887155, 18: 51.36447573063891, 20: -191.09204622755374, 21: 84.0321540788215, 22: 15.590396520812348, 23: -28.283701230467162, 24: -58.605374570116105, 25: 92.54910558367273, 26: 13.33668202115026, 27: -244.38463620281416, 28: -48.38615239985261, 29: 22.42668844828295, 31: 42.95638040379594, 32: 141.5949221336313, 36: 69.73025667623615, 38: -230.50230776746008, 41: -26.368456581062237, 43: -173.69645353723806, 44: 32.413462142777554, 45: 17.258914947845202, 46: -109.22477890792611, 48: 146.47614639721542, 50: 199.69512458085012, 52: -36.88767273957638, 53: -161.26417603338967, 54: -168.9459277125208, 55: -183.99630858714767, 57: -128.65768924655697

Unnamed: 0,movieId,title,genres
964,1265,Groundhog Day (1993),Comedy|Fantasy|Romance
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
3745,5218,Ice Age (2002),Adventure|Animation|Children|Comedy


# get the recommneded movie within the first 200 movies only

In [56]:
filtered_movies[filtered_movies['movieId'].isin(recommendation_movies)]

Unnamed: 0,movieId,title,genres
964,1265,Groundhog Day (1993),Comedy|Fantasy|Romance
