# Netflix-like recommendation systems with Sklearn

## Importing Libraries

In [16]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import tqdm
import statistics

The data are contained in the files movies.csv, ratings.csv and tags.csv.

In the movies.csv file:

movieId: ID of the movie/show (unique)
title: Title of the movie/show
genres: Genre of the show
In the ratings.csv file:

userId: ID of the user who gave a rating
movieId: ID of the movie/show rated
rating: Rating given to the show
timestamp: Time when the rating was specified
In the tags.csv file:

userId: ID of the user who gave a rating
movieId: ID of the movie/show rated
tag: Tags given to the show
timestamp: Time when the rating was specified   


From: https://www.kaggle.com/datasets/shubhammehta21/movie-lens-small-latest-dataset/data

In [17]:
movie_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/BxZuF3FrO7Bdw6McwsBaBw/movies.csv')
rating_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/R-bYYyyf7s3IUE5rsssmMw/ratings.csv')
tag_df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/UZKHhXSl7Ft7t9mfUFZJPQ/tags.csv')

In [18]:
movie_df.sample(5)

Unnamed: 0,movieId,title,genres
2536,3396,"Muppet Movie, The (1979)",Adventure|Children|Comedy|Musical
8991,139415,Irrational Man (2015),Crime|Drama
7117,70994,Halloween II (2009),Horror|Thriller
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
3638,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy


In [19]:
tag_df.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
1825,474,3566,business,1137205342
1521,474,1954,boxing,1137191671
376,62,168248,action,1528152295
351,62,135536,Margot Robbie,1525555073
230,62,87430,CGI,1525555168


In [20]:
rating_df.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
64608,414,52241,3.5,1188919798
621,6,158,4.0,845553660
33231,226,2085,3.0,1096420068
38299,263,1393,4.0,941591657
78203,484,63131,4.0,1342299442


Merge the 3 dataframes to create a single df that contains all the information

In [21]:
user_movie_df = movie_df.merge(rating_df, on = 'movieId', how = 'inner')
df = user_movie_df.merge(tag_df, on = ['movieId', 'userId'], how = 'inner')
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,1122227329,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,978575760,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,1525286001,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,magic board game,1528843932
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,1528934550,star wars,1528934552
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,anime,1537098582
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,comedy,1537098587
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,gintama,1537098603


Drop columns that are not needed for our analyzis

In [22]:
df.drop(columns=['timestamp_x', 'timestamp_y'], inplace=True)
df

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama


## Exploratory data analysis (EDA)

In [23]:
#3476 rows × 6 columns
df.shape

(3476, 6)

In [24]:
df.dtypes

Unnamed: 0,0
movieId,int64
title,object
genres,object
userId,int64
rating,float64
tag,object


In [25]:
df.isnull().any()

Unnamed: 0,0
movieId,False
title,False
genres,False
userId,False
rating,False
tag,False


## Building Popularity-based recommendation

Recommend movies based on what is popular on the website, the ones that got more reviews. Based on some metrics, such as number of views, ratings, or purchases and suggests these items for all users.

In [26]:
df_copy = df
df.sample(5)

Unnamed: 0,movieId,title,genres,userId,rating,tag
792,1207,To Kill a Mockingbird (1962),Drama,474,4.0,Harper Lee
331,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,599,5.0,out of order
423,410,Addams Family Values (1993),Children|Comedy|Fantasy,62,4.5,Christina Ricci
2940,74458,Shutter Island (2010),Drama|Mystery|Thriller,424,5.0,insanity
2636,48516,"Departed, The (2006)",Crime|Drama|Thriller,7,1.0,way too long


Number of votes and average rating for each movie

In [27]:
num_votes = df_copy.groupby('movieId').size().reset_index(name='numVotes')
df_copy = pd.merge(df_copy, num_votes, on='movieId')
df_copy

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4
...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars,2
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime,4
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy,4
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama,4


In [28]:
avg_ratings = df_copy.groupby('movieId')['rating'].mean().reset_index(name='avgRating')
df_copy = pd.merge(df_copy, avg_ratings, on='movieId')

In [29]:
df_copy.drop_duplicates(subset = ['movieId', 'title', 'avgRating', 'numVotes'], inplace = True)
df_copy

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.500000
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.500000
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.000000
...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,4.0,Comedy,3,4.000000
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,3.5,adventure,3,3.500000
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,4.0,Josh Brolin,3,4.000000
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,Emilia Clarke,2,4.000000


In [30]:
import statistics

def calculate_weighted_score(avgRating, num_votes, C, m):
  return (num_votes * avgRating + m * C) / (num_votes + m)

average_rating = statistics.mean(df_copy['avgRating'])
print("The average rating across all movies is: ", average_rating)

avg_num_votes = statistics.mean(df_copy['numVotes'])
print('The average number of votes is:', avg_num_votes)

df_copy['score'] = df_copy.apply(lambda row: calculate_weighted_score(row['avgRating'], row['numVotes'], average_rating, avg_num_votes), axis=1)

df_copy[['movieId', 'title', 'avgRating', 'numVotes', 'score']].head()

The average rating across all movies is:  3.7323364168313313
The average number of votes is: 2.3743169398907105


Unnamed: 0,movieId,title,avgRating,numVotes,score
0,1,Toy Story (1995),3.833333,3,3.788714
3,2,Jumanji (1995),3.75,4,3.743421
7,3,Grumpier Old Men (1995),2.5,2,3.168895
9,5,Father of the Bride Part II (1995),1.5,2,2.71168
11,7,Sabrina (1995),3.0,1,3.515304


In [31]:
df_copy

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333,3.788714
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000,3.743421
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.500000,3.168895
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.500000,2.711680
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.000000,3.515304
...,...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,4.0,Comedy,3,4.000000,3.881749
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,3.5,adventure,3,3.500000,3.602644
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,4.0,Josh Brolin,3,4.000000,3.881749
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,Emilia Clarke,2,4.000000,3.854716


#### Top 5 suggestions sorting by score in descending order

In [39]:
top5 = df_copy.sort_values(by='score', ascending=False)
top5.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,score
199,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,103,5.0,good dialogue,181,4.983425,4.967226
1337,2959,Fight Club (1999),Action|Crime|Drama|Thriller,424,4.5,dark comedy,54,4.944444,4.893394
604,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,474,4.0,Hal,41,4.95122,4.884498
998,1732,"Big Lebowski, The (1998)",Comedy|Crime,474,3.5,Coen Brothers,32,4.953125,4.868802
164,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,166,4.5,assassin,35,4.928571,4.852577


## Content-based Recommendation

Different from popularity recommendation, the content-based will recommend a movie based on features that closely match the user's profile. For example, movies that has similar, genre, cast, keywords.    
Is limited as it doesn't recommend items outside the user's known preferences, which can limit the dicovery of new types of items.    


Cosine similarity based on a number of features

In [40]:
df_content_rec = df_copy[['movieId', 'title', 'userId', 'avgRating', 'numVotes', 'score', 'genres', 'tag']].copy()
df_content_rec.reset_index(drop=True, inplace=True)
df_content_rec

Unnamed: 0,movieId,title,userId,avgRating,numVotes,score,genres,tag
0,1,Toy Story (1995),336,3.833333,3,3.788714,Adventure|Animation|Children|Comedy|Fantasy,pixar
1,2,Jumanji (1995),62,3.750000,4,3.743421,Adventure|Children|Fantasy,fantasy
2,3,Grumpier Old Men (1995),289,2.500000,2,3.168895,Comedy|Romance,moldy
3,5,Father of the Bride Part II (1995),474,1.500000,2,2.711680,Comedy,pregnancy
4,7,Sabrina (1995),474,3.000000,1,3.515304,Comedy|Romance,remake
...,...,...,...,...,...,...,...,...
1459,183611,Game Night (2018),62,4.000000,3,3.881749,Action|Comedy|Crime|Horror,Comedy
1460,184471,Tomb Raider (2018),62,3.500000,3,3.602644,Action|Adventure|Fantasy,adventure
1461,187593,Deadpool 2 (2018),62,4.000000,3,3.881749,Action|Comedy|Sci-Fi,Josh Brolin
1462,187595,Solo: A Star Wars Story (2018),62,4.000000,2,3.854716,Action|Adventure|Children|Sci-Fi,Emilia Clarke


In [41]:
df_content_rec['features'] = df_content_rec['genres'].str.replace('|', ' ') + ' ' + df_content_rec['tag'].fillna('')
df_content_rec

Unnamed: 0,movieId,title,userId,avgRating,numVotes,score,genres,tag,features
0,1,Toy Story (1995),336,3.833333,3,3.788714,Adventure|Animation|Children|Comedy|Fantasy,pixar,Adventure Animation Children Comedy Fantasy pixar
1,2,Jumanji (1995),62,3.750000,4,3.743421,Adventure|Children|Fantasy,fantasy,Adventure Children Fantasy fantasy
2,3,Grumpier Old Men (1995),289,2.500000,2,3.168895,Comedy|Romance,moldy,Comedy Romance moldy
3,5,Father of the Bride Part II (1995),474,1.500000,2,2.711680,Comedy,pregnancy,Comedy pregnancy
4,7,Sabrina (1995),474,3.000000,1,3.515304,Comedy|Romance,remake,Comedy Romance remake
...,...,...,...,...,...,...,...,...,...
1459,183611,Game Night (2018),62,4.000000,3,3.881749,Action|Comedy|Crime|Horror,Comedy,Action Comedy Crime Horror Comedy
1460,184471,Tomb Raider (2018),62,3.500000,3,3.602644,Action|Adventure|Fantasy,adventure,Action Adventure Fantasy adventure
1461,187593,Deadpool 2 (2018),62,4.000000,3,3.881749,Action|Comedy|Sci-Fi,Josh Brolin,Action Comedy Sci-Fi Josh Brolin
1462,187595,Solo: A Star Wars Story (2018),62,4.000000,2,3.854716,Action|Adventure|Children|Sci-Fi,Emilia Clarke,Action Adventure Children Sci-Fi Emilia Clarke


### Term Frequency-Inverse Document Frequency(TF-IDF) vectorizer    

Transform text into numerical representations, it will take into consideration the frequency of a word within a specific document and its rarity across all documents.

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(df_content_rec['features'])

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(X)

def recommendation(title, df, similarity, top_n=3):
  try:
    idx = df[df['title'] == title].index[0]
  except IndexError:
    print(f"Movie '{title}' not found in the dataset.")
    return

  sim_scores = list(enumerate(similarity[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  print(f"Movies similar to '{title}' (First movie is itself):")
  for i, (index, score) in enumerate(sim_scores[:top_n+1]):
    movie = df.iloc[index]
    print(f"{i}. {movie['title']} (Similarity Score: {score:.3f})")
    print(f"   Genres: {movie['genres']}")
    print(f"   Tag: {movie['tag']}\n")

recommendation("Toy Story (1995)", df_content_rec, similarity)

Movies similar to 'Toy Story (1995)' (First movie is itself):
0. Toy Story (1995) (Similarity Score: 1.000)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: pixar

1. Bug's Life, A (1998) (Similarity Score: 0.939)
   Genres: Adventure|Animation|Children|Comedy
   Tag: Pixar

2. Toy Story 2 (1999) (Similarity Score: 0.675)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: animation

3. Sintel (2010) (Similarity Score: 0.583)
   Genres: Animation|Fantasy
   Tag: adventure



##### Check the recommendations for the movie 'Toy Story 2 (1999)'

In [46]:
recommendation("Toy Story 2 (1999)", df_content_rec, similarity)

Movies similar to 'Toy Story 2 (1999)' (First movie is itself):
0. Toy Story 2 (1999) (Similarity Score: 1.000)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: animation

1. Croods, The (2013) (Similarity Score: 0.856)
   Genres: Adventure|Animation|Comedy
   Tag: animation

2. Sintel (2010) (Similarity Score: 0.853)
   Genres: Animation|Fantasy
   Tag: adventure

3. Invincible Iron Man, The (2007) (Similarity Score: 0.775)
   Genres: Animation
   Tag: animation



## Collaborative filtering   
2 types:
-   User-based Collaborative Filtering: Identifies users with similar preferences and recommends items that similar users have liked.
-   Item-based Collaborative Filtering: Items similar to those liked or rated in the past are recommended.

In [47]:
user_rating_matrix = rating_df.pivot(index='movieId', columns="userId", values="rating")
user_rating_matrix = user_rating_matrix.fillna(0)
user_rating_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


### NearestNeighbors Classifier based on the cosine similarity

In [49]:
from sklearn.neighbors import NearestNeighbors

rec = NearestNeighbors(metric='cosine')
rec.fit(user_rating_matrix)

In [52]:
def get_recommendations(title):
  movie = df_content_rec[df_content_rec['title'] == title]

  if movie.empty:
    print(f"Movie '{title}' not found is dataset.")
    return None

  movie_id = int(movie['movieId'])

  try:
    user_index = user_rating_matrix.index.get_loc(movie_id)
  except KeyError:
    print(f"Movie ID {movie_id} not found in the user rating matrix.")
    return None

  user_ratings = user_rating_matrix.iloc[user_index]
  reshaped_df = user_ratings.values.reshape(1,-1)

  distances, indices = rec.kneighbors(reshaped_df, n_neighbors=15)
  nearest_idx = user_rating_matrix.iloc[indices[0]].index[1:]

  nearest_neighbors = pd.DataFrame({'movieId': nearest_idx})
  result = pd.merge(nearest_neighbors, df_content_rec, on='movieId', how='left')

  return result[['title', 'avgRating', 'genres']].head()

get_recommendations('Toy Story (1995)')

  movie_id = int(movie['movieId'])


Unnamed: 0,title,avgRating,genres
0,Toy Story 2 (1999),3.125,Adventure|Animation|Children|Comedy|Fantasy
1,Jurassic Park (1993),4.5,Action|Adventure|Sci-Fi|Thriller
2,Independence Day (a.k.a. ID4) (1996),4.0,Action|Adventure|Sci-Fi|Thriller
3,Star Wars: Episode IV - A New Hope (1977),4.527778,Action|Adventure|Sci-Fi
4,Forrest Gump (1994),3.666667,Comedy|Drama|Romance|War


In [53]:
get_recommendations('Jurassic Park (1993)')

  movie_id = int(movie['movieId'])


Unnamed: 0,title,avgRating,genres
0,Terminator 2: Judgment Day (1991),2.625,Action|Sci-Fi
1,Forrest Gump (1994),3.666667,Comedy|Drama|Romance|War
2,Braveheart (1995),4.35,Action|Drama|War
3,"Fugitive, The (1993)",5.0,Thriller
4,Speed (1994),4.0,Action|Romance|Thriller
