Hybrid Recommender System
1) User Based Recommendation
2) Item Based Recommendation 

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 20)

Task 1: Data Processing 

In [2]:
# reading datasets
movie = pd.read_csv("../input/movielens-20m-dataset/movie.csv")
rating = pd.read_csv("../input/movielens-20m-dataset/rating.csv")
df_ = movie.merge(rating, how="left", on="movieId")
df = df_.copy()
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [3]:
df.shape

(20000797, 6)

In [4]:
# number of unique titles in ratings 
comment_counts = pd.DataFrame(df["title"].value_counts())
comment_counts

Unnamed: 0,title
Pulp Fiction (1994),67310
Forrest Gump (1994),66172
"Shawshank Redemption, The (1994)",63366
"Silence of the Lambs, The (1991)",63299
Jurassic Park (1993),59715
...,...
Trilogy of Terror II (1996),1
"Enemy of the People, An (Ganashatru) (1989)",1
Mo (1983),1
Valley Of Flowers (2006),1


In [5]:
# movies rarely rated 
rare_movies = comment_counts[comment_counts["title"] <= 1000].index

In [6]:
# exclusion of movies rarely rated 
common_movies = df[~df["title"].isin(rare_movies)]
common_movies.shape
# check number of common movies
common_movies["title"].nunique()

3159

In [7]:
# creating pivot table consisting of so called common movies             
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
#user_movie_df.shape
user_movie_df.head(10)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,2.0
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# movies are in columns checked
user_movie_df.columns

Index([''burbs, The (1989)', '(500) Days of Summer (2009)',
       '*batteries not included (1987)', '...And Justice for All (1979)',
       '10 Things I Hate About You (1999)', '10,000 BC (2008)',
       '101 Dalmatians (1996)',
       '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
       '102 Dalmatians (2000)', '12 Angry Men (1957)',
       ...
       'Zero Dark Thirty (2012)', 'Zero Effect (1998)', 'Zodiac (2007)',
       'Zombieland (2009)', 'Zoolander (2001)', 'Zulu (1964)', '[REC] (2007)',
       'eXistenZ (1999)', 'xXx (2002)', '¡Three Amigos! (1986)'],
      dtype='object', name='title', length=3159)

Task 2: Determinig the movies that the random picked user watched

In [9]:
# picking up a random user for user based recommendation
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).values)

In [10]:
#selecting the movies the the random picked user watched 
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df.head()


title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28941.0,,,,,,,,,,,...,,,,,,,,,,


In [11]:
#moving them to a list 
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist() 
movies_watched
#verification
#user_movie_df.loc[user_movie_df.index == random_user, user_movie_df.columns == "Ace Ventura: Pet Detective (1994)"]
#len(movies_watched)

['Ace Ventura: Pet Detective (1994)',
 'Ace Ventura: When Nature Calls (1995)',
 'Aladdin (1992)',
 'American President, The (1995)',
 'Apollo 13 (1995)',
 'Babe (1995)',
 'Bullets Over Broadway (1994)',
 'Clueless (1995)',
 'Disclosure (1994)',
 'Forrest Gump (1994)',
 'Four Weddings and a Funeral (1994)',
 'Home Alone (1990)',
 'Jurassic Park (1993)',
 'Like Water for Chocolate (Como agua para chocolate) (1992)',
 'Little Women (1994)',
 "Mr. Holland's Opus (1995)",
 'Mrs. Doubtfire (1993)',
 'Much Ado About Nothing (1993)',
 "Muriel's Wedding (1994)",
 'Nine Months (1995)',
 'Operation Dumbo Drop (1995)',
 'Piano, The (1993)',
 'Postman, The (Postino, Il) (1994)',
 'Ready to Wear (Pret-A-Porter) (1994)',
 'Remains of the Day, The (1993)',
 'Sabrina (1995)',
 "Schindler's List (1993)",
 'Secret Garden, The (1993)',
 'Sense and Sensibility (1995)',
 'Shadowlands (1993)',
 'Silence of the Lambs, The (1991)',
 'Star Trek: Generations (1994)',
 'Stargate (1994)']

Task 3: Finding the other users who watched the same movies 

In [12]:
# selecting the movies that random user watched which also includes other users
movies_watched_df = user_movie_df[movies_watched]
movies_watched_df.head()

title,Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Aladdin (1992),"American President, The (1995)",Apollo 13 (1995),Babe (1995),Bullets Over Broadway (1994),Clueless (1995),Disclosure (1994),Forrest Gump (1994),...,Ready to Wear (Pret-A-Porter) (1994),"Remains of the Day, The (1993)",Sabrina (1995),Schindler's List (1993),"Secret Garden, The (1993)",Sense and Sensibility (1995),Shadowlands (1993),"Silence of the Lambs, The (1991)",Star Trek: Generations (1994),Stargate (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,3.5,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,5.0,5.0,5.0
4.0,,3.0,,,,,,,,4.0,...,,,,,3.0,,,,3.0,
5.0,,,5.0,5.0,5.0,,,,,,...,,3.0,,,5.0,3.0,,3.0,,4.0


In [13]:
# number of movies watched by users to find the similar pattern with random user
user_movie_count = movies_watched_df.T.notnull().sum()
user_movie_count = user_movie_count.reset_index()
#number of movies watched by users
user_movie_count.columns = ["userId", "movie_count"]
user_movie_count.head(10)
# excluding the user who watched less than 20 movies to get similar pattern with random user
# user_movie_count[user_movie_count["movie_count"] > 20].sort_values("movie_count", ascending=False)
# users who watched same amount of movies with random user
# user_movie_count[user_movie_count["movie_count"] == 33].count() # just 17

Unnamed: 0,userId,movie_count
0,1.0,1
1,2.0,2
2,3.0,4
3,4.0,6
4,5.0,11
5,6.0,3
6,7.0,9
7,8.0,13
8,9.0,1
9,10.0,3


In [14]:
# selecting the users who watched more than %60 of movies the the random user watched to get better results
perc = len(movies_watched) * 60 / 100
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
len(users_same_movies)

4139

Task 4: Determining the similar users to random user for recommendation

In [15]:
# creating dataframe consisting of movies watched by random user and other users who watched them
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies.index)],
                      random_user_df[movies_watched]])
final_df

title,Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Aladdin (1992),"American President, The (1995)",Apollo 13 (1995),Babe (1995),Bullets Over Broadway (1994),Clueless (1995),Disclosure (1994),Forrest Gump (1994),...,Ready to Wear (Pret-A-Porter) (1994),"Remains of the Day, The (1993)",Sabrina (1995),Schindler's List (1993),"Secret Garden, The (1993)",Sense and Sensibility (1995),Shadowlands (1993),"Silence of the Lambs, The (1991)",Star Trek: Generations (1994),Stargate (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90.0,3.5,,3.0,,3.5,4.0,,2.0,,5.0,...,,,,,,,,3.5,4.5,3.5
129.0,0.5,,1.5,,,2.5,,,,4.5,...,,,,,,,,,,
155.0,,,,,,,,,,,...,,,,,,,,,,
157.0,,,,,,,,,,5.0,...,,,,,,,,,,
159.0,,,,,,5.0,,,,,...,,,,5.0,5.0,5.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138278.0,,,,,,,,,,,...,,,,,,,,,,
138381.0,,,5.0,3.0,,4.0,,,,5.0,...,,,,5.0,,5.0,,,,
138414.0,2.0,,3.0,,,2.0,,,,,...,,,,,,,,,3.0,2.5
138482.0,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# finding correlations between users
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
corr_df

Unnamed: 0,user_id_1,user_id_2,corr
0,77717.0,86171.0,-1.0
1,119382.0,49391.0,-1.0
2,62598.0,52062.0,-1.0
3,43192.0,77141.0,-1.0
4,58945.0,64628.0,-1.0
...,...,...,...
335560,127258.0,118736.0,1.0
335561,51620.0,32664.0,1.0
335562,45308.0,16115.0,1.0
335563,23436.0,88218.0,1.0


In [17]:
# selecting users at least %65 correlated with random user
top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65)][
    ["user_id_2", "corr"]].reset_index(drop=True)
top_users = top_users.sort_values(by='corr', ascending=False)
top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
top_users

Unnamed: 0,userId,corr
10,34006.0,0.980581
9,107125.0,0.892885
8,11416.0,0.705431
7,87093.0,0.690941
6,69431.0,0.68853
5,137557.0,0.685879
4,134822.0,0.67683
3,41388.0,0.664024
2,51620.0,0.661762
1,75178.0,0.656075


In [18]:
# rating scores of similar users with random user
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
top_users_ratings.head()

Unnamed: 0,userId,corr,movieId,rating
0,34006.0,0.980581,7,3.0
1,34006.0,0.980581,150,0.5
2,34006.0,0.980581,173,2.0
3,34006.0,0.980581,225,1.0
4,34006.0,0.980581,494,2.0


Task 5: Calculation of weighted average recommendation score and recommend first 5 movies


In [19]:
# considering rating and correlation together: weighted average
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
# getting the movie IDs and weighted ratings
recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()
recommendation_df.head()

Unnamed: 0,movieId,weighted_rating
0,1,3.116352
1,2,1.675794
2,3,1.958276
3,6,2.935694
4,7,2.841561


In [20]:
# 5 movies to recommend (user-based)
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 4].sort_values("weighted_rating", ascending=False)
movies_to_be_recommend = movies_to_be_recommend.merge(movie[["movieId", "title"]])["title"]
movies_to_be_recommend.head(5)

0           Happy Gilmore (1996)
1               Labyrinth (1986)
2    Boondock Saints, The (2000)
3                  Snatch (2000)
4                 Frailty (2001)
Name: title, dtype: object

Task 6: Item based recommendation based on recently watched and highly graded movie by random picked user 

In [21]:
# getting the movie ID of recently watched and highly graded movie by random picked user 
movie_id = rating[(rating["userId"] == random_user) & (rating["rating"] ==  5.0)].\
sort_values(by = "timestamp", ascending = False)["movieId"][0:6].values[0]

In [22]:
# 5 movies to recommend (item-based)
movie_name = movie[movie["movieId"]== movie_id]["title"]
movie_name = user_movie_df[movie_name]
movies_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)
movies_from_item_based[1:6].index

Index([''burbs, The (1989)', '(500) Days of Summer (2009)',
       '*batteries not included (1987)', '...And Justice for All (1979)',
       '10 Things I Hate About You (1999)'],
      dtype='object', name='title')