<a href="https://www.kaggle.com/omerparlak/item-based-and-user-based-movie-recommandation?scriptVersionId=88257581" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Business Problem

#### Movie recommendation for random users with item-based and user-based methods.

In [1]:
import numpy as np
import pandas as pd
pd.pandas.set_option('display.max_columns', 10)

In [2]:
def create_user_movie_df():
    movie = pd.read_csv("../input/movielens-20m-dataset/movie.csv")
    rating = pd.read_csv("../input/movielens-20m-dataset/rating.csv")
    df = movie.merge(rating, how="left", on="movieId")
    comment_counts = pd.DataFrame(df["title"].value_counts())
    rare_movies = comment_counts[comment_counts["title"] <= 1500].index
    common_movies = df[~df["title"].isin(rare_movies)]
    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
    return user_movie_df


def item_based_recommender(movie_name, user_movie_df):
    movie = user_movie_df[movie_name]
    return user_movie_df.corrwith(movie).sort_values(ascending=False).head(10)

# Data Preprocessing

In [3]:
user_movie_df = create_user_movie_df()
user_movie_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Things I Hate About You (1999),"10,000 BC (2008)",...,Zombieland (2009),Zoolander (2001),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,,,,,,...,,,,,
2.0,,,,,,...,,,,,
3.0,,,,,,...,,,,,
4.0,,,,,,...,,,,,
5.0,,,,,,...,,,,,


# Determining the Movies Watched by the Random User

In [4]:
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=17).values)
random_user

42504

In [5]:
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Things I Hate About You (1999),"10,000 BC (2008)",...,Zombieland (2009),Zoolander (2001),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
42504.0,,4.0,,,,...,4.0,,,,4.0


In [6]:
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
movies_watched

['(500) Days of Summer (2009)',
 '127 Hours (2010)',
 '2 Fast 2 Furious (Fast and the Furious 2, The) (2003)',
 '20,000 Leagues Under the Sea (1954)',
 '2001: A Space Odyssey (1968)',
 '21 (2008)',
 '40-Year-Old Virgin, The (2005)',
 '8 Mile (2002)',
 'Amazing Spider-Man, The (2012)',
 'American Beauty (1999)',
 'American History X (1998)',
 'American Pie (1999)',
 'Anchorman: The Legend of Ron Burgundy (2004)',
 'Apollo 13 (1995)',
 'Avatar (2009)',
 'Back to the Future (1985)',
 'Band of Brothers (2001)',
 'Bank Job, The (2008)',
 'Batman Begins (2005)',
 'Battlestar Galactica (2003)',
 'Big (1988)',
 'Blade Runner (1982)',
 'Blues Brothers, The (1980)',
 'Bourne Identity, The (2002)',
 'Bourne Supremacy, The (2004)',
 'Bourne Ultimatum, The (2007)',
 'Bowling for Columbine (2002)',
 'Braveheart (1995)',
 'Bruce Almighty (2003)',
 'Cabin in the Woods, The (2012)',
 'Casino Royale (2006)',
 "Charlie Wilson's War (2007)",
 'Cloud Atlas (2012)',
 'Con Air (1997)',
 'Contact (1997)',
 'C

In [7]:
len(movies_watched)

165

# Accessing Data and Ids of Other Users Watching the Same Movies

In [8]:
movies_watched_df = user_movie_df[movies_watched]
movies_watched_df

title,(500) Days of Summer (2009),127 Hours (2010),"2 Fast 2 Furious (Fast and the Furious 2, The) (2003)","20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),...,X-Men: First Class (2011),You've Got Mail (1998),Zack and Miri Make a Porno (2008),Zombieland (2009),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,,,,,3.5,...,,,,,
2.0,,,,,5.0,...,,,,,
3.0,,,,,5.0,...,,,,,
4.0,,,,,,...,,,,,
5.0,,,,,,...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
138489.0,,,,,,...,,,,,
138490.0,,,,,,...,,,,,
138491.0,,,,,,...,,,,,
138492.0,,,,,,...,,,,,


In [9]:
# Count of same movies
user_movie_count = movies_watched_df.T.notnull().sum()
user_movie_count

userId
1.0         21
2.0         13
3.0         34
4.0          5
5.0         14
            ..
138489.0     7
138490.0     8
138491.0     3
138492.0    10
138493.0    39
Length: 138493, dtype: int64

In [10]:
user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ["userId", "movie_count"]
user_movie_count

Unnamed: 0,userId,movie_count
0,1.0,21
1,2.0,13
2,3.0,34
3,4.0,5
4,5.0,14
...,...,...
138488,138489.0,7
138489,138490.0,8
138490,138491.0,3
138491,138492.0,10


In [11]:
# Users who watched 60% of movies watched by random_user

perc = len(movies_watched) * 60 / 100
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
users_same_movies

571          572.0
585          586.0
636          637.0
767          768.0
981          982.0
            ...   
138185    138186.0
138207    138208.0
138253    138254.0
138300    138301.0
138436    138437.0
Name: userId, Length: 1106, dtype: float64

# Identifying Users Most Similar to the User to be Recommended

In [12]:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies.index)], random_user_df[movies_watched]])
final_df.head()

title,(500) Days of Summer (2009),127 Hours (2010),"2 Fast 2 Furious (Fast and the Furious 2, The) (2003)","20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),...,X-Men: First Class (2011),You've Got Mail (1998),Zack and Miri Make a Porno (2008),Zombieland (2009),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
571.0,,,,,,...,,,,,
585.0,,,,,,...,,,,,
636.0,,,,,,...,,,,,
767.0,,,,,,...,,,,,
981.0,,,,,,...,,,,,


In [13]:
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()
corr_df.head()

Unnamed: 0,user_id_1,user_id_2,corr
0,56166.0,8151.0,-1.0
1,50359.0,83967.0,-1.0
2,33574.0,27052.0,-1.0
3,90077.0,82416.0,-1.0
4,16675.0,93668.0,-1.0


In [14]:
# Users with a correlation of over 65%
top_users = corr_df[(corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65)][["user_id_2", "corr"]].reset_index(drop=True)
top_users = top_users.sort_values(by='corr', ascending=False)
top_users.rename(columns={"user_id_2": "userId"}, inplace=True)
top_users

Unnamed: 0,userId,corr
9,56960.0,0.906329
8,26834.0,0.873793
7,11247.0,0.855209
6,101154.0,0.851122
5,44383.0,0.806144
4,119366.0,0.762672
3,29672.0,0.753399
2,128257.0,0.723339
1,80667.0,0.721995
0,72516.0,0.708241


# Calculating Weighted Average Recommendation Score and Keeping Top 5 Movies

In [15]:
rating = pd.read_csv("../input/movielens-20m-dataset/rating.csv")

In [16]:
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
top_users_ratings.head()

Unnamed: 0,userId,corr,movieId,rating
0,56960.0,0.906329,111,3.5
1,56960.0,0.906329,260,4.5
2,56960.0,0.906329,296,4.5
3,56960.0,0.906329,527,5.0
4,56960.0,0.906329,541,3.5


In [17]:
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})

Unnamed: 0_level_0,weighted_rating
movieId,Unnamed: 1_level_1
1,3.609974
2,3.013596
3,3.609974
6,2.887979
7,2.887979
...,...
61024,3.013596
68319,2.260197
72998,3.766995
77800,2.821502


In [18]:
recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()

In [19]:
# Let's get weighted_rating greater than 4
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 4].sort_values("weighted_rating",ascending=False)
movies_to_be_recommend

Unnamed: 0,movieId,weighted_rating
483,79132,4.531646
163,1201,4.531646
454,8368,4.368963
373,4896,4.368963
356,4262,4.368963
162,1199,4.255612
430,6377,4.255612
398,5893,4.255612
384,5325,4.255612
44,223,4.255612


In [20]:
movie = pd.read_csv("../input/movielens-20m-dataset/movie.csv")

In [21]:
movies_to_be_recommend_final = movies_to_be_recommend.merge(movie[["movieId", "title"]])
movies_to_be_recommend_final

Unnamed: 0,movieId,weighted_rating,title
0,79132,4.531646,Inception (2010)
1,1201,4.531646,"Good, the Bad and the Ugly, The (Buono, il bru..."
2,8368,4.368963,Harry Potter and the Prisoner of Azkaban (2004)
3,4896,4.368963,Harry Potter and the Sorcerer's Stone (a.k.a. ...
4,4262,4.368963,Scarface (1983)
5,1199,4.255612,Brazil (1985)
6,6377,4.255612,Finding Nemo (2003)
7,5893,4.255612,"Last Seduction, The (1994)"
8,5325,4.255612,Dogtown and Z-Boyz (2001)
9,223,4.255612,Clerks (1994)


In [22]:
# Extracting movies watched by random_user from recommended movies 
# (dropped Inception(2010))
movies_to_be_recommend_final = movies_to_be_recommend_final[~movies_to_be_recommend_final["title"].isin(movies_watched)]
movies_to_be_recommend_final.head()

Unnamed: 0,movieId,weighted_rating,title
1,1201,4.531646,"Good, the Bad and the Ugly, The (Buono, il bru..."
2,8368,4.368963,Harry Potter and the Prisoner of Azkaban (2004)
3,4896,4.368963,Harry Potter and the Sorcerer's Stone (a.k.a. ...
4,4262,4.368963,Scarface (1983)
5,1199,4.255612,Brazil (1985)


# Item Based Recommendation

In [23]:
movie = pd.read_csv("../input/movielens-20m-dataset/movie.csv")
rating = pd.read_csv("../input/movielens-20m-dataset/rating.csv")

In [24]:
df = movie.merge(rating, how="left", on="movieId")
comment_counts = pd.DataFrame(df["title"].value_counts())
rare_movies = comment_counts[comment_counts["title"] <= 1500].index
common_movies = df[~df["title"].isin(rare_movies)]
common_movies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [25]:
common_movies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [26]:
common_movies[(common_movies["userId"] == random_user) & (common_movies["rating"] == 5.0)]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2091001,316,Stargate (1994),Action|Adventure|Sci-Fi,42504.0,5.0,2012-07-26 21:47:33
2224471,329,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi,42504.0,5.0,2012-07-26 21:55:34
2478208,356,Forrest Gump (1994),Comedy|Drama|Romance|War,42504.0,5.0,2012-07-26 22:44:08
3495195,527,Schindler's List (1993),Drama|War,42504.0,5.0,2011-03-07 09:53:26
3886146,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,42504.0,5.0,2011-04-09 14:05:35
6416125,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,42504.0,5.0,2011-03-07 09:57:25
7009445,1265,Groundhog Day (1993),Comedy|Fantasy|Romance,42504.0,5.0,2011-03-07 10:07:13
7634649,1376,Star Trek IV: The Voyage Home (1986),Adventure|Comedy|Sci-Fi,42504.0,5.0,2012-07-26 22:15:32
8458683,1653,Gattaca (1997),Drama|Sci-Fi|Thriller,42504.0,5.0,2011-04-09 14:08:09
11032110,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,42504.0,5.0,2012-08-05 14:23:46


In [27]:
# Getting the last movie id that random_user gave 5 points
movie_id = common_movies[(common_movies["userId"] == random_user) & (common_movies["rating"] == 5.0)] \
.sort_values(by="timestamp", ascending=False)["movieId"][0:1].values[0]
movie_id

2571

In [28]:
movie_name = movie[movie["movieId"] == movie_id]["title"].values[0]
movie_name

'Matrix, The (1999)'

In [29]:
# movies_from_item_based[0:1].index = ['Matrix, The (1999)'] 
movies_from_item_based = item_based_recommender(common_movies[common_movies["movieId"] == movie_id]["title"].values[0], user_movie_df)
movies_from_item_based[1:6].index

Index(['Matrix Reloaded, The (2003)', 'Matrix Revolutions, The (2003)',
       'Animatrix, The (2003)', 'Blade (1998)',
       'Terminator 2: Judgment Day (1991)'],
      dtype='object', name='title')