### Importing libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 20)
movie = pd.read_csv('movie.csv')
rating = pd.read_csv('rating.csv')
df = movie.merge(rating, how="left", on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


### Data Understanding

In [2]:
df.shape

(20000797, 6)

In [3]:
df["title"].nunique()

27262

In [4]:
df["title"].value_counts().head()

Pulp Fiction (1994)                 67310
Forrest Gump (1994)                 66172
Shawshank Redemption, The (1994)    63366
Silence of the Lambs, The (1991)    63299
Jurassic Park (1993)                59715
Name: title, dtype: int64

In [5]:
comment_counts = pd.DataFrame(df["title"].value_counts())
rare_movies = comment_counts[comment_counts["title"] <= 1000].index
common_movies = df[~df["title"].isin(rare_movies)]

In [6]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")
user_movie_df.head(20)

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
6.0,,,,,,,,,,,...,,,,,,,,,,
7.0,,,,,,,,,,,...,,,,,,,,,,2.0
8.0,,,,,,,,,,,...,,,,,,,,,,
9.0,,,,,,,,,,,...,,,,,,,,,,
10.0,,,,,,,,,,,...,,,,,,,,,,


### Determining the Movies Watched by the User to Make a Suggestion

In [7]:
user = 108170
user_df = user_movie_df[user_movie_df.index == user]

movies_watched = user_df.columns[user_df.notna().any()].tolist()

len(movies_watched) #186

186

### Accessing Data and Ids of Other Users Watching the Same Movies

In [8]:
pd.set_option('display.max_columns', 5)


movies_watched_df = user_movie_df[movies_watched]
movies_watched_df.head()

title,2001: A Space Odyssey (1968),"Adventures of Priscilla, Queen of the Desert, The (1994)",...,Willow (1988),X2: X-Men United (2003)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,3.5,,...,4.0,4.0
2.0,5.0,,...,,
3.0,5.0,,...,,
4.0,,,...,,
5.0,,,...,,


In [9]:
movies_watched_df.shape #(138493, 186)

(138493, 186)

In [10]:
user_movie_count = movies_watched_df.T.notnull().sum()
user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ["userId", "movie_count"]

In [11]:
perc = len(movies_watched) * 60 / 100
users_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userId"]
users_same_movies.count() #2326

2326

### Identifying Users with the Most Similar Behaviors to the User to be Suggested

In [12]:
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(users_same_movies)],
                      user_df[movies_watched]])

final_df.shape #(2327, 186)

(2327, 186)

In [13]:
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()

top_users = corr_df[(corr_df["user_id_1"] == user) & (corr_df["corr"] >= 0.65)][
    ["user_id_2", "corr"]].reset_index(drop=True)

top_users = top_users.sort_values(by='corr', ascending=False)

top_users.rename(columns={"user_id_2": "userId"}, inplace=True)

top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], how='inner')
top_users_ratings = top_users_ratings[top_users_ratings["userId"] != user]

In [14]:
top_users_ratings

Unnamed: 0,userId,corr,movieId,rating
194,5155.0,0.716406,1,3.5
195,5155.0,0.716406,2,3.0
196,5155.0,0.716406,5,3.0
197,5155.0,0.716406,9,3.0
198,5155.0,0.716406,10,4.0
...,...,...,...,...
1561,121747.0,0.673295,27846,4.5
1562,121747.0,0.673295,30707,4.5
1563,121747.0,0.673295,30810,4.5
1564,121747.0,0.673295,31221,2.0


### Calculating the Weighted Average Recommendation Score

In [15]:
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']

recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})

recommendation_df = recommendation_df.reset_index()

movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3].sort_values("weighted_rating", ascending=False)

In [16]:
movies_to_be_recommend 

Unnamed: 0,movieId,weighted_rating
418,1997,3.582032
171,671,3.582032
396,1884,3.582032
291,1333,3.582032
1037,8641,3.582032
...,...,...
474,2288,3.029829
899,5782,3.029829
585,2951,3.029829
593,2997,3.029829


### Let's make 5 user-based suggestions based on the name of the movie that the user has watched most recently.

In [17]:
recommend_df = movies_to_be_recommend.merge(movie[["movieId", "title"]])["title"].iloc[0:5]
recommend_df

0                              Exorcist, The (1973)
1    Mystery Science Theater 3000: The Movie (1996)
2             Fear and Loathing in Las Vegas (1998)
3                                 Birds, The (1963)
4      Anchorman: The Legend of Ron Burgundy (2004)
Name: title, dtype: object

### Let's make 5 item-based suggestions based on the name of the movie that the user has watched most recently.

In [18]:
movie_id = rating[(rating["userId"] == user) & (rating["rating"] == 5.0)]. \
    sort_values(by="timestamp", ascending=False)["movieId"][0:1].values[0]

movie_name_5 = movie[movie["movieId"] == movie_id]["title"].values[0]
movie_name_5 =  user_movie_df[movie_name_5]

movies_from_item_based = user_movie_df.corrwith(movie_name_5).sort_values(ascending=False)

movies_from_item_based[1:6].index

Index(['My Science Project (1985)', 'Mediterraneo (1991)',
       'Old Man and the Sea, The (1958)',
       'National Lampoon's Senior Trip (1995)', 'Clockwatchers (1997)'],
      dtype='object', name='title')