# Item based collaborative filtering



In [36]:
import pandas as pd
import numpy as np

## Load the ratings and movies data

The data comes from "The Movies Dataset" on Kaggle, available here: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset

In [37]:
# Read the ratings file and discard the two columns the analysis does not use:
# 'timestamp' and 'Unnamed: 0' (presumably an index column written out when
# the CSV was saved -- confirm against the file).
ratings_df = pd.read_csv('Data/updated_ratings.csv').drop(
    columns=['timestamp', 'Unnamed: 0']
)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,858,5.0
1,1,1246,5.0
2,1,1968,4.0
3,1,2762,4.5
4,1,2959,4.0


In [38]:
# Load the movie metadata, keep only the id/title pair, and rename 'id' to
# 'movieId' so the column name matches the one used in the ratings frame.
movies_df = (
    pd.read_csv("Data/merged.csv")
    .loc[:, ["id", "title"]]
    .rename(columns={"id": "movieId"})
)
movies_df.head()

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


## Filter users and movies with few ratings

This step was performed for two reasons:
* In order to have more robust results we needed to discard users and movies with very few ratings.
* There was not enough memory to run the method below on the full data, since it builds a very sparse matrix of shape (number_of_users, number_of_movies).

In [39]:
# Count ratings once per user and once per movie; each count series is used
# both to derive the cut-off (its 40th percentile) and to build the keep-list.
# NOTE: this cell reassigns `ratings_df`, so re-running it filters the
# already-filtered frame again.
user_rating_counts = ratings_df['userId'].value_counts()
users_limit = user_rating_counts.quantile(0.4)
# The comparison below is strict, so users with exactly `users_limit` ratings
# are discarded as well -- the message states that explicitly.
print(f"Filtering out users with {users_limit} or fewer ratings.")
filter_users = user_rating_counts[user_rating_counts > users_limit].index.tolist()

movie_rating_counts = ratings_df['movieId'].value_counts()
movies_limit = movie_rating_counts.quantile(0.4)
# Same strict comparison for movies.
print(f"Filtering out movies with {movies_limit} or fewer ratings.")
filter_movies = movie_rating_counts[movie_rating_counts > movies_limit].index.tolist()

print(f'Original shape: {ratings_df.shape}')
# Keep a rating only when both its movie and its user passed the thresholds.
ratings_df = ratings_df[
    ratings_df['movieId'].isin(filter_movies) & ratings_df['userId'].isin(filter_users)
]
# Disabled alternatives (kept for reference): keep only ratings >= 3.5, or
# binarise the ratings at the 3.5 threshold.
# ratings_df =  ratings_df[ratings_df['rating'] >= 3.5]
# ratings_df['rating'] = ratings_df['rating'].apply(lambda x: 1 if x >= 3.5 else 0)
print(f'New shape: {ratings_df.shape}')

Filtering users with less than 8.0 ratings.
Filtering movies with less than 66.0 ratings.
Original shape: (6935908, 3)
New shape: (6440009, 3)


# Merge the two dataframes
Now we have a dataframe with all the information.

In [40]:
# Attach the title to every rating. The join key is spelled out so the merge
# cannot silently start joining on additional columns if either frame later
# gains another shared column name; 'inner' is pandas' default join type.
# NOTE(review): this assumes `movieId` in the ratings and `id` in the movie
# metadata come from the same identifier space -- confirm how the two CSVs
# were prepared (in the original Kaggle dataset the ratings reportedly use
# MovieLens ids and the metadata TMDB ids, linked through links.csv).
data = pd.merge(movies_df, ratings_df, on="movieId", how="inner")
data.head()

Unnamed: 0,movieId,title,userId,rating
0,862,Toy Story,1923,3.0
1,862,Toy Story,2103,5.0
2,862,Toy Story,5380,1.0
3,862,Toy Story,6177,4.0
4,862,Toy Story,6525,4.0


# Data transformation
Here we create a dataframe where every row represents a user and every column a movie. This lets us compute correlations between movie columns, so that suggestions are based on how similarly two movies were rated across users.

In [41]:
# Build the user x title rating matrix: one row per userId, one column per
# title. When several rows share a user/title pair, pivot_table's default
# aggregation (the mean) is applied. Cells without a rating are set to 0.
pivot_table = data.pivot_table(
    values="rating",
    index=["userId"],
    columns=["title"],
).fillna(0)
pivot_table.shape

(146726, 1743)

## Get recommendations

Here we compute the Pearson correlation between the column that corresponds to the given movie and every other column. The first 6 movies are shown (the first one is always the given movie itself, since its correlation with itself is 1.0), which gives the 5 movies most correlated with it based on the ratings users gave.

In [42]:

def get_recommendation(movie, top_n=6, ratings=None):
    """Print the titles whose rating columns correlate most with ``movie``.

    The Pearson correlation between the ``movie`` column and every column of
    the ratings matrix is computed, sorted in descending order, and the first
    ``top_n`` entries are printed.

    Parameters
    ----------
    movie : str
        Column label (title) of the movie to find neighbours for.
    top_n : int, default 6
        Number of rows to print. The queried movie is part of the matrix, so
        it appears in the listing with correlation 1.0; the default of 6
        therefore shows the movie plus five other titles.
    ratings : pandas.DataFrame, optional
        Matrix with one column per title. When omitted, the notebook-level
        ``pivot_table`` is used.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    KeyError
        If ``movie`` is not a column of the ratings matrix.
    """
    if ratings is None:
        # Fall back to the matrix built earlier in the notebook.
        ratings = pivot_table
    if movie not in ratings.columns:
        raise KeyError(f"Movie {movie!r} is not a column of the ratings matrix.")
    movie_watched = ratings[movie]
    similarity_with_other_movies = ratings.corrwith(movie_watched).sort_values(ascending=False)
    print(similarity_with_other_movies.head(top_n))

In [43]:
# Show the titles most correlated with "Die Hard 2".
movie = "Die Hard 2"
get_recommendation(movie=movie)

title
Die Hard 2            1.000000
Parenthood            0.490138
Rope                  0.438043
Rambo III             0.436231
The Great Outdoors    0.375681
Addicted to Love      0.371151
dtype: float64


In [44]:
# Show the titles most correlated with "Young and Innocent".
movie = "Young and Innocent"
get_recommendation(movie=movie)

title
Young and Innocent                                          1.000000
License to Wed                                              0.413809
5 Card Stud                                                 0.393295
Mothra vs. Godzilla                                         0.390459
Rope                                                        0.381726
Shriek If You Know What I Did Last Friday the Thirteenth    0.380771
dtype: float64


In [45]:
# Show the titles most correlated with "Terminator 3: Rise of the Machines".
movie = "Terminator 3: Rise of the Machines"
get_recommendation(movie=movie)
    

title
Terminator 3: Rise of the Machines    1.000000
Point Break                           0.426371
The Million Dollar Hotel              0.387339
The Talented Mr. Ripley               0.346179
A River Runs Through It               0.331954
License to Wed                        0.329634
dtype: float64


In [47]:

# Show the titles most correlated with "Rocky Balboa".
movie = "Rocky Balboa"
get_recommendation(movie=movie)

title
Rocky Balboa            1.000000
Mothra vs. Godzilla     0.354468
Fools Rush In           0.343406
Bridge to Terabithia    0.330150
Jacob's Ladder          0.321013
The Thirteenth Floor    0.307325
dtype: float64


In [48]:

# Show the titles most correlated with "Reservoir Dogs".
movie = "Reservoir Dogs"
get_recommendation(movie=movie)

title
Reservoir Dogs               1.000000
Wag the Dog                  0.533294
Big Fish                     0.526468
Psycho                       0.513535
Titanic                      0.495617
A Nightmare on Elm Street    0.473788
dtype: float64


In [49]:
# Titles queried in the preceding cells:
# "Die Hard 2", "Terminator 3: Rise of the Machines", "Young and Innocent", "Reservoir Dogs", "Rocky Balboa"

# Show the titles most correlated with "Spider-Man 2".
movie = "Spider-Man 2"
get_recommendation(movie=movie)

title
Spider-Man 2          1.000000
Some Like It Hot      0.231414
Me, Myself & Irene    0.223979
Death Machine         0.221533
Cop Land              0.216437
I, Robot              0.210806
dtype: float64
