# Item-based collaborative filtering



In [249]:
import pandas as pd
import numpy as np

## Load the ratings data

In [250]:
# Read the ratings export and discard the two columns that are not used
# downstream: the rating timestamp and the index column saved with the CSV.
ratings_df = (
    pd.read_csv('Data/updated_ratings.csv')
    .drop(columns=['timestamp', 'Unnamed: 0'])
)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,858,5.0
1,1,1246,5.0
2,1,1968,4.0
3,1,2762,4.5
4,1,2959,4.0


## Load the movies data

In [251]:
# Keep only the identifier and title of each movie, and rename the identifier
# to `movieId` so it shares a column name with the ratings dataframe.
movies_df = (
    pd.read_csv("Data/merged.csv")[["id", "title"]]
    .rename(columns={'id': 'movieId'})
)
movies_df.head()

Unnamed: 0,movieId,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


## Filter users and movies with few ratings

In [252]:
# Users and movies whose rating count is at or below this quantile of the
# respective count distribution are dropped as too sparse.
ACTIVITY_QUANTILE = 0.4

# Count ratings per user once and reuse the counts for both the threshold and the mask.
user_counts = ratings_df['userId'].value_counts()
users_limit = user_counts.quantile(ACTIVITY_QUANTILE)
# The comparison below is strict, so users with exactly `users_limit` ratings are dropped too.
print(f"Filtering users with {users_limit} or fewer ratings.")
filter_users = user_counts[user_counts > users_limit].index.tolist()

# Same procedure for movies.
movie_counts = ratings_df['movieId'].value_counts()
movies_limit = movie_counts.quantile(ACTIVITY_QUANTILE)
print(f"Filtering movies with {movies_limit} or fewer ratings.")
filter_movies = movie_counts[movie_counts > movies_limit].index.tolist()

print(f'Original shape: {ratings_df.shape}')
# NOTE: `ratings_df` is overwritten here, so re-running this cell without
# reloading the data filters the already-filtered ratings a second time.
keep_rows = ratings_df['movieId'].isin(filter_movies) & ratings_df['userId'].isin(filter_users)
ratings_df = ratings_df[keep_rows]
print(f'New shape: {ratings_df.shape}')

Filtering users with less than 8.0 ratings.
Filtering movies with less than 66.0 ratings.
Original shape: (6935908, 3)
New shape: (6440009, 3)


# Merge the two dataframes
Now we have a dataframe with all the information.

In [253]:
# Inner join on the only column the two frames share, so every remaining
# rating row carries its movie title.
# NOTE(review): the `movieId` column of the movies frame is the renamed `id`
# column of merged.csv; confirm it uses the same identifier scheme as the
# `movieId` column of the ratings file, otherwise titles attach to the wrong ratings.
data = movies_df.merge(ratings_df, on="movieId", how="inner")
data.head()

Unnamed: 0,movieId,title,userId,rating
0,862,Toy Story,1923,3.0
1,862,Toy Story,2103,5.0
2,862,Toy Story,5380,1.0
3,862,Toy Story,6177,4.0
4,862,Toy Story,6525,4.0


# Data transformation
Here we create a dataframe where every row represents a user and every column a movie. From this matrix we can correlate the rating column of one movie with the rating columns of all other movies, and suggest the movies whose ratings correlate most strongly with it.

In [254]:
# Build the user x movie matrix: one row per user, one column per title, each
# cell holding that user's rating (averaged by pivot_table's default aggregation
# if a user/title pair occurs more than once).
# NOTE(review): missing entries are replaced by 0, so later correlations treat
# "not rated" as a rating of 0 - confirm this is intended.
pivot_table = (
    data
    .pivot_table(index=["userId"], columns=["title"], values="rating")
    .fillna(0)
)
pivot_table.shape

(146726, 1743)

In [255]:
## Create the

## Get recommendations

In [256]:

def get_recommendation(movie, top_n=6, ratings_matrix=None):
    """Return the movies whose rating columns correlate most strongly with ``movie``.

    Every column of the user x movie matrix is correlated against the column of
    the requested title (``DataFrame.corrwith``), and the highest-scoring titles
    are returned in descending order of correlation.

    Parameters
    ----------
    movie : str
        Column label (movie title) to find similar movies for.
    top_n : int, default 6
        Number of rows to return. The requested movie is part of the matrix and
        is not removed from the result, so it normally occupies the first row;
        the default therefore yields the movie plus its five closest neighbours.
    ratings_matrix : pandas.DataFrame, optional
        Matrix with one column per movie. Defaults to the notebook-level
        ``pivot_table``.

    Returns
    -------
    pandas.Series
        Correlation scores indexed by column label, sorted from highest to
        lowest and truncated to ``top_n`` rows.

    Raises
    ------
    KeyError
        If ``movie`` is not a column of the ratings matrix.
    """
    if ratings_matrix is None:
        # Fall back to the matrix built earlier in the notebook.
        ratings_matrix = pivot_table
    if movie not in ratings_matrix.columns:
        raise KeyError(f"No ratings column for movie {movie!r}; check the title spelling.")

    movie_watched = ratings_matrix[movie]
    similarity_with_other_movies = ratings_matrix.corrwith(movie_watched)
    # Return instead of printing so the result can be displayed, inspected or reused.
    return similarity_with_other_movies.sort_values(ascending=False).head(top_n)

# Example query: titles most correlated with a classic science-fiction film.
movie = "2001: A Space Odyssey"
get_recommendation(movie=movie)

title
2001: A Space Odyssey    1.000000
Donnie Darko             0.365946
Four Rooms               0.343112
Lolita                   0.332287
The Living Daylights     0.306956
Armageddon               0.290077
dtype: float64


In [257]:
# Example query: titles most correlated with "Pulp Fiction".
movie = "Pulp Fiction"
get_recommendation(movie=movie)

title
Pulp Fiction                       1.000000
The Final Countdown                0.242623
Letters from Iwo Jima              0.237927
Star Trek II: The Wrath of Khan    0.217962
A Streetcar Named Desire           0.211560
Ronin                              0.208437
dtype: float64


In [258]:
# Example query: titles most correlated with "Superman".
movie = "Superman"
get_recommendation(movie=movie)

title
Superman                                    1.000000
The Chronicles of Narnia: Prince Caspian    0.261318
Harry Potter and the Goblet of Fire         0.258294
Westworld                                   0.254305
Dog Day Afternoon                           0.237674
Breaking and Entering                       0.229087
dtype: float64


In [259]:
# NOTE(review): same title as the first example query in this notebook;
# consider removing this cell or querying a different movie.
movie = "2001: A Space Odyssey"
get_recommendation(movie=movie)

title
2001: A Space Odyssey    1.000000
Donnie Darko             0.365946
Four Rooms               0.343112
Lolita                   0.332287
The Living Daylights     0.306956
Armageddon               0.290077
dtype: float64


In [260]:
# NOTE(review): same title as the first example query in this notebook;
# consider removing this cell or querying a different movie.
movie = "2001: A Space Odyssey"
get_recommendation(movie=movie)

title
2001: A Space Odyssey    1.000000
Donnie Darko             0.365946
Four Rooms               0.343112
Lolita                   0.332287
The Living Daylights     0.306956
Armageddon               0.290077
dtype: float64
