In [None]:
# https://medium.com/grabngoinfo/recommendation-system-item-based-collaborative-filtering-f5078504996a
# https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
# https://www.youtube.com/watch?v=fEd1p8-3S7w&ab_channel=JonCavallieMester

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load data

rating df

In [None]:
# Read the ratings table from ratings.csv (path is relative to the working directory).
# Later cells use its userId, movieId and rating columns.
df_rating = pd.read_csv('ratings.csv')

In [None]:
# (rows, columns) of the ratings table.
df_rating.shape

In [None]:
# Preview the first rows of the ratings table.
df_rating.head()

movie name df

In [None]:
# Read the movie lookup table from movies.csv (path is relative to the working directory).
# Later cells use its movieId and title columns.
df_name = pd.read_csv('movies.csv')

In [None]:
# (rows, columns) of the movie table.
df_name.shape

In [None]:
# Preview the first rows of the movie table.
df_name.head()

join both

In [None]:
df = pd.merge(df_rating, df_name, on='movieId', how='inner')

In [None]:
df = df[['userId','movieId','title','rating']]

In [None]:
df.shape

In [None]:
df.head()

# 2. Check data quality

data type

In [None]:
# Print column dtypes and non-null counts for the joined table.
df.info()

unique value

In [None]:
df.shape

In [None]:
for i in df.columns:
    print('Columns name: ', i)
    print('Unique value: ', df[i].unique())
    print('Count unique value: ', df[i].nunique())
    print('-'*10)

missing value

In [None]:
# Number of missing entries in each column.
df.isna().sum()

blank value

In [None]:
# Number of empty-string entries in each column.
df.eq('').sum()

check movie id

In [None]:
movieid_counts = df.groupby('title').agg(number_movieid=('movieId', pd.Series.nunique)).reset_index()

In [None]:
multiple_movieid = movieid_counts[movieid_counts['number_movieid'] > 1]

In [None]:
multiple_movieid

In [None]:
max_movieid_df = df.groupby('title')['movieId'].max().reset_index()

In [None]:
df = pd.merge(df, max_movieid_df, on='title', how='inner')

In [None]:
df = df.rename(columns={'movieId_y': 'movieId'})

In [None]:
df = df[['userId','movieId','title','rating']]

In [None]:
df

check title

In [None]:
title_counts = df.groupby('movieId').agg(number_title=('title', pd.Series.nunique)).reset_index()

In [None]:
multiple_title = title_counts[title_counts['number_title'] > 1]

In [None]:
multiple_title

unique value (again)

In [None]:
df.shape

In [None]:
for i in df.columns:
    print('Columns name: ', i)
    print('Unique value: ', df[i].unique())
    print('Count unique value: ', df[i].nunique())
    print('-'*10)

In [None]:
df

# 3. Exploratory data analysis (EDA)

columns name

In [None]:
# Column labels of the working table.
df.columns

rating max/min/distribution

In [None]:
df['rating'].max()

In [None]:
df['rating'].min()

In [None]:
plt.figure(figsize=(8, 6))
plt.hist(df['rating'], bins=10, color='skyblue', edgecolor='black')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Distribution of Ratings')
plt.grid(True)
plt.show()

average rating by title

In [None]:
avg_rating_by_title = df.groupby('title')['rating'].mean()

In [None]:
avg_rating_by_title_sorted = avg_rating_by_title.sort_values(ascending=False)

In [None]:
avg_rating_by_title_sorted

average rating by userId

In [None]:
avg_rating_by_userId = df.groupby('userId')['rating'].mean()

In [None]:
avg_rating_by_userId_sorted = avg_rating_by_userId.sort_values(ascending=False)

In [None]:
avg_rating_by_userId_sorted

count userId by title

In [None]:
count_userId_by_title = df.groupby('title')['userId'].nunique()

In [None]:
count_userId_by_title_sorted = count_userId_by_title.sort_values(ascending=False)

In [None]:
count_userId_by_title_sorted

count title by userId

In [None]:
count_title_by_userId = df.groupby('userId')['title'].nunique()

In [None]:
count_title_by_userId_sorted = count_title_by_userId.sort_values(ascending=False)

In [None]:
count_title_by_userId_sorted

# 4. Model - user based

pivot

In [None]:
user_item_metrix = df.pivot_table(index='userId', columns='title', values='rating')

In [None]:
user_item_metrix

mean normalize

In [None]:
# Subtract each user's mean rating, like KNNWithMeans in Surprise.
# Since some people tend to give higher ratings than others, we normalize each rating by subtracting that user's average rating.

In [None]:
avg_rating_by_userId_sorted

In [None]:
user_item_metrix_norm = user_item_metrix.subtract(avg_rating_by_userId_sorted, axis = 'rows')

In [None]:
user_item_metrix_norm

In [None]:
user_item_metrix_norm_filled = user_item_metrix_norm.fillna(0)

In [None]:
user_item_metrix_norm_filled

similarity

In [None]:
user_similarity = cosine_similarity(user_item_metrix_norm_filled)

In [None]:
user_item_metrix_norm_filled.index

In [None]:
user_similarity

In [None]:
user_similarity = pd.DataFrame(user_similarity, index=user_item_metrix_norm_filled.index, columns=user_item_metrix_norm_filled.index)

In [None]:
user_similarity

# 5. Prediction - user based

setup

In [None]:
# userId of the user to generate recommendations for.
picked_userid = 407

In [None]:
# Number of most-similar users (k) to use as neighbours.
k = 10

top similar user

In [None]:
similar_users = user_similarity[picked_userid].sort_values(ascending=False)[:k]

In [None]:
similar_users

unwatched movie

In [None]:
picked_userid_unwatched = user_item_metrix_norm.T[407][user_item_metrix_norm.T[picked_userid].isna()]

In [None]:
picked_userid_unwatched

In [None]:
user_item_metrix_norm

score

In [None]:
similar_users.index.tolist()

In [None]:
len(picked_userid_unwatched.index.tolist())

In [None]:
similar_user_movies = user_item_metrix_norm.loc[similar_users.index.tolist()][picked_userid_unwatched.index.tolist()]

In [None]:
similar_user_movies

In [None]:
# Accumulators filled by the scoring loop: title, mean-centred score, and the
# score shifted back by the picked user's mean.
item_list, item_score_list, item_score_predicted_list = [], [], []

In [None]:
# Score every title the picked user has not rated as the similarity-weighted
# average of the neighbours' mean-centred ratings.
# The accumulators are (re)created here so that re-running this cell does not
# append a second copy of every title to lists built in an earlier cell.
item_list = []
item_score_list = []
item_score_predicted_list = []

# Picked user's own mean rating; added back to move a score onto the rating scale.
picked_user_mean = avg_rating_by_userId_sorted[picked_userid]

for i in similar_user_movies.columns:
    sum_value_weight = 0
    sum_weight = 0
    movie_rating = similar_user_movies[i]
    for u in similar_users.index:
        # Neighbours who have not rated this title contribute nothing.
        if pd.notna(movie_rating[u]):
            sum_value_weight += similar_users[u] * movie_rating[u]
            sum_weight += similar_users[u]

    # No usable neighbour rating (total weight is 0): fall back to a score of 0,
    # which predicts the picked user's own mean.
    # NOTE(review): assumes the top-k similarities are positive; confirm if k grows.
    if sum_weight == 0:
        item_score = 0
    else:
        item_score = sum_value_weight / sum_weight

    item_list.append(i)
    item_score_list.append(item_score)
    item_score_predicted_list.append(item_score + picked_user_mean)

In [None]:
df_predict = pd.DataFrame({
    'title': item_list,
    'ranking': item_score_list,
    'ranking_predicted':item_score_predicted_list
})

In [None]:
df_predict.sort_values(by='ranking', ascending=False).iloc[:5]

In [None]:
df_predict.sort_values(by='ranking_predicted', ascending=False).iloc[:5]

In [None]:
# **** (divider kept as a comment: a bare "****" in a code cell is a SyntaxError and stops Run All)

# 6. Model - item based

pivot

In [None]:
item_user_metrix = df.pivot_table(index='title', columns='userId', values='rating')

In [None]:
item_user_metrix

mean normalize

In [None]:
# Subtract each movie's mean rating, like KNNWithMeans in Surprise.

In [None]:
avg_rating_by_title_sorted

In [None]:
item_user_metrix_norm = item_user_metrix.subtract(avg_rating_by_title_sorted, axis = 'rows')

In [None]:
item_user_metrix_norm

In [None]:
item_user_metrix_norm_filled = item_user_metrix_norm.fillna(0)

similarity

In [None]:
item_user_metrix_norm_filled

In [None]:
item_similarity = cosine_similarity(item_user_metrix_norm_filled)

In [None]:
item_user_metrix_norm_filled.index

In [None]:
item_similarity

In [None]:
item_similarity = pd.DataFrame(item_similarity, index=item_user_metrix_norm_filled.index, columns=item_user_metrix_norm_filled.index)

In [None]:
item_similarity

In [None]:
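# Some diagonal entries are 0 rather than 1: a movie whose mean-centred ratings are all zero is a zero vector, and its cosine similarity with itself comes out as 0.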

# 7. Prediction - item based

setup

In [None]:
# userId of the user to generate recommendations for.
picked_userid = 407

In [None]:
# Number of most-similar users (k) to keep.
k = 10

top similar user

In [None]:
similar_users = user_similarity[picked_userid].sort_values(ascending=False)[:k]

In [None]:
similar_users

watched movie

In [None]:
# Title x user matrix of mean-centred ratings (NaN where a user has not rated a title).
item_user_metrix_norm

In [None]:
# *** (divider kept as a comment: a bare "***" in a code cell is a SyntaxError and stops Run All)

In [None]:
picked_title_watched = user_item_metrix_norm.T[407][user_item_metrix_norm.T[picked_userid].isna()]

In [None]:
picked_userid_unwatched

In [None]:
user_item_metrix_norm