# TITLE

# CONTENTS

# BUSINESS UNDERSTANDING

# DATA UNDERSTANDING

# DATA PREPARATION

## EDA of DataFrames

In [1]:
# Import necessary libraries for data preparation

import pandas as pd
import numpy as np



In [2]:
ratings_df = pd.read_csv('Data/ratings.csv')
movies_df = pd.read_csv('Data/movies.csv')
tags_df = pd.read_csv('Data/tags.csv')
links_df = pd.read_csv('Data/links.csv')

In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings_df.shape

(100836, 4)

In [5]:
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [6]:
ratings_df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [7]:
ratings_df.value_counts()

userId  movieId  rating  timestamp 
1       1        4.0     964982703     1
434     4993     5.0     1270604133    1
        4963     4.0     1270604560    1
        4896     2.5     1270604915    1
        4886     4.5     1270604658    1
                                      ..
227     58303    4.0     1447210409    1
        56782    4.5     1447210013    1
        56367    4.5     1447210824    1
        55820    4.0     1447209881    1
610     170875   3.0     1493846415    1
Name: count, Length: 100836, dtype: int64

In [8]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
movies_df.shape

(9742, 3)

In [10]:
movies_df.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [11]:
movies_df.value_counts()

movieId  title                                                  genres                                     
1        Toy Story (1995)                                       Adventure|Animation|Children|Comedy|Fantasy    1
53322    Ocean's Thirteen (2007)                                Crime|Thriller                                 1
53129    Mr. Brooks (2007)                                      Crime|Drama|Thriller                           1
53138    Librarian: Return to King Solomon's Mines, The (2006)  Action|Adventure|Fantasy                       1
53140    Librarian: Quest for the Spear, The (2004)             Action|Adventure|Comedy|Fantasy|Romance        1
                                                                                                              ..
4390     Rape Me (Baise-moi) (2000)                             Crime|Drama|Thriller                           1
4392     Alice (1990)                                           Comedy|Drama|Fantasy|Romance         

In [12]:
movies_df.duplicated().sum()

0

In [13]:
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [14]:
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Collaborative Filtering Model: Pre-processing, Feature Engineering & Model Deployment

### Pre-process 'ratings_df'

In [15]:
# Filter out movies with ratings under 3.5 stars
min_rating_threshold = 3.5
filtered_ratings_df = ratings_df[ratings_df['rating'] >= min_rating_threshold]

# Filter out movies with too few ratings (adjust min_ratings as needed)
min_ratings = 50
filtered_ratings_df = filtered_ratings_df.groupby('movieId').filter(lambda x: len(x) >= min_ratings)

### Feature engineering 'filtered_ratings_df'

In [16]:
# Calculate the average rating for each movie. This feature can provide insights into the overall popularity 
# and user sentiment toward each movie
average_ratings = filtered_ratings_df.groupby('movieId')['rating'].mean().reset_index()
average_ratings.rename(columns={'rating': 'avg_rating'}, inplace=True)
filtered_ratings_df = filtered_ratings_df.merge(average_ratings, on='movieId', how='left')

In [17]:
# Create a feature that represents the number of ratings each movie has received. This can help the model 
# understand how well-known and frequently rated a movie is
num_ratings = filtered_ratings_df.groupby('movieId')['rating'].count().reset_index()
num_ratings.rename(columns={'rating': 'num_ratings'}, inplace=True)
filtered_ratings_df = filtered_ratings_df.merge(num_ratings, on='movieId', how='left')

In [18]:
# Calculate the average rating given by each user. This feature can capture each user's general rating behavior
average_user_ratings = filtered_ratings_df.groupby('userId')['rating'].mean().reset_index()
average_user_ratings.rename(columns={'rating': 'avg_user_rating'}, inplace=True)
filtered_ratings_df = filtered_ratings_df.merge(average_user_ratings, on='userId', how='left')

In [19]:
# Create a feature that represents the strength of the interaction between a user and an item. This can be 
# calculated by multiplying the user's rating by the number of ratings given by the user
filtered_ratings_df['interaction_strength'] = filtered_ratings_df['rating'] * filtered_ratings_df['num_ratings']

In [20]:
filtered_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,avg_rating,num_ratings,avg_user_rating,interaction_strength
0,1,1,4.0,964982703,4.284848,165,4.685714,660.0
1,1,6,4.0,964982224,4.322368,76,4.685714,304.0
2,1,47,5.0,964983815,4.369427,157,4.685714,785.0
3,1,50,5.0,964982931,4.497126,174,4.685714,870.0
4,1,110,4.0,964982176,4.422872,188,4.685714,752.0


In [21]:
filtered_ratings_df = filtered_ratings_df.drop('timestamp', axis=1)

In [22]:
unique_user_ids = filtered_ratings_df['userId'].nunique()
unique_movie_ids = filtered_ratings_df['movieId'].nunique()

print(f"Number of unique user IDs: {unique_user_ids}")
print(f"Number of unique movie IDs: {unique_movie_ids}")

Number of unique user IDs: 603
Number of unique movie IDs: 242


In [23]:
filtered_ratings_df.head()

Unnamed: 0,userId,movieId,rating,avg_rating,num_ratings,avg_user_rating,interaction_strength
0,1,1,4.0,4.284848,165,4.685714,660.0
1,1,6,4.0,4.322368,76,4.685714,304.0
2,1,47,5.0,4.369427,157,4.685714,785.0
3,1,50,5.0,4.497126,174,4.685714,870.0
4,1,110,4.0,4.422872,188,4.685714,752.0


# merge movies_df with filtered_ratings_df, plot distributions and feature interactions

## Collaborative Filtering
### Model 1: Surprise (Matrix Factorization with SVD)

In [24]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict

In [25]:
# Set a fixed random seed for reproducibility
random_seed = 42

# Create a Surprise Dataset
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(filtered_ratings_df[['userId', 'movieId', 'rating']], reader)

# Split the dataset into train and test sets with the fixed random seed
trainset, testset = train_test_split(data, test_size=0.2, random_state=random_seed)

# Train the SVD model (or any other collaborative filtering model) with the fixed random seed
first_model = SVD(random_state=random_seed)
first_model.fit(trainset)

# Make predictions on the test set
predictions = first_model.test(testset)

In [26]:
# Calculate and print RMSE and MAE evaluation metrics
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 0.4718
MAE:  0.3990
RMSE: 0.4718
MAE: 0.3990


In [27]:
# Function to get top-N recommendations
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Get top-N recommendations using the trained model
top_n = get_top_n(predictions, n=10)

# Merge ratings data with movies data
merged_ratings_df = filtered_ratings_df.merge(movies_df[['movieId', 'title']], on='movieId', how='left')

# Function to get recommendations for a specific user
def get_recommendations_for_user(user_id, top_n):
    user_ratings = top_n[user_id]
    recommended_movies = []
    for (movie_id, predicted_rating) in user_ratings:
        movie_title = merged_ratings_df.loc[merged_ratings_df['movieId'] == movie_id, 'title'].values[0]
        recommended_movies.append((movie_title, predicted_rating))
    return recommended_movies

# Plug in the user ID for which you want recommendations
user_id_to_recommend = 100  # Replace with the desired user ID
recommended_movies_for_user = get_recommendations_for_user(user_id_to_recommend, top_n)

# Print recommendations for the specific user
print(f"Top 10 Movie Recommendations for User {user_id_to_recommend}:")
for i, (movie_title, predicted_rating) in enumerate(recommended_movies_for_user, 1):
    print(f"{i}. {movie_title} (Predicted Rating: {predicted_rating:.4f})")

Top 10 Movie Recommendations for User 100:
1. Princess Bride, The (1987) (Predicted Rating: 4.4257)
2. Saving Private Ryan (1998) (Predicted Rating: 4.2670)
3. Back to the Future (1985) (Predicted Rating: 4.2313)
4. Amadeus (1984) (Predicted Rating: 4.2033)
5. Batman (1989) (Predicted Rating: 4.2022)
6. Austin Powers: The Spy Who Shagged Me (1999) (Predicted Rating: 4.1204)
7. Catch Me If You Can (2002) (Predicted Rating: 4.0901)
8. Groundhog Day (1993) (Predicted Rating: 4.0269)
9. Jerry Maguire (1996) (Predicted Rating: 4.0081)
10. Finding Nemo (2003) (Predicted Rating: 3.9516)
