# Recommenders Project (Content-based & Collaborative)

# 1. Importing Libraries & Loading Data

In [1]:
# Import necessary libraries for mathematical operations and data manipulation
from math import sqrt
import pandas as pd
import numpy as np

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')



# Load Movie and Rating Datasets

In [3]:
# Define file paths
movies_file = 'movies.csv'
ratings_file = 'ratings.csv'

# Load datasets
movies_df = pd.read_csv(movies_file)
ratings_df = pd.read_csv(ratings_file)

# 2. Data Preprocessing

# 2.1 Explore Movie Data

In [4]:
# Display first few rows of movies dataframe
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# 2.2 Clean Movie Titles and Extract Year

In [5]:
# Extract release year from title and clean title column
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))', expand=False)
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)', expand=False)
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '', regex=True)
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

# Display cleaned movie data
movies_df.head(10)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
5,6,Heat,Action|Crime|Thriller,1995
6,7,Sabrina,Comedy|Romance,1995
7,8,Tom and Huck,Adventure|Children,1995
8,9,Sudden Death,Action,1995
9,10,GoldenEye,Action|Adventure|Thriller,1995


# 2.3 Process Genre Information

In [6]:
# Convert genres string to list
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


# 2.4 One-Hot Encode Genres

In [7]:
# Create a copy for one-hot encoding
moviesWithGenres_df = movies_df.copy()

# Apply one-hot encoding for genres
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1

# Fill NaN values with 0 (genre not present)
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head(4)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 2.5 Clean Ratings Data

In [8]:
# Explore ratings data
ratings_df.head()

# Remove timestamp column (not needed for our analysis)
ratings_df = ratings_df.drop('timestamp', axis=1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
#Cleaning the rartings file
ratings_df = ratings_df.drop('timestamp',axis = 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# 3. Content-Based Filtering

Content-based filtering recommends items similar to what the user has liked in the past based on item features.

# 3.1 Define User Preferences

In [9]:
# Define user input with movie ratings (scale: 0.5-5.0)
userInput = [
    {'title': 'Help!', 'rating': 3.5},
    {'title': 'Shutter Island', 'rating': 5},
    {'title': 'Spider-Man 2', 'rating': 2},
    {'title': "Prestige, The", 'rating': 4},
    {'title': 'Vertigo', 'rating': 4.5},
    {'title': 'Dark Knight, The', 'rating': 2.5},
    {'title': 'Forrest Gump', 'rating': 4},
    {'title': "What's Eating Gilbert Grape ", 'rating': 3},
    {'title': 'Memento', 'rating': 4.5},
    {'title': 'American Psycho', 'rating': 5},
    {'title': 'Fight Club', 'rating': 4.5}
]

# Convert to DataFrame
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,Help!,3.5
1,Shutter Island,5.0
2,Spider-Man 2,2.0
3,"Prestige, The",4.0
4,Vertigo,4.5
5,"Dark Knight, The",2.5
6,Forrest Gump,4.0
7,What's Eating Gilbert Grape,3.0
8,Memento,4.5
9,American Psycho,5.0


# 3.2 Create User Profile Based on Genre Preferences


In [10]:
# Find movie IDs for user-rated movies
inputId = moviesWithGenres_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Merge with user ratings
userMovies = pd.merge(inputMovies, inputId)

# Drop unnecessary columns
userMovies = userMovies.drop('genres', axis=1).drop('year', axis=1)
userMovies

# Extract genre columns only
userGenreTable = userMovies.drop('rating', axis=1).drop('title', axis=1).drop('movieId', axis=1)
userGenreTable.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3.3 Calculate Weighted User Profile

In [11]:
# Create weighted user profile based on ratings
userProfile = userGenreTable.transpose().dot(userMovies['rating'])
userProfile

Adventure              2.0
Animation              0.0
Children               0.0
Comedy                 7.5
Fantasy                0.0
Romance                8.5
Drama                 24.5
Action                 9.0
Crime                 12.0
Thriller              27.5
Horror                 5.0
Mystery               23.0
Sci-Fi                 6.0
War                    4.0
Musical                3.5
Documentary            0.0
IMAX                   4.5
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

# 3.4 Prepare Genre Matrix for All Movies

In [12]:
# Set movieId as index and keep only genre columns
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])
genreTable = genreTable.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
genreTable

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3.5 Generate Content-Based Recommendations

In [13]:
# Calculate recommendation scores using dot product
recommendation_df = pd.DataFrame()
recommendation_df = ((genreTable * userProfile).sum(axis=1)) / (userProfile.sum())
recommendation_df.head()

# Sort by highest recommendation score
recommendation_df = recommendation_df.sort_values(ascending=False)
recommendation_df.head()

movieId
81132    0.806569
79132    0.777372
198      0.744526
26701    0.744526
31921    0.704380
dtype: float64

# 3.6 Display Top Content-Based Recommendations

In [15]:
# Show top 20 recommended movies
top_content_movies = movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(20).keys())]
print(f"Top {len(top_content_movies)} Content-Based Recommendations:")
top_content_movies[['title', 'genres', 'year']]

Top 20 Content-Based Recommendations:


Unnamed: 0,title,genres,year
167,Strange Days,"[Action, Crime, Drama, Mystery, Sci-Fi, Thriller]",1995
1526,"Negotiator, The","[Action, Crime, Drama, Mystery, Thriller]",1998
2109,"Pelican Brief, The","[Crime, Drama, Mystery, Romance, Thriller]",1993
3839,Insomnia,"[Action, Crime, Drama, Mystery, Thriller]",2002
4622,In the Cut,"[Crime, Drama, Mystery, Romance, Thriller]",2003
4654,21 Grams,"[Crime, Drama, Mystery, Romance, Thriller]",2003
4940,Man on Fire,"[Action, Crime, Drama, Mystery, Thriller]",2004
5327,Cellular,"[Action, Crime, Drama, Mystery, Thriller]",2004
5556,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,"[Action, Animation, Crime, Drama, Film-Noir, M...",1989
5569,Prime Suspect 2,"[Crime, Drama, Mystery, Romance, Thriller]",1992


# Interpretation:
The content-based filter recommends movies with similar genres to those the user rated highly (especially Thriller, Mystery, Crime, and Drama genres).

# Collaborative Filltering
Collaborative filtering recommends items based on the preferences of similar users.

# 4.1 Prepare Data for Collaborative Filtering

In [16]:
# Create simplified movies dataframe
movies_df2 = movies_df.drop('genres', axis=1)
movies_df2.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


# 4.2 Find Users with Similar Movie Ratings

In [18]:
# Find users who rated the same movies as our input user
userSubset = ratings_df[ratings_df['movieId'].isin(userMovies['movieId'].tolist())]

# Group by user and sort by number of common ratings
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)

# Select top 100 users with most common ratings
userSubsetGroup = userSubsetGroup[0:100]
userSubsetGroup

[(105,        userId  movieId  rating
  16231     105      356     3.5
  16243     105      903     4.0
  16320     105     2959     5.0
  16336     105     3535     3.5
  16377     105     4226     3.5
  16512     105     8636     3.5
  16605     105    48780     5.0
  16645     105    58559     3.5
  16699     105    74458     5.0), (249,        userId  movieId  rating
  36401     249      356     4.5
  36434     249      903     4.0
  36580     249     2959     5.0
  36604     249     3535     5.0
  36641     249     4226     5.0
  36779     249     8636     4.0
  36878     249    48780     3.5
  36950     249    58559     5.0
  37039     249    74458     5.0), (274,        userId  movieId  rating
  39300     274      356     4.5
  39716     274     2946     3.5
  39720     274     2959     5.0
  39791     274     3535     5.0
  39879     274     4226     4.5
  40167     274     8636     4.0
  40359     274    48780     3.5
  40444     274    58559     4.5
  40533     274    74458  

# 4.3 Calculate Pearson Correlation for Similarity

In [19]:
# Dictionary to store Pearson correlation values
pearsonCorrelationDict = {}

# Calculate similarity between input user and each other user
for name, group in userSubsetGroup:
    # Sort values to align ratings
    group = group.sort_values(by='movieId')
    inputMovies_sorted = userMovies.sort_values(by='movieId')
    
    # Number of common ratings
    nRatings = len(group)
    
    # Extract ratings for common movies
    temp_df = inputMovies_sorted[inputMovies_sorted['movieId'].isin(group['movieId'].tolist())]
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    
    # Compute Pearson correlation
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2) / float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2) / float(nRatings)
    Sxy = sum(i * j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList) * sum(tempGroupList) / float(nRatings)
    
    # Calculate correlation coefficient
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonCorrelationDict[name] = 0

# Convert to DataFrame
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.405554,105
1,0.303046,249
2,0.441942,274
3,-0.044544,610
4,-0.834036,68


# 4.4 Select Most Similar Users

In [20]:
# Select top 50 most similar users
similarUsers = pearsonDF.sort_values('similarityIndex', ascending=False)[0:50]
similarUsers.head()

# Merge with ratings to get their movie preferences
similarUsers = pd.merge(similarUsers, ratings_df)
similarUsers.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,64,1,4.0
1,1.0,64,3,3.5
2,1.0,64,6,4.5
3,1.0,64,16,5.0
4,1.0,64,19,3.5


 # 4.5 Calculate Weighted Ratings

In [21]:
# Apply similarity score as weight to ratings
similarUsers['weightedRating'] = similarUsers['similarityIndex'] * similarUsers['rating']
similarUsers.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,64,1,4.0,4.0
1,1.0,64,3,3.5,3.5
2,1.0,64,6,4.5,4.5
3,1.0,64,16,5.0,5.0
4,1.0,64,19,3.5,3.5


# 4.6 Generate Collaborative Recommendations

In [22]:
# Aggregate weighted ratings by movie
tempSimilarUsers = similarUsers.groupby('movieId')[['similarityIndex', 'weightedRating']].sum()

# Calculate weighted average scores
recommendation2_df = pd.DataFrame()
recommendation2_df['weightedAverageScore'] = tempSimilarUsers['weightedRating'] / tempSimilarUsers['similarityIndex']
recommendation2_df['movieId'] = tempSimilarUsers.index
recommendation2_df.head()

# Sort by highest weighted average score
recommendation2_df = recommendation2_df.sort_values('weightedAverageScore', ascending=False)[0:50]
recommendation2_df.head()

Unnamed: 0_level_0,weightedAverageScore,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3435,5.0,3435
170705,5.0,170705
71131,5.0,71131
5889,5.0,5889
5888,5.0,5888


# 4.7 Display Top Collaborative Recommendations

In [23]:
# Show top 10 collaborative recommendations
top_collab_movies = movies_df.loc[movies_df['movieId'].isin(recommendation2_df.head(10)['movieId'].tolist())]
print(f"Top {len(top_collab_movies)} Collaborative Filtering Recommendations:")
top_collab_movies[['title', 'genres', 'year']]

Top 10 Collaborative Filtering Recommendations:


Unnamed: 0,title,genres,year
1531,Out of the Past,[Film-Noir],1947
2568,Double Indemnity,"[Crime, Drama, Film-Noir]",1944
2639,All the Vermeers in New York,"[Comedy, Drama, Romance]",1990
4108,Brother (Brat),"[Crime, Drama]",1997
4109,"Cruel Romance, A (Zhestokij Romans)","[Drama, Romance]",1984
7123,"Most Hated Family in America, The",[Documentary],2007
8813,George Carlin: Life Is Worth Losing,[Comedy],2005
8815,Love and Pigeons,"[Comedy, Romance]",1985
9317,The Bremen Town Musicians,"[Animation, Drama, Fantasy]",1969
9497,Band of Brothers,"[Action, Drama, War]",2001


# 5. Results Comparison<a id="5-results-comparison"></a>

## Content-Based Filtering Results:
### Approach: 
Based on movie attributes (genres)
### Strengths:
Good for new items, no cold start for items, transparent recommendations
### Recommendations:
Focus on movies with similar genre combinations to user's preferences

## Collaborative Filtering Results:
### Approach: 
Based on user behavior and similar users' preferences

### Strengths: 
Can find unexpected recommendations, works well with diverse tastes

### Recommendations:
Based on what similar users liked, even if genres differ