In [None]:
#!pip install numpy==1.24.4


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#! wget https://files.grouplens.org/datasets/movielens/ml-20m.zip

In [None]:
#! unzip ml-20m.zip

In [None]:
movies = pd.read_csv('ml-20m/movies.csv')
ratings = pd.read_csv('ml-20m/ratings.csv', nrows=100_000)
tags = pd.read_csv('ml-20m/tags.csv')

In [None]:
sampled_movie_ids = ratings['movieId'].unique()
movies = movies[movies['movieId'].isin(sampled_movie_ids)]
tags = tags[tags['movieId'].isin(sampled_movie_ids)]

In [None]:
print("Movies:\n", movies.head())
print("Ratings:\n", ratings.head())
print("Tags:\n", tags.head())


Movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Ratings:
    userId  movieId  rating   timestamp
0       1        2     3.5  1112486027
1       1       29     3.5  1112484676
2       1       32     3.5  1112484819
3       1       47     3.5  1112484727
4       1       50     3.5  1112484580
Tags:
    userId  movieId            tag   timestamp
0      18     4141    Mark Waters  1240597180
1      65      208      dark hero  1368150078
2      

**Step 1: Popularity-Based Recommender**

In [None]:
# Most Rated Movies (by count)
popularity_df = ratings.groupby('movieId').agg({
    'rating': ['count', 'mean']
}).reset_index()

popularity_df.columns = ['movieId', 'rating_count', 'rating_mean']

popularity_df = popularity_df.merge(movies, on='movieId')

most_rated = popularity_df.sort_values('rating_count', ascending=False)

most_rated[['title', 'rating_count', 'rating_mean']].head(10)


Unnamed: 0,title,rating_count,rating_mean
267,Pulp Fiction (1994),350,4.1
323,Forrest Gump (1994),340,4.058824
286,"Shawshank Redemption, The (1994)",305,4.413115
436,Jurassic Park (1993),302,3.639073
538,"Silence of the Lambs, The (1991)",295,4.111864
235,Star Wars: Episode IV - A New Hope (1977),264,4.083333
101,Braveheart (1995),262,3.965649
534,Terminator 2: Judgment Day (1991),256,3.953125
2106,"Matrix, The (1999)",253,4.063241
482,Schindler's List (1993),247,4.263158


In [None]:
#Top Rated Movies with Minimum Threshold
threshold = 1000
top_rated = popularity_df[popularity_df['rating_count'] >= threshold]


top_rated = top_rated.sort_values('rating_mean', ascending=False)


top_rated[['title', 'rating_count', 'rating_mean']].head(10)


Unnamed: 0,title,rating_count,rating_mean


In [None]:
# Weighted Rating Formula (like IMDb)
C = popularity_df['rating_mean'].mean()
m = popularity_df['rating_count'].quantile(0.90)

def weighted_rating(x, m=m, C=C):
    v = x['rating_count']
    R = x['rating_mean']
    return (v / (v + m) * R) + (m / (v + m) * C)

# Filter to qualified movies
qualified = popularity_df[popularity_df['rating_count'] >= m].copy()
qualified['score'] = qualified.apply(weighted_rating, axis=1)


qualified = qualified.sort_values('score', ascending=False)

qualified[['title', 'rating_count', 'rating_mean', 'score']].head(10)

C = popularity_df['rating_mean'].mean()
m = popularity_df['rating_count'].quantile(0.90)

def weighted_rating(x, m=m, C=C):
    v = x['rating_count']
    R = x['rating_mean']
    return (v / (v + m) * R) + (m / (v + m) * C)

qualified = popularity_df[popularity_df['rating_count'] >= m].copy()
qualified['score'] = qualified.apply(weighted_rating, axis=1)

qualified = qualified.sort_values('score', ascending=False)

qualified[['title', 'rating_count', 'rating_mean', 'score']].head(10)


Unnamed: 0,title,rating_count,rating_mean,score
286,"Shawshank Redemption, The (1994)",305,4.413115,4.311775
721,"Godfather, The (1972)",197,4.395939,4.248655
1013,"Godfather: Part II, The (1974)",137,4.419708,4.215237
48,"Usual Suspects, The (1995)",224,4.337054,4.21238
987,One Flew Over the Cuckoo's Nest (1975),148,4.35473,4.173846
482,Schindler's List (1993),247,4.263158,4.15684
2430,Fight Club (1999),207,4.270531,4.145336
753,Rear Window (1954),79,4.398734,4.091235
3922,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",128,4.273438,4.085092
3397,Memento (2000),161,4.214286,4.067773


**Step 2: Content-Based Filtering**

In [None]:
# Rebuild metadata to ensure wider tag coverage
metadata = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(map(str, x))).reset_index()

movies_content = pd.merge(movies, metadata, on='movieId', how='left')
movies_content['tag'] = movies_content['tag'].fillna('').astype(str)

movies_content = movies_content[movies_content['tag'].str.strip() != ''].reset_index(drop=True)



In [None]:
# TF-IDF on the tag column
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies_content['tag'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (7767, 5000)


In [None]:
# Reduce to 100 latent features
svd = TruncatedSVD(n_components=100, random_state=42)
latent_matrix_1 = svd.fit_transform(tfidf_matrix)

print("Latent matrix shape:", latent_matrix_1.shape)

Latent matrix shape: (7767, 100)


In [None]:
# Compute cosine similarity between all movie vectors
cosine_sim = cosine_similarity(latent_matrix_1)

print("Cosine similarity shape:", cosine_sim.shape)


Cosine similarity shape: (7767, 7767)


In [None]:
#Recommendation Function (Content-Based)
def recommend_content(title, top_n=10):
    """Content-based recommender: recommends movies similar to a given title"""
    if title not in movie_indices:
        return f"Movie '{title}' not found in dataset."

    idx = movie_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n + 1]  # Skip the first because it's the same movie

    movie_indices_rec = [i[0] for i in sim_scores]
    return movies_content.iloc[movie_indices_rec][['title']]

# Example
recommend_content("Toy Story (1995)", top_n=10)



Unnamed: 0,title
2377,Toy Story 2 (1999)
6237,Ratatouille (2007)
1767,"Bug's Life, A (1998)"
3793,Ice Age (2002)
3600,"Monsters, Inc. (2001)"
4386,Finding Nemo (2003)
7133,Toy Story 3 (2010)
6874,Up (2009)
2179,Thumbelina (1994)
7618,Monsters University (2013)


**Step 3: Collaborative Filtering**

In [None]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import KNNBasic, SVD
from surprise.accuracy import rmse

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [None]:


# KNN-Based Collaborative Filtering (Item-Item Similarity)
sim_options = {
    'name': 'cosine',
    'user_based': False
}
algo_knn = KNNBasic(sim_options=sim_options)
algo_knn.fit(trainset)

predictions_knn = algo_knn.test(testset)
rmse(predictions_knn)


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0071


1.0071398199083963

In [None]:
#SVD-Based Matrix Factorization
from surprise import SVD

algo_svd = SVD()
algo_svd.fit(trainset)

predictions_svd = algo_svd.test(testset)
rmse(predictions_svd)


RMSE: 0.9152


0.9152116067544253

In [None]:
#Get Top-N Recommendations for a User
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)

    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n




In [None]:
top_n_recs = get_top_n(predictions_svd, n=10)  # Using the 'predictions' from SVD

user_id = 1
user_recs = top_n_recs[user_id]

print(f"Top 10 Movie Recommendations for User {user_id}:\n")
for movie_id, score in user_recs:
    title = movies[movies['movieId'] == int(movie_id)]['title'].values
    if len(title) > 0:
        print(f"{title[0]} — predicted rating: {score:.2f}")
    else:
        print(f"Movie ID {movie_id} — predicted rating: {score:.2f}")


Top 10 Movie Recommendations for User 1:

Butch Cassidy and the Sundance Kid (1969) — predicted rating: 4.25
Die Hard (1988) — predicted rating: 4.19
Donnie Darko (2001) — predicted rating: 4.16
Thing, The (1982) — predicted rating: 4.11
Blade Runner (1982) — predicted rating: 4.10
Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000) — predicted rating: 4.10
Pirates of the Caribbean: The Curse of the Black Pearl (2003) — predicted rating: 4.10
2001: A Space Odyssey (1968) — predicted rating: 4.05
Kill Bill: Vol. 2 (2004) — predicted rating: 4.04
Ran (1985) — predicted rating: 4.04


**Hybrid Model**

In [None]:
movie_indices = pd.Series(movies_content.index, index=movies_content['title']).drop_duplicates()
movie_id_to_index = pd.Series(movies_content.index.values, index=movies_content['movieId']).to_dict()

In [None]:

# Title to index lookup (already created earlier)
def predict_content_score(user_id, title):
    """Compute average similarity score to user's history"""
    if title not in movie_indices:
        return 0
    idx = movie_indices[title]

    user_history = ratings[ratings['userId'] == user_id]
    if user_history.empty:
        return 0

    total_sim = 0
    weighted_sum = 0
    for _, row in user_history.iterrows():
        mid = row['movieId']
        if mid in movie_id_to_index:
            other_idx = movie_id_to_index[mid]
            sim = cosine_sim[idx][other_idx]
            total_sim += sim
            weighted_sum += sim * row['rating']
    return weighted_sum / total_sim if total_sim > 0 else 0


In [None]:
#predict Hybrid Score
def predict_hybrid_score(user_id, movie_id, alpha=0.5):

    title_row = movies[movies['movieId'] == movie_id]
    if title_row.empty:
        return 0
    title = title_row['title'].values[0]


    content_score = predict_content_score(user_id, title)


    try:
        collab_score = algo_svd.predict(user_id, movie_id).est
    except:
        collab_score = 0


    return alpha * content_score + (1 - alpha) * collab_score


In [None]:
#Recommend Top Movies Using Hybrid Score
def recommend_hybrid(user_id, top_n=10, alpha=0.5):
    # Filter candidate movies to only include those in movies_content
    candidate_movies = movies_content['movieId'].unique()  # Change this line
    scored = []

    for movie_id in candidate_movies:
        score = predict_hybrid_score(user_id, movie_id, alpha)
        scored.append((movie_id, score))

    top_scores = sorted(scored, key=lambda x: x[1], reverse=True)[:top_n]

    # Convert to titles
    top_movie_ids = [x[0] for x in top_scores]
    recs = movies[movies['movieId'].isin(top_movie_ids)][['movieId', 'title']]

    hybrid_df = pd.DataFrame(top_scores, columns=['movieId', 'hybrid_score'])
    return pd.merge(recs, hybrid_df, on='movieId').sort_values(by='hybrid_score', ascending=False)

In [None]:
print("Number of movies in movies_content:", movies_content.shape[0])
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Cosine similarity shape:", cosine_sim.shape)



Number of movies in movies_content: 7767
TF-IDF matrix shape: (7767, 5000)
Cosine similarity shape: (7767, 7767)


In [None]:
#example
recommend_hybrid(user_id=1, top_n=10, alpha=0.6)


IndexError: index 28 is out of bounds for axis 0 with size 2