<a href="https://colab.research.google.com/github/nancymahmoud1/Elevvo/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Load and Merge Data**

In [2]:
import kagglehub
import pandas as pd

from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
movies_file_path = "/content/drive/MyDrive/Elevvo/movies-recommendation-dataset/movies.csv"
movies = pd.read_csv(movies_file_path)

ratings_file_path = "/content/drive/MyDrive/Elevvo/movies-recommendation-dataset/ratings.csv"
ratings = pd.read_csv(ratings_file_path)

# Merge movie titles into the ratings based on movieId
df = pd.merge(ratings, movies, on='movieId')
print("First 5 records:", df.head())


First 5 records:    userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [3]:
# Get the number of missing values in every column
movies.isnull().sum()
ratings.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


In [4]:
# Split the Ratings into Train/Test per User
from sklearn.model_selection import train_test_split

# We train the model on 80% of each user’s ratings.
# We test it on the 20% that we held out for the same users.
def split_train_test(df, test_size=0.2):
    train_list = []
    test_list = []

    for user_id, group in df.groupby('userId'):
        if len(group) < 5:
            train_list.append(group)
            continue
        train, test = train_test_split(group, test_size=test_size, random_state=42)
        train_list.append(train)
        test_list.append(test)

    train_df = pd.concat(train_list)
    test_df = pd.concat(test_list)
    return train_df, test_df

train_df, test_df = split_train_test(df)


# **Create User-Item Matrix**

In [5]:
user_item_matrix = train_df.pivot_table(
    index='userId',
    columns='title',
    values='rating').fillna(0)

# **Compute User Similarity (Cosine Similarity)**

Why Cosine Similarity?

> It measures the angle between two rating vectors, not the difference in magnitude.

> That’s great for recommendation systems because users might have different rating styles (e.g., one gives 5s and 4s, another gives 3s and 2s), but still like the same things.



In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# We find users who are similar to each other by comparing their ratings.
# If two users rated many movies similarly, we consider them similar.
user_similarity = cosine_similarity(user_item_matrix)
user_sim_df = pd.DataFrame(
    user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)


# **Implementation (Precision@K)**

In [7]:
def get_top_k_recommendations(user_id, user_item_matrix, k=10):
    similar_users = user_sim_df.loc[user_id].drop(user_id).sort_values(ascending=False).head(5)
    weighted_sum = pd.Series(dtype=float)

    for sim_user, score in similar_users.items():
        weighted_sum = weighted_sum.add(user_item_matrix.loc[sim_user] * score, fill_value=0)

    weighted_sum = weighted_sum / similar_users.sum()
    already_seen = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommendations = weighted_sum.drop(labels=already_seen).sort_values(ascending=False).head(k)
    return recommendations.index.tolist()


In [8]:
# Calculate Precision at K
def precision_at_k(user_id, k=10):
    recommended = get_top_k_recommendations(user_id, user_item_matrix, k)
    actual = test_df[test_df['userId'] == user_id]['title'].tolist()

    if not actual:
        return None  # no test data for this user

    relevant_and_recommended = set(recommended).intersection(set(actual))
    return len(relevant_and_recommended) / k

In [9]:
# Evaluate on all users
precisions = []
for user_id in user_item_matrix.index:
    p = precision_at_k(user_id, k=10)
    if p is not None:
        precisions.append(p)

avg_precision_at_k = sum(precisions) / len(precisions)
print(f"Average Precision@10: {avg_precision_at_k:.4f}")

Average Precision@10: 0.2310


In [10]:
# Function to get similar users
def get_similar_users(user_id, top_n=5):
    sim_scores = user_sim_df.loc[user_id].drop(user_id)
    return sim_scores.sort_values(ascending=False).head(top_n)


In [11]:
# Recommend Unseen Movies Based on Similar Users
def recommend_movies(user_id, top_n=10):
    similar_users = get_similar_users(user_id)
    weighted_sum = pd.Series(dtype=float)

    for sim_user, score in similar_users.items():
        user_ratings = user_item_matrix.loc[sim_user]
        weighted_sum = weighted_sum.add(user_ratings * score, fill_value=0)

    # Normalize
    weighted_sum = weighted_sum / similar_users.sum()

    # Filter out already watched movies
    seen_movies = user_item_matrix.loc[user_id]
    unseen = weighted_sum[seen_movies == 0]

    return unseen.sort_values(ascending=False).head(top_n)


In [13]:
recommend_movies(user_id=42, top_n=5)


Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Aliens (1986),4.404907
Office Space (1999),4.403742
Austin Powers: International Man of Mystery (1997),4.389126
"Terminator, The (1984)",4.29402
Jaws (1975),3.985246


The system is predicting user 42’s potential rating for a movie they haven’t seen, by looking at :
- How similar users rated it
- How similar those users are to user 42