In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shubhammehta21/movie-lens-small-latest-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shubhammehta21/movie-lens-small-latest-dataset?dataset_version_number=1...


100%|██████████| 971k/971k [00:00<00:00, 14.2MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1


In [2]:
!ls /root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1

links.csv  movies.csv  ratings.csv  README.txt	tags.csv


In [3]:
import pandas as pd

# Load MovieLens data
ratings = pd.read_csv('/root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1/ratings.csv')  # userId, movieId, rating, timestamp
movies = pd.read_csv('/root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1/movies.csv')   # movieId, title, genres

# Explore
print(ratings.head())
print(ratings.info())
print(f"Users: {ratings['userId'].nunique()}, Movies: {ratings['movieId'].nunique()}")
print(ratings['rating'].describe())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
Users: 610, Movies: 9724
count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

user_similarity = cosine_similarity(user_movie_matrix)

user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)


In [5]:
def recommend_user_based(user_id, user_movie_matrix, user_similarity_df, top_n=5):
    #  similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]  # Exclude self
    similar_users_ids = similar_users.index[:10]  #  10 similar users

    # Get movies rated by similar users but not by target user
    user_ratings = user_movie_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index

    #  ratings as weighted average of similar users' ratings
    predictions = {}
    for movie in unrated_movies:
        sim_scores = user_similarity_df.loc[similar_users_ids, user_id]
        movie_ratings = user_movie_matrix.loc[similar_users_ids, movie]
        weighted_sum = np.dot(sim_scores, movie_ratings)
        sim_sum = sim_scores.sum()
        if sim_sum > 0:
            predictions[movie] = weighted_sum / sim_sum

    # Sort and return top N
    return pd.Series(predictions).sort_values(ascending=False).head(top_n)


In [7]:
print(recommend_user_based(1, user_movie_matrix, user_similarity_df))


589     4.195979
1200    4.119903
2762    3.971041
1610    3.957980
858     3.885992
dtype: float64


In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(ratings, test_size=0.2, random_state=42)
train_matrix = train.pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_matrix = test.pivot(index='userId', columns='movieId', values='rating').fillna(0)

#  evaluation (precision@k)
def precision_at_k(user_id, k=5):
    preds = recommend_user_based(user_id, train_matrix, user_similarity_df, top_n=k)
    actual = test_matrix.loc[user_id]
    actual_positive = actual[actual > 2].index
    pred_positive = preds.index
    hits = len(set(pred_positive) & set(actual_positive))
    return hits / k

print(f"Precision@5 for User 1: {precision_at_k(1)}")

Precision@5 for User 1: 0.2


In [10]:
# item similarity
item_similarity = cosine_similarity(user_movie_matrix.T)  # Transpose for items
item_similarity_df = pd.DataFrame(item_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

def recommend_item_based(user_id, user_movie_matrix, item_similarity_df, top_n=5):
    user_ratings = user_movie_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index

    predictions = {}
    for movie in user_movie_matrix.columns:
        if movie not in rated_movies:
            sim_scores = item_similarity_df.loc[rated_movies, movie]
            weighted_sum = np.dot(sim_scores, user_ratings[rated_movies])
            sim_sum = sim_scores.sum()
            if sim_sum > 0:
                predictions[movie] = weighted_sum / sim_sum

    return pd.Series(predictions).sort_values(ascending=False).head(top_n)

print(recommend_item_based(1, user_movie_matrix, item_similarity_df))

63239     5.0
193587    5.0
193573    5.0
193571    5.0
193567    5.0
dtype: float64


In [11]:
def recommend_hybrid(user_id, user_movie_matrix, user_similarity_df, item_similarity_df, top_n=5):
    user_preds = recommend_user_based(user_id, user_movie_matrix, user_similarity_df, top_n=10)
    item_preds = recommend_item_based(user_id, user_movie_matrix, item_similarity_df, top_n=10)

    combined = pd.concat([user_preds, item_preds]).groupby(level=0).mean()
    return combined.sort_values(ascending=False).head(top_n)

print(recommend_hybrid(1, user_movie_matrix, user_similarity_df, item_similarity_df))

193579    5.0
193581    5.0
191005    5.0
63239     5.0
193583    5.0
dtype: float64


In [None]:
import pandas as pd

# Load MovieLens data
ratings = pd.read_csv('/root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1/ratings.csv')  # userId, movieId, rating, timestamp
movies = pd.read_csv('/root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1/movies.csv')   # movieId, title, genres

# Explore
print(ratings.head())
print(ratings.info())
print(f"Users: {ratings['userId'].nunique()}, Movies: {ratings['movieId'].nunique()}")
print(ratings['rating'].describe())

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

user_similarity = cosine_similarity(user_movie_matrix)

user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

def recommend_user_based(user_id, user_movie_matrix, user_similarity_df, top_n=5):
    #  similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]  # Exclude self
    similar_users_ids = similar_users.index[:10]  #  10 similar users

    # Get movies rated by similar users but not by target user
    user_ratings = user_movie_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index

    #  ratings as weighted average of similar users' ratings
    predictions = {}
    for movie in unrated_movies:
        sim_scores = user_similarity_df.loc[similar_users_ids, user_id]
        movie_ratings = user_movie_matrix.loc[similar_users_ids, movie]
        weighted_sum = np.dot(sim_scores, movie_ratings)
        sim_sum = sim_scores.sum()
        if sim_sum > 0:
            predictions[movie] = weighted_sum / sim_sum

    # Sort and return top N
    return pd.Series(predictions).sort_values(ascending=False).head(top_n)

print(recommend_user_based(1, user_movie_matrix, user_similarity_df))

from sklearn.model_selection import train_test_split

train, test = train_test_split(ratings, test_size=0.2, random_state=42)
train_matrix = train.pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_matrix = test.pivot(index='userId', columns='movieId', values='rating').fillna(0)

#  evaluation (precision@k)
def precision_at_k(user_id, k=5):
    preds = recommend_user_based(user_id, train_matrix, user_similarity_df, top_n=k)
    actual = test_matrix.loc[user_id]
    actual_positive = actual[actual > 2].index
    pred_positive = preds.index
    hits = len(set(pred_positive) & set(actual_positive))
    return hits / k

print(f"Precision@5 for User 1: {precision_at_k(1)}")

# item similarity
item_similarity = cosine_similarity(user_movie_matrix.T)  # Transpose for items
item_similarity_df = pd.DataFrame(item_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

def recommend_item_based(user_id, user_movie_matrix, item_similarity_df, top_n=5):
    user_ratings = user_movie_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index

    predictions = {}
    for movie in user_movie_matrix.columns:
        if movie not in rated_movies:
            sim_scores = item_similarity_df.loc[rated_movies, movie]
            weighted_sum = np.dot(sim_scores, user_ratings[rated_movies])
            sim_sum = sim_scores.sum()
            if sim_sum > 0:
                predictions[movie] = weighted_sum / sim_sum

    return pd.Series(predictions).sort_values(ascending=False).head(top_n)

print(recommend_item_based(1, user_movie_matrix, item_similarity_df))

def recommend_hybrid(user_id, user_movie_matrix, user_similarity_df, item_similarity_df, top_n=5):
    user_preds = recommend_user_based(user_id, user_movie_matrix, user_similarity_df, top_n=10)
    item_preds = recommend_item_based(user_id, user_movie_matrix, item_similarity_df, top_n=10)

    combined = pd.concat([user_preds, item_preds]).groupby(level=0).mean()
    return combined.sort_values(ascending=False).head(top_n)

print(recommend_hybrid(1, user_movie_matrix, user_similarity_df, item_similarity_df))

In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

ratings = pd.read_csv('/root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1/ratings.csv')
movies = pd.read_csv('/root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1/movies.csv')


print(ratings.head())
print(ratings.info())
print(f"Users: {ratings['userId'].nunique()}, Movies: {ratings['movieId'].nunique()}")
print(ratings['rating'].describe())

# User-movie matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# User similarity
user_similarity = cosine_similarity(user_movie_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

# Item similarity
item_similarity = cosine_similarity(user_movie_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

# Train-test split
train, test = train_test_split(ratings, test_size=0.2, random_state=42)
train_matrix = train.pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_matrix = test.pivot(index='userId', columns='movieId', values='rating').fillna(0)

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
Users: 610, Movies: 9724
count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64


In [13]:
def recommend_user_based(user_id, user_movie_matrix, user_similarity_df, movies_df, top_n=5):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:11]
    similar_users_ids = similar_users.index

    print(f"\nTop similar users to User {user_id}:")
    for sim_user in similar_users_ids:
        top_movies = user_movie_matrix.loc[sim_user].sort_values(ascending=False).head(3)
        top_movie_titles = movies_df[movies_df['movieId'].isin(top_movies.index)]['title'].tolist()
        print(f"User {sim_user} (Similarity: {similar_users[sim_user]:.3f}): {top_movie_titles}")

    user_ratings = user_movie_matrix.loc[user_id]
    unrated_movies = user_ratings[user_ratings == 0].index

    predictions = {}
    for movie in unrated_movies:
        sim_scores = user_similarity_df.loc[similar_users_ids, user_id]
        movie_ratings = user_movie_matrix.loc[similar_users_ids, movie]
        weighted_sum = np.dot(sim_scores, movie_ratings)
        sim_sum = sim_scores.sum()
        if sim_sum > 0:
            predictions[movie] = weighted_sum / sim_sum

    top_preds = pd.Series(predictions).sort_values(ascending=False).head(top_n)
    recommended_movies = movies_df[movies_df['movieId'].isin(top_preds.index)][['movieId', 'title']]
    recommended_movies['score'] = top_preds.values
    return recommended_movies

print("\nUser-Based Recommendations for User 1:")
print(recommend_user_based(1, train_matrix, user_similarity_df, movies))


User-Based Recommendations for User 1:

Top similar users to User 1:
User 266 (Similarity: 0.357): ['Trainspotting (1996)', "Monty Python's Life of Brian (1979)", 'Clockwork Orange, A (1971)']
User 313 (Similarity: 0.352): ['Escape from New York (1981)', 'Cell, The (2000)', "Charlie's Angels (2000)"]
User 368 (Similarity: 0.345): ["Schindler's List (1993)", 'Jaws (1975)', 'Saving Private Ryan (1998)']
User 57 (Similarity: 0.345): ['Blade Runner (1982)', 'Godfather, The (1972)', 'Sting, The (1973)']
User 91 (Similarity: 0.335): ['Desperado (1995)', 'Batman (1989)', 'Godfather, The (1972)']
User 469 (Similarity: 0.331): ['Once Upon a Time in America (1984)', 'Raging Bull (1980)', 'Exorcist, The (1973)']
User 39 (Similarity: 0.330): ['Contact (1997)', 'Hard Core Logo (1996)', 'American Beauty (1999)']
User 288 (Similarity: 0.330): ['Star Wars: Episode IV - A New Hope (1977)', '2001: A Space Odyssey (1968)', 'Little Big Man (1970)']
User 452 (Similarity: 0.328): ['Abyss, The (1989)', 'Rus

In [14]:
def recommend_item_based(user_id, user_movie_matrix, item_similarity_df, movies_df, top_n=5):
    user_ratings = user_movie_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index

    print(f"\nUser {user_id}'s Top Rated Movies:")
    rated_titles = movies_df[movies_df['movieId'].isin(rated_movies)]['title'].tolist()
    print(rated_titles[:5])

    predictions = {}
    for movie in user_movie_matrix.columns:
        if movie not in rated_movies:
            sim_scores = item_similarity_df.loc[rated_movies, movie]
            weighted_sum = np.dot(sim_scores, user_ratings[rated_movies])
            sim_sum = sim_scores.sum()
            if sim_sum > 0:
                predictions[movie] = weighted_sum / sim_sum

    top_preds = pd.Series(predictions).sort_values(ascending=False).head(top_n)
    recommended_movies = movies_df[movies_df['movieId'].isin(top_preds.index)][['movieId', 'title']]
    recommended_movies['score'] = top_preds.values
    return recommended_movies

print("\nItem-Based Recommendations for User 1:")
print(recommend_item_based(1, train_matrix, item_similarity_df, movies))


Item-Based Recommendations for User 1:

User 1's Top Rated Movies:
['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)', 'Seven (a.k.a. Se7en) (1995)', 'Usual Suspects, The (1995)']
      movieId                                    title  score
9173   148978          Blue Exorcist: The Movie (2012)    5.0
9732   193565                Gintama: The Movie (2010)    5.0
9734   193571                      Silver Spoon (2014)    5.0
9735   193573  Love Live! The School Idol Movie (2015)    5.0
9740   193587      Bungo Stray Dogs: Dead Apple (2018)    5.0


In [15]:
def recommend_hybrid(user_id, user_movie_matrix, user_similarity_df, item_similarity_df, movies_df, top_n=5):
    user_preds = recommend_user_based(user_id, user_movie_matrix, user_similarity_df, movies_df, top_n=10)
    item_preds = recommend_item_based(user_id, user_movie_matrix, item_similarity_df, movies_df, top_n=10)

    # Combine predictions
    combined = pd.concat([user_preds.set_index('movieId')['score'],
                          item_preds.set_index('movieId')['score']], axis=1).mean(axis=1)
    top_combined = combined.sort_values(ascending=False).head(top_n)

    recommended_movies = movies_df[movies_df['movieId'].isin(top_combined.index)][['movieId', 'title']]
    recommended_movies['score'] = top_combined.values
    return recommended_movies

print("\nHybrid Recommendations for User 1:")
print(recommend_hybrid(1, train_matrix, user_similarity_df, item_similarity_df, movies))


Hybrid Recommendations for User 1:

Top similar users to User 1:
User 266 (Similarity: 0.357): ['Trainspotting (1996)', "Monty Python's Life of Brian (1979)", 'Clockwork Orange, A (1971)']
User 313 (Similarity: 0.352): ['Escape from New York (1981)', 'Cell, The (2000)', "Charlie's Angels (2000)"]
User 368 (Similarity: 0.345): ["Schindler's List (1993)", 'Jaws (1975)', 'Saving Private Ryan (1998)']
User 57 (Similarity: 0.345): ['Blade Runner (1982)', 'Godfather, The (1972)', 'Sting, The (1973)']
User 91 (Similarity: 0.335): ['Desperado (1995)', 'Batman (1989)', 'Godfather, The (1972)']
User 469 (Similarity: 0.331): ['Once Upon a Time in America (1984)', 'Raging Bull (1980)', 'Exorcist, The (1973)']
User 39 (Similarity: 0.330): ['Contact (1997)', 'Hard Core Logo (1996)', 'American Beauty (1999)']
User 288 (Similarity: 0.330): ['Star Wars: Episode IV - A New Hope (1977)', '2001: A Space Odyssey (1968)', 'Little Big Man (1970)']
User 452 (Similarity: 0.328): ['Abyss, The (1989)', 'Rush Ho

In [16]:
def evaluate_recommendations(user_id, method, k=5, threshold=2.5):
    # Get recommendations based on method
    if method == 'user':
        preds = recommend_user_based(user_id, train_matrix, user_similarity_df, movies, top_n=k)
    elif method == 'item':
        preds = recommend_item_based(user_id, train_matrix, item_similarity_df, movies, top_n=k)
    else:  # hybrid
        preds = recommend_hybrid(user_id, train_matrix, user_similarity_df, item_similarity_df, movies, top_n=k)

    pred_movies = preds['movieId'].tolist()

    # Actual relevant movies from test set (rating > threshold)
    actual = test_matrix.loc[user_id]
    actual_positive = actual[actual > threshold].index.tolist()
    actual_positive_count = len(actual_positive)

    # Precision: Correct predictions / Total predicted
    hits = len(set(pred_movies) & set(actual_positive))
    precision = hits / k if k > 0 else 0

    # Recall: Correct predictions / Total relevant
    recall = hits / actual_positive_count if actual_positive_count > 0 else 0

    return precision, recall

# Test for User 1
for method in ['user', 'item', 'hybrid']:
    precision, recall = evaluate_recommendations(1, method, k=5)
    print(f"\n{method.capitalize()}-Based Metrics for User 1:")
    print(f"Precision@5: {precision:.3f}")
    print(f"Recall@5: {recall:.3f}")


Top similar users to User 1:
User 266 (Similarity: 0.357): ['Trainspotting (1996)', "Monty Python's Life of Brian (1979)", 'Clockwork Orange, A (1971)']
User 313 (Similarity: 0.352): ['Escape from New York (1981)', 'Cell, The (2000)', "Charlie's Angels (2000)"]
User 368 (Similarity: 0.345): ["Schindler's List (1993)", 'Jaws (1975)', 'Saving Private Ryan (1998)']
User 57 (Similarity: 0.345): ['Blade Runner (1982)', 'Godfather, The (1972)', 'Sting, The (1973)']
User 91 (Similarity: 0.335): ['Desperado (1995)', 'Batman (1989)', 'Godfather, The (1972)']
User 469 (Similarity: 0.331): ['Once Upon a Time in America (1984)', 'Raging Bull (1980)', 'Exorcist, The (1973)']
User 39 (Similarity: 0.330): ['Contact (1997)', 'Hard Core Logo (1996)', 'American Beauty (1999)']
User 288 (Similarity: 0.330): ['Star Wars: Episode IV - A New Hope (1977)', '2001: A Space Odyssey (1968)', 'Little Big Man (1970)']
User 452 (Similarity: 0.328): ['Abyss, The (1989)', 'Rush Hour (1998)', 'Ronin (1998)']
User 45 