<a href="https://colab.research.google.com/github/nabeelnazeer/MachineLearningAndParallel_Lab/blob/main/collaborativefilteringRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd

# Define dataset path
dataset_path = "/content/ml-100k"

# Load ratings data
ratings = pd.read_csv(
    f"{dataset_path}/u.data",
    sep="\t",
    names=["user_id", "movie_id", "rating", "timestamp"],
    engine="python"
)

# Load movie details (correcting column mismatch issues)
movies = pd.read_csv(
    f"{dataset_path}/u.item",
    sep="|",
    encoding="latin-1",
    header=None,  # Prevents pandas from treating the first row as column names
    usecols=[0, 1],  # Load only the movie_id and title columns
    names=["movie_id", "title"]  # Assign proper column names
)

# Merge datasets on movie_id
df = pd.merge(ratings, movies, on="movie_id", how="inner")

# Display first few rows
df.head()


Unnamed: 0,user_id,movie_id,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Create user-item matrix
user_item_matrix = df.pivot_table(index="user_id", columns="movie_id", values="rating").fillna(0)

# Compute cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)

# Convert to a DataFrame for easier interpretation
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

user_similarity_df.head()


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [5]:
def get_user_based_recommendations(target_user_id, num_recommendations=5):
    # Get similar users (sorted by similarity)
    similar_users = user_similarity_df[target_user_id].sort_values(ascending=False).index[1:]

    # Get movies watched by similar users
    watched_movies = set(df[df["user_id"] == target_user_id]["movie_id"])
    candidate_movies = df[df["user_id"].isin(similar_users) & ~df["movie_id"].isin(watched_movies)]

    # Aggregate movie ratings
    recommended_movies = candidate_movies.groupby("movie_id")["rating"].mean().sort_values(ascending=False).head(num_recommendations)

    # Return recommended movie titles
    return movies[movies["movie_id"].isin(recommended_movies.index)][["movie_id", "title"]]

# Get recommendations for user 10
get_user_based_recommendations(target_user_id=10)


Unnamed: 0,movie_id,title
1188,1189,Prefontaine (1997)
1200,1201,Marlene Dietrich: Shadow and Light (1996)
1292,1293,Star Kid (1997)
1466,1467,"Saint of Fort Washington, The (1993)"
1598,1599,Someone Else's America (1995)


In [6]:
# Create item-user matrix
item_user_matrix = df.pivot_table(index="movie_id", columns="user_id", values="rating").fillna(0)

# Compute cosine similarity between items
item_similarity = cosine_similarity(item_user_matrix)

# Convert to a DataFrame
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

item_similarity_df.head()


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
2,0.402382,1.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
3,0.330245,0.273069,1.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
4,0.454938,0.502571,0.324866,1.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
5,0.286714,0.318836,0.212957,0.334239,1.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211


In [7]:
def get_similar_movies(movie_id, num_recommendations=5):
    similar_movies = item_similarity_df[movie_id].sort_values(ascending=False).index[1:num_recommendations+1]
    return movies[movies["movie_id"].isin(similar_movies)][["movie_id", "title"]]

# Find movies similar to "Star Wars (1977)"
star_wars_id = movies[movies["title"].str.contains("Star Wars")].iloc[0]["movie_id"]
get_similar_movies(star_wars_id)


Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
126,127,"Godfather, The (1972)"
171,172,"Empire Strikes Back, The (1980)"
173,174,Raiders of the Lost Ark (1981)
180,181,Return of the Jedi (1983)


In [12]:
def hybrid_recommendations(user_id, movie_id, num_recommendations=5):
    # Get user-based recommendations
    user_recommendations = get_user_based_recommendations(user_id, num_recommendations).set_index("movie_id")

    # Get item-based recommendations
    item_recommendations = get_similar_movies(movie_id, num_recommendations).set_index("movie_id")

    # Merge recommendations (outer join to keep all results)
    hybrid = user_recommendations.join(item_recommendations, how="outer", lsuffix="_user", rsuffix="_item")

    # Fill NaN values with empty strings
    hybrid.fillna("", inplace=True)

    # Sort by movie_id and reset index
    return hybrid.sort_index().reset_index()

# Example: Get hybrid recommendations for user 10 and Star Wars (1977)
star_wars_id = 50  # Replace with the actual movie_id for Star Wars (1977)
hybrid_recommendations(10, star_wars_id)


Unnamed: 0,movie_id,title_user,title_item
0,1,,Toy Story (1995)
1,127,,"Godfather, The (1972)"
2,172,,"Empire Strikes Back, The (1980)"
3,174,,Raiders of the Lost Ark (1981)
4,181,,Return of the Jedi (1983)
5,1189,Prefontaine (1997),
6,1201,Marlene Dietrich: Shadow and Light (1996),
7,1293,Star Kid (1997),
8,1467,"Saint of Fort Washington, The (1993)",
9,1599,Someone Else's America (1995),


In [10]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

# Function to evaluate recommendations
def evaluate_recommendations(true_ratings, predicted_ratings):
    # Align indices to only compare movies that exist in both actual and predicted sets
    common_movies = true_ratings.index.intersection(predicted_ratings.index)
    if len(common_movies) == 0:
        return {"Precision": 0, "Recall": 0, "F1-score": 0}

    true_labels = (true_ratings.loc[common_movies] >= 4).astype(int)
    predicted_labels = (predicted_ratings.loc[common_movies] >= 4).astype(int)

    precision = precision_score(true_labels, predicted_labels, zero_division=1)
    recall = recall_score(true_labels, predicted_labels, zero_division=1)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)

    return {"Precision": precision, "Recall": recall, "F1-score": f1}

# Get actual ratings for user 10
actual_ratings = df[df["user_id"] == 10].set_index("movie_id")["rating"]

# Simulated predictions (ensure movie IDs exist in actual ratings)
predicted_ratings = pd.Series([4.5, 3.8, 4.2], index=[1, 50, 100])

# Evaluate
evaluate_recommendations(actual_ratings, predicted_ratings)


{'Precision': 1.0,
 'Recall': 0.6666666666666666,
 'F1-score': 0.7999999952000001}