# Case Study 5-1 – Identifying Similar Users in a Streaming Platform

This notebook walks through building a simple user–user collaborative filtering example on a toy movie ratings dataset.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

# Apply a notebook-wide plotting theme (white grid background, "notebook" scaling).
# `sns.set` is only a legacy alias for `sns.set_theme`; call the preferred name directly.
sns.set_theme(style="whitegrid", context="notebook")

In [None]:
# Toy ratings data, written as one (user_id, movie_id, rating) record per rating
# so each row can be read and checked on its own.
rating_columns = ("user_id", "movie_id", "rating")
rating_records = [
    (1, 101, 5), (1, 102, 3), (1, 103, 4),
    (2, 101, 4), (2, 102, 2), (2, 104, 5),
    (3, 101, 1), (3, 103, 2), (3, 104, 2),
    (4, 102, 5), (4, 103, 4), (4, 104, 4),
    (5, 101, 4), (5, 102, 4), (5, 103, 3),
]

# Regroup the records into a column-oriented dictionary (column name -> list of values).
data = {
    column: list(values)
    for column, values in zip(rating_columns, zip(*rating_records))
}

# Build the long-format ratings table: one row per (user, movie) rating.
ratings_df = pd.DataFrame(data)

# Show the raw ratings
ratings_df

In [None]:
# Reshape the long ratings table into a wide user–item matrix:
# one row per user_id, one column per movie_id, cell = that user's rating.
# pivot_table aggregates duplicate (user, movie) pairs with its default aggfunc (mean).
user_item_matrix = pd.pivot_table(
    ratings_df,
    values="rating",
    index="user_id",
    columns="movie_id",
    fill_value=0,  # a movie the user has not rated is stored as 0
)

# Display the matrix
user_item_matrix

In [None]:
# Normalize each user's rating vector by mean-centring it over the movies
# that user actually rated.
#
# Why not StandardScaler: it standardizes each *column* (movie), not each row
# (user), and the 0 placeholders for unrated movies would be counted as real
# ratings when computing each movie's variance. For user–user similarity we
# instead want to remove each user's own rating bias (generous vs. harsh raters).
#
# Assumption: a 0 in user_item_matrix means "not rated" (the pivot fills missing
# ratings with 0 and the ratings in this notebook are all >= 1). A genuine
# rating of 0 would be treated as missing here.
rated_mask = user_item_matrix.ne(0)

# Hide the placeholders so they do not drag the per-user mean down.
observed_ratings = user_item_matrix.where(rated_mask)

# Mean of each user's observed ratings (NaN entries are skipped).
user_mean_rating = observed_ratings.mean(axis=1)

# Subtract every user's own mean from their ratings; unrated movies go back to 0
# so that they contribute nothing to a dot product / cosine similarity.
normalized_df = observed_ratings.sub(user_mean_rating, axis=0).fillna(0.0)

# Plain NumPy view of the same values, kept under its original name.
normalized_matrix = normalized_df.to_numpy()

normalized_df

In [None]:
# Pairwise cosine similarity between the rows (users) of normalized_df.
# Entry (i, j) of the result is the similarity between user i and user j.
similarity_matrix = cosine_similarity(normalized_df)

# Label both axes with the user IDs so entries can be looked up by user.
user_ids = user_item_matrix.index
similarity_df = pd.DataFrame(similarity_matrix, index=user_ids, columns=user_ids)

# Show the similarity table
similarity_df

In [None]:
# Find the 3 users most similar to a chosen target user (user 1).
target_user = 1

# Take the target user's row of similarity scores and remove the entry for the
# target user themself, leaving only scores against the other users.
target_similarities = similarity_df.loc[target_user].drop(index=target_user)

# Rank the remaining users from most to least similar and keep the first three.
top_similar_users = (
    target_similarities
    .sort_values(ascending=False)
    .head(3)
)

top_similar_users

In [None]:
# Draw the user–user similarity matrix as an annotated heatmap on an explicit
# figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 5))
heatmap = sns.heatmap(
    similarity_df,
    ax=ax,
    cmap="viridis",
    annot=True,   # print each similarity value inside its cell
    fmt=".2f",    # two decimal places
)

# Title and axis labels so the figure can be read on its own.
ax.set_title("User–User Cosine Similarity")
ax.set_xlabel("User ID")
ax.set_ylabel("User ID")

# Tighten the layout, write the figure to a PNG file, then display it.
fig.tight_layout()
fig.savefig("user_similarity_heatmap.png", dpi=300)
plt.show()