In [1]:
import pandas as pd

data = pd.read_csv("..\..\data\processed_data.csv")

In [2]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,title,release_date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,195,241,3,881250949,49,0,20,415,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,304,241,5,886307828,23,0,14,690,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
2,5,241,4,883268170,42,0,6,758,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
3,233,241,4,891033261,60,0,15,707,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
4,62,241,3,875747190,31,0,11,542,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,862,1678,3,889289491,17,0,18,461,B. Monkey (1998),06-Feb-1998,...,0,0,0,0,0,1,0,1,0,0
99996,862,1677,1,889289570,17,0,18,461,Mat' i syn (1997),06-Feb-1998,...,0,0,0,0,0,0,0,0,0,0
99997,862,1679,2,889289570,17,0,18,461,Sliding Doors (1998),01-Jan-1998,...,0,0,0,0,0,1,0,0,0,0
99998,895,1680,3,887160722,28,0,20,653,You So Crazy (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0


In [3]:
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

# -------------------------------
# Extended Knowledge Graph Construction
# -------------------------------
def build_extended_knowledge_graph(data, num_users, num_movies, genre_list):
    kg = {}
    
    # 1. Add user–movie interactions.
    for _, row in data.iterrows():
        user = row["user_id"]
        movie = row["movie_id"]
        kg.setdefault(user, []).append(movie)
        kg.setdefault(movie, []).append(user)
    
    # 2. Add movie–genre relationships.
    # Each genre in genre_list will be assigned a unique node id.
    genre_node_ids = {genre: num_users + num_movies + idx for idx, genre in enumerate(genre_list)}
    for _, row in data.iterrows():
        movie = row["movie_id"]
        for genre in genre_list:
            if row[genre] == 1:
                g_id = genre_node_ids[genre]
                kg.setdefault(movie, []).append(g_id)
                kg.setdefault(g_id, []).append(movie)
    
    # 3. Add user demographic information.
    # a. Gender nodes.
    gender_values = data["gender"].unique()  # e.g. ['M', 'F']
    base_demo = num_users + num_movies + len(genre_list)
    gender_node_ids = {g: base_demo + idx for idx, g in enumerate(gender_values)}
    for _, row in data.iterrows():
        user = row["user_id"]
        g_val = row["gender"]
        g_node = gender_node_ids[g_val]
        kg.setdefault(user, []).append(g_node)
        kg.setdefault(g_node, []).append(user)
    
    # b. Age nodes.
    age_values = data["age"].unique()
    base_age = base_demo + len(gender_node_ids)
    age_node_ids = {age: base_age + idx for idx, age in enumerate(age_values)}
    for _, row in data.iterrows():
        user = row["user_id"]
        age_val = row["age"]
        a_node = age_node_ids[age_val]
        kg.setdefault(user, []).append(a_node)
        kg.setdefault(a_node, []).append(user)
    
    # c. Occupation nodes.
    occ_values = data["occupation"].unique()
    base_occ = base_age + len(age_node_ids)
    occ_node_ids = {occ: base_occ + idx for idx, occ in enumerate(occ_values)}
    for _, row in data.iterrows():
        user = row["user_id"]
        occ_val = row["occupation"]
        o_node = occ_node_ids[occ_val]
        kg.setdefault(user, []).append(o_node)
        kg.setdefault(o_node, []).append(user)
    
    # d. Zip code nodes.
    zip_values = data["zip_code"].unique()
    base_zip = base_occ + len(occ_node_ids)
    zip_node_ids = {z: base_zip + idx for idx, z in enumerate(zip_values)}
    for _, row in data.iterrows():
        user = row["user_id"]
        zip_val = row["zip_code"]
        z_node = zip_node_ids[zip_val]
        kg.setdefault(user, []).append(z_node)
        kg.setdefault(z_node, []).append(user)
    
    total_entities = base_zip + len(zip_node_ids)
    return kg, genre_node_ids, gender_node_ids, age_node_ids, occ_node_ids, zip_node_ids, total_entities

# -------------------------------
# Assume your merged DataFrame 'data' contains the following columns:
# "user_id", "movie_id", "rating", "gender", "age", "occupation", "zip_code",
# and genre columns such as "unknown", "Action", "Adventure", etc.
# -------------------------------
# For the knowledge graph we include all genres (here we use the full list, including "unknown").
genre_list_kg = ["unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                 "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
                 "Romance", "Sci-Fi", "Thriller", "War", "Western"]

num_users = data["user_id"].nunique()
num_movies = data["movie_id"].nunique()

kg, genre_node_ids, gender_node_ids, age_node_ids, occ_node_ids, zip_node_ids, total_entities = \
    build_extended_knowledge_graph(data, num_users, num_movies, genre_list_kg)

# -------------------------------
# Prepare Train and Test Data (Including Extra Features)
# -------------------------------
def prepare_train_test_data(data):
    # For extra input features we use 18 genre columns (excluding "unknown")
    genre_columns = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                     "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
                     "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
    
    train_data, test_data = train_test_split(data, test_size=0.5, random_state=42)
    
    train_users = train_data["user_id"].values
    train_movies = train_data["movie_id"].values
    train_gender = train_data["gender"].values
    train_age = train_data["age"].values
    train_occupation = train_data["occupation"].values
    train_zip = train_data["zip_code"].values
    train_labels = train_data["rating"].values
    train_genres = train_data[genre_columns].values

    test_users = test_data["user_id"].values
    test_movies = test_data["movie_id"].values
    test_gender = test_data["gender"].values
    test_age = test_data["age"].values
    test_occupation = test_data["occupation"].values
    test_zip = test_data["zip_code"].values
    test_labels = test_data["rating"].values
    test_genres = test_data[genre_columns].values

    return (train_users, train_movies, train_gender, train_age, train_occupation, train_zip, train_labels, train_genres,
            test_users, test_movies, test_gender, test_age, test_occupation, test_zip, test_labels, test_genres)

(train_users, train_movies, train_gender, train_age, train_occupation, train_zip, train_labels, train_genres,
 test_users, test_movies, test_gender, test_age, test_occupation, test_zip, test_labels, test_genres) = prepare_train_test_data(data)

# -------------------------------
# Define the CKE Module (Using Extended Entity Space)
# -------------------------------
class CKE(torch.nn.Module):
    def __init__(self, num_entities, embedding_dim, kg, num_neighbors=10):
        super(CKE, self).__init__()
        self.entity_embedding = torch.nn.Embedding(num_entities, embedding_dim)
        self.relation_embedding = torch.nn.Embedding(1, embedding_dim)
        self.kg = kg
        self.num_neighbors = num_neighbors
        self.linear = torch.nn.Linear(embedding_dim, embedding_dim)
        
        torch.nn.init.xavier_uniform_(self.entity_embedding.weight)
        torch.nn.init.xavier_uniform_(self.relation_embedding.weight)
        torch.nn.init.xavier_uniform_(self.linear.weight)
        
    def aggregate_neighbors(self, entity_ids):
        agg_list = []
        for entity in entity_ids:
            entity_int = entity.item()
            neighbors = self.kg.get(entity_int, [])
            if len(neighbors) == 0:
                agg_list.append(torch.zeros(self.entity_embedding.embedding_dim, device=entity_ids.device))
            else:
                if len(neighbors) > self.num_neighbors:
                    sampled = np.random.choice(neighbors, self.num_neighbors, replace=False)
                else:
                    sampled = neighbors
                sampled_tensor = torch.tensor(sampled, dtype=torch.long, device=entity_ids.device)
                neighbor_embeds = self.entity_embedding(sampled_tensor)
                relation_embeds = self.relation_embedding(torch.zeros(len(sampled_tensor), dtype=torch.long, device=entity_ids.device))
                neighbor_translated = neighbor_embeds - relation_embeds
                agg_list.append(neighbor_translated.mean(dim=0))
        return torch.stack(agg_list)
    
    def forward(self, entity_ids):
        entity_embeds = self.entity_embedding(entity_ids)
        aggregated = self.aggregate_neighbors(entity_ids)
        combined = entity_embeds + self.linear(aggregated)
        return combined

# -------------------------------
# Define the Full Model with CKE and Extra Features
# -------------------------------
class FullModel(torch.nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim, kg, num_genres_feature, num_entities):
        super(FullModel, self).__init__()
        self.num_users = num_users
        self.user_embedding = torch.nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = torch.nn.Embedding(num_movies, embedding_dim)
        # CKE uses the full extended entity space (users, movies, genres, demographics)
        self.cke = CKE(num_entities, embedding_dim, kg, num_neighbors=10)
        # Input dimension:
        # 4 * embedding_dim from CF and CKE for user and movie +
        # 4 extra demographic features (gender, age, occupation, zip) +
        # num_genres_feature extra genre features.
        self.fc1 = torch.nn.Linear(embedding_dim * 4 + 4 + num_genres_feature, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, 1)
        
    def forward(self, user_ids, movie_ids, gender, age, occupation, zip_code, movie_genres):
        user_cf = self.user_embedding(user_ids)
        movie_cf = self.movie_embedding(movie_ids)
        user_cke = self.cke(user_ids)
        # For movies, offset the id by num_users to index into the full embedding table.
        movie_cke = self.cke(movie_ids + self.num_users)
        concat = torch.cat([
            user_cf, movie_cf, user_cke, movie_cke,
            gender.unsqueeze(1), age.unsqueeze(1),
            occupation.unsqueeze(1), zip_code.unsqueeze(1),
            movie_genres
        ], dim=1)
        x = F.relu(self.fc1(concat))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

embedding_dim = 16
# For extra features, we use 18 genre columns (excluding "unknown")
num_genres_feature = train_genres.shape[1]  # Should be 18

model = FullModel(num_users, num_movies, embedding_dim, kg, num_genres_feature, total_entities)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# -------------------------------
# Convert Data to PyTorch Tensors
# -------------------------------
train_users = torch.tensor(train_users, dtype=torch.long)
train_movies = torch.tensor(train_movies, dtype=torch.long)
train_gender = torch.tensor(train_gender, dtype=torch.float)
train_age = torch.tensor(train_age, dtype=torch.float)
train_occupation = torch.tensor(train_occupation, dtype=torch.float)
train_zip = torch.tensor(train_zip, dtype=torch.float)
train_labels = torch.tensor(train_labels, dtype=torch.float)
train_genres = torch.tensor(train_genres, dtype=torch.float)

test_users = torch.tensor(test_users, dtype=torch.long)
test_movies = torch.tensor(test_movies, dtype=torch.long)
test_gender = torch.tensor(test_gender, dtype=torch.float)
test_age = torch.tensor(test_age, dtype=torch.float)
test_occupation = torch.tensor(test_occupation, dtype=torch.float)
test_zip = torch.tensor(test_zip, dtype=torch.float)
test_labels = torch.tensor(test_labels, dtype=torch.float)
test_genres = torch.tensor(test_genres, dtype=torch.float)

# -------------------------------
# Create DataLoader for Training
# -------------------------------
train_dataset = TensorDataset(train_users, train_movies, train_gender, train_age, train_occupation, train_zip, train_labels, train_genres)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# -------------------------------
# Train the Model
# -------------------------------
for epoch in range(10):
    model.train()
    for batch in train_loader:
        user_ids, movie_ids, gender, age, occupation, zip_code, labels, genres = batch
        optimizer.zero_grad()
        outputs = model(user_ids, movie_ids, gender, age, occupation, zip_code, genres)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# -------------------------------
# Evaluate the Model
# -------------------------------
model.eval()
with torch.no_grad():
    test_outputs = model(test_users, test_movies, test_gender, test_age, test_occupation, test_zip, test_genres)
    test_loss = criterion(test_outputs.squeeze(), test_labels)
    print(f"Test Loss (MSE): {test_loss.item():.4f}")

# -------------------------------
# Get Predictions for Analysis (Correct and Flipped Gender)
# -------------------------------
with torch.no_grad():
    predictions_correct = model(test_users, test_movies, test_gender, test_age, test_occupation, test_zip, test_genres)
    flipped_gender = 1 - test_gender  # Flip gender: 0->1 and 1->0.
    predictions_flipped = model(test_users, test_movies, flipped_gender, test_age, test_occupation, test_zip, test_genres)

test_results = pd.DataFrame({
    "user_id": test_users.numpy(),
    "movie_id": test_movies.numpy(),
    "actual_rating": test_labels.numpy(),
    "predicted_rating_correct_gender": predictions_correct.squeeze().numpy(),
    "predicted_rating_flipped_gender": predictions_flipped.squeeze().numpy()
})

# Add movie genre columns (extra features) to test_results.
feature_genre_columns = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                           "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
                           "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
df_genres = pd.DataFrame(test_genres.numpy(), columns=feature_genre_columns)
test_results = pd.concat([test_results, df_genres], axis=1)

print(test_results.head())

# -------------------------------
# Function to Get Top-N Recommendations for a Given User
# -------------------------------
def get_top_n_recommendations(user_id, n, results_df):
    user_results = results_df[results_df["user_id"] == user_id]
    if user_results.empty:
        print(f"No candidate movies found for user {user_id}.")
        return pd.DataFrame()
    top_n = user_results.sort_values(by="predicted_rating_correct_gender", ascending=False).head(n)
    return top_n

# Example: Get Top 5 Movie Recommendations for User 0.
top_movies = get_top_n_recommendations(user_id=0, n=5, results_df=test_results)
print("Top 5 recommended movies for user 0:")
print(top_movies)


Epoch 1, Loss: 1.0903459787368774
Epoch 2, Loss: 0.9513126611709595
Epoch 3, Loss: 0.8335803747177124
Epoch 4, Loss: 1.0314879417419434
Epoch 5, Loss: 0.730201780796051
Epoch 6, Loss: 0.8112560510635376
Epoch 7, Loss: 0.9730459451675415
Epoch 8, Loss: 0.7234387993812561
Epoch 9, Loss: 0.7365900278091431
Epoch 10, Loss: 1.010167121887207
Test Loss (MSE): 0.9216
   user_id  movie_id  actual_rating  predicted_rating_correct_gender  \
0       57       247            4.0                         3.800942   
1      363       324            4.0                         3.399367   
2      258       404            3.0                         3.592514   
3      497       521            3.0                         3.676121   
4      279       101            5.0                         3.189294   

   predicted_rating_flipped_gender  Action  Adventure  Animation  Children's  \
0                         3.841762     0.0        0.0        0.0         0.0   
1                         3.448565     0.0  

In [4]:
import numpy as np
import pandas as pd
import os

# Helper function to compute Discounted Cumulative Gain (DCG)
def dcg_at_k(relevances, k=None):
    """Compute DCG for a list of relevance scores.
    If k is None, use all elements."""
    if k is None:
        k = len(relevances)
    relevances = np.array(relevances)[:k]
    # Compute gains: you can use 2^r - 1 (exponential gain) or simply r.
    gains = 2 ** relevances - 1
    # Compute discount factors: log2(rank+1) where rank starts at 1
    discounts = np.log2(np.arange(2, 2 + len(relevances)))
    return np.sum(gains / discounts)

# Function to compute NDCG for one user given a particular prediction column
def ndcg_for_user(user_df, predicted_col, k=None):
    """
    Given a user's DataFrame (with actual ratings and predictions),
    compute NDCG based on sorting by the predicted column.
    """
    # Sort the user data by predicted rating (descending order)
    user_df_sorted = user_df.sort_values(by=predicted_col, ascending=False)
    actual_relevances = user_df_sorted["actual_rating"].values
    dcg = dcg_at_k(actual_relevances, k)
    
    # Compute the ideal DCG by sorting actual ratings in descending order
    ideal_relevances = np.sort(user_df["actual_rating"].values)[::-1]
    idcg = dcg_at_k(ideal_relevances, k)
    
    # If the ideal DCG is zero, return zero to avoid division by zero.
    return dcg / idcg if idcg > 0 else 0.0

# Calculate NDCG for each user in the test set
user_ndcg = {}
for user in test_results_popular["user_id"].unique():
    user_df = test_results_popular[test_results_popular["user_id"] == user]
    ndcg_correct = ndcg_for_user(user_df, "predicted_rating_correct_gender")
    ndcg_flipped = ndcg_for_user(user_df, "predicted_rating_flipped_gender")
    user_ndcg[user] = {"ndcg_correct": ndcg_correct, "ndcg_flipped": ndcg_flipped}

# Convert the dictionary to a DataFrame
ndcg_df = pd.DataFrame.from_dict(user_ndcg, orient="index").reset_index()
ndcg_df = ndcg_df.rename(columns={"index": "user_id"})

# Ensure the "results_top_n" directory exists
output_dir = "results_top_n"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "per_user_ndcg_scores_CKE_populer.csv")

# Save the DataFrame to CSV in the results_top_n directory
ndcg_df.to_csv(output_file, index=False)
print(f"Per-user NDCG scores saved to {output_file}")

# Optionally, print NDCG for each user to the console
print("Per-user NDCG scores:")
for user, scores in user_ndcg.items():
    print(f"User {user}: NDCG (ground truth gender) = {scores['ndcg_correct']:.4f}, NDCG (flipped gender) = {scores['ndcg_flipped']:.4f}")


NameError: name 'test_results_popular' is not defined

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F

# Function to compute Discounted Cumulative Gain (DCG)
def compute_dcg(relevance_scores):
    """
    Computes DCG given a list of relevance scores.
    DCG = sum(rel_i / log2(i + 1)), where rel_i is the rating at position i (1-based index)
    """
    return np.sum((2**relevance_scores - 1) / np.log2(np.arange(1, len(relevance_scores) + 1) + 1))

# Function to compute Normalized Discounted Cumulative Gain (NDCG)
def compute_ndcg(actual_ratings, predicted_ratings, top_n):
    """
    Computes NDCG for a given set of actual and predicted ratings.
    """
    # Get top-N predicted movie indices (sorted by predicted rating)
    top_n_pred_indices = np.argsort(predicted_ratings)[::-1][:top_n]
    
    # Get top-N actual movie ratings sorted in the ideal order
    ideal_ratings = np.sort(actual_ratings)[::-1][:top_n]

    # Retrieve the corresponding actual ratings for the predicted top-N movies
    predicted_ratings_sorted = actual_ratings[top_n_pred_indices]

    # Compute DCG and IDCG
    dcg = compute_dcg(predicted_ratings_sorted)
    idcg = compute_dcg(ideal_ratings)

    # Avoid division by zero
    return dcg / idcg if idcg > 0 else 0

# Function to evaluate NDCG over all users
def evaluate_ndcg(test_results, top_n=10):
    """
    Computes the average NDCG over all users in the test set.
    """
    ndcg_scores = []

    for user_id in test_results["user_id"].unique():
        user_results = test_results[test_results["user_id"] == user_id]

        # If user has fewer than top_n movies, skip
        if len(user_results) < top_n:
            continue

        actual_ratings = user_results["actual_rating"].values
        predicted_ratings = user_results["predicted_rating_correct_gender"].values

        ndcg_score = compute_ndcg(actual_ratings, predicted_ratings, top_n)
        ndcg_scores.append(ndcg_score)

    return np.mean(ndcg_scores) if ndcg_scores else 0

# Compute and print the NDCG score
ndcg_value = evaluate_ndcg(test_results, top_n=10)
print(f"Average NDCG@10: {ndcg_value:.4f}")


In [None]:
import pandas as pd
import numpy as np

# -------------------------------
# Step 9: Create Popular and Unpopular Movie Test Datasets
# -------------------------------

# Compute movie popularity based on the number of ratings
movie_popularity = data.groupby("movie_id")["rating"].count().reset_index()
movie_popularity = movie_popularity.rename(columns={"rating": "num_ratings"})

# Define threshold for popularity (median as the split point)
popularity_threshold = movie_popularity["num_ratings"].median()

# Identify popular and unpopular movies
popular_movies = movie_popularity[movie_popularity["num_ratings"] >= popularity_threshold]["movie_id"]
unpopular_movies = movie_popularity[movie_popularity["num_ratings"] < popularity_threshold]["movie_id"]

# Create new test datasets
test_results_popular = test_results[test_results["movie_id"].isin(popular_movies)]
test_results_unpopular = test_results[test_results["movie_id"].isin(unpopular_movies)]



In [None]:
import numpy as np
import pandas as pd
import os

def reciprocal_rank_at_k(relevances, k=None):
    """
    Compute the reciprocal rank for a list of actual relevance scores.
    The first occurrence of the maximum relevance is considered the correct item.
    """
    if k is None:
        k = len(relevances)
    relevances = np.array(relevances)[:k]
    max_rel = np.max(relevances)
    for idx, rel in enumerate(relevances, start=1):
        if rel == max_rel:
            return 1.0 / idx
    return 0.0

def mrr_for_user(user_df, predicted_col, k=None):
    """
    Given a user's DataFrame (with actual ratings and predictions),
    compute MRR based on sorting by the predicted column.
    """
    sorted_df = user_df.sort_values(by=predicted_col, ascending=False)
    actual_relevances = sorted_df["actual_rating"].values
    return reciprocal_rank_at_k(actual_relevances, k)

# Calculate MRR for each user using the specified prediction columns
user_mrr = {}
for user in genre_test_results["Sci-Fi"]["user_id"].unique():
    user_df = genre_test_results["Sci-Fi"][genre_test_results["Sci-Fi"]["user_id"] == user]
    mrr_correct = mrr_for_user(user_df, "predicted_rating_correct_gender")
    mrr_flipped = mrr_for_user(user_df, "predicted_rating_flipped_gender")
    user_mrr[user] = {"mrr_correct": mrr_correct, "mrr_flipped": mrr_flipped}

# Convert the results into a DataFrame
mrr_df = pd.DataFrame.from_dict(user_mrr, orient="index").reset_index()
mrr_df = mrr_df.rename(columns={"index": "user_id"})

# Ensure the output directory exists and save the results
output_dir = "results_top_n_mrr/unpopular"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "per_user_mrr_scores_CKE_unpopular.csv")
mrr_df.to_csv(output_file, index=False)
print(f"Per-user MRR scores saved to {output_file}")

# Optionally, print MRR for each user to the console
for user, scores in user_mrr.items():
    print(f"User {user}: MRR (ground truth) = {scores['mrr_correct']:.4f}, MRR (flipped) = {scores['mrr_flipped']:.4f}")


In [None]:
# Define the genre columns as they appear in your original data
genre_columns = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Create a DataFrame with unique movie genre information by dropping duplicates
movie_genres = data[['movie_id'] + genre_columns].drop_duplicates(subset='movie_id')

# Merge the unique movie genres into test_results using movie_id as the key
test_results_with_genre = test_results.merge(movie_genres, on='movie_id', how='left')

print(test_results)


In [8]:
# Assuming test_result is your prediction DataFrame with genre columns

# Define the genre columns you want to split by (excluding "unknown" if not needed)
genre_columns = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Create a dictionary to hold the filtered DataFrames for each genre
genre_test_results = {}

for genre in genre_columns:
    # Filter rows where the genre flag is 1 (i.e., the movie belongs to that genre)
    genre_test_results[genre] = test_results[test_results[genre] == 1]

# Example: access the test results for Action movies
test_result_action = genre_test_results["Adventure"]


In [9]:
import numpy as np
import pandas as pd
import os

# Helper function to compute Discounted Cumulative Gain (DCG)
def dcg_at_k(relevances, k=None):
    """Compute DCG for a list of relevance scores.
    If k is None, use all elements."""
    if k is None:
        k = len(relevances)
    relevances = np.array(relevances)[:k]

    # Compute gains: you can use 2^r - 1 (exponential gain) or simply r.
    gains = 2 ** relevances - 1

    # Compute discount factors: log2(rank+1), where rank starts at 1
    discounts = np.log2(np.arange(2, 2 + len(relevances)))
    return np.sum(gains / discounts)

# Function to compute NDCG for one user given a particular prediction column
def ndcg_for_user(user_df, predicted_col, k=None):
    """
    Given a user's DataFrame (with actual ratings and predictions),
    compute NDCG based on sorting by the predicted column.
    """
    # Sort by predicted rating in descending order
    user_df_sorted = user_df.sort_values(by=predicted_col, ascending=False)
    actual_relevances = user_df_sorted["actual_rating"].values
    dcg = dcg_at_k(actual_relevances, k)

    # Compute the ideal DCG by sorting actual ratings in descending order
    ideal_relevances = np.sort(user_df["actual_rating"].values)[::-1]
    idcg = dcg_at_k(ideal_relevances, k)

    # Avoid division by zero if IDCG is 0
    return dcg / idcg if idcg > 0 else 0.0


 
    genre_columns = [
         "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
         "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
         "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    
    genre_test_results = {}
    for genre in genre_columns:
         genre_test_results[genre] = test_results[test_results[genre] == 1]


# Directory for saving the results
output_dir = "results_top_n"
os.makedirs(output_dir, exist_ok=True)

# Loop through each genre and compute per-user NDCG
for genre, genre_df in genre_test_results.items():
    user_ndcg = {}

    # Iterate over each user in this particular genre
    for user_id in genre_df["user_id"].unique():
        user_df = genre_df[genre_df["user_id"] == user_id]

        # Calculate NDCG for correct vs. flipped gender predictions
        ndcg_correct = ndcg_for_user(user_df, "predicted_rating_correct_gender")
        ndcg_flipped = ndcg_for_user(user_df, "predicted_rating_flipped_gender")

        user_ndcg[user_id] = {
            "ndcg_correct": ndcg_correct,
            "ndcg_flipped": ndcg_flipped
        }

    # Convert user_ndcg to a DataFrame
    ndcg_df = pd.DataFrame.from_dict(user_ndcg, orient="index").reset_index()
    ndcg_df.rename(columns={"index": "user_id"}, inplace=True)

    # Construct an output filename that includes the genre name
    output_dir = "results_top_n_genre/CKE"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(
        output_dir, f"per_user_ndcg_scores_CKE_{genre.lower()}.csv"
    )
    ndcg_df.to_csv(output_file, index=False)
    print(f"Per-user NDCG scores saved for '{genre}' to: {output_file}")

    # (Optional) Print some results to the console
    print(f"--- NDCG for {genre} ---")
    for u_id, scores in user_ndcg.items():
        print(f"User {u_id}: "
              f"NDCG (ground truth gender) = {scores['ndcg_correct']:.4f}, "
              f"NDCG (flipped gender) = {scores['ndcg_flipped']:.4f}")
    print("------\n")


Per-user NDCG scores saved for 'Action' to: results_top_n_genre/CKE\per_user_ndcg_scores_CKE_action.csv
--- NDCG for Action ---
User 258: NDCG (ground truth gender) = 0.9274, NDCG (flipped gender) = 0.9988
User 652: NDCG (ground truth gender) = 0.8762, NDCG (flipped gender) = 0.8820
User 599: NDCG (ground truth gender) = 0.8817, NDCG (flipped gender) = 0.8995
User 372: NDCG (ground truth gender) = 0.9006, NDCG (flipped gender) = 0.9022
User 486: NDCG (ground truth gender) = 0.8649, NDCG (flipped gender) = 0.8699
User 386: NDCG (ground truth gender) = 0.9299, NDCG (flipped gender) = 0.9330
User 935: NDCG (ground truth gender) = 0.8322, NDCG (flipped gender) = 0.8365
User 216: NDCG (ground truth gender) = 0.7647, NDCG (flipped gender) = 0.7639
User 269: NDCG (ground truth gender) = 0.9133, NDCG (flipped gender) = 0.9113
User 301: NDCG (ground truth gender) = 0.6755, NDCG (flipped gender) = 0.6805
User 895: NDCG (ground truth gender) = 0.9708, NDCG (flipped gender) = 0.9719
User 449: NDCG