In [1]:
import pandas as pd

# Load the processed dataset. The relative path uses forward slashes, which
# resolve on Windows and POSIX alike and keep backslash escape sequences out
# of the string literal.
data = pd.read_csv("../data/processed_data.csv")

In [2]:
# Display the loaded DataFrame (pandas abbreviates long/wide frames in the rendered output).
data

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,title,release_date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,195,241,3,881250949,49,0,20,415,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,304,241,5,886307828,23,0,14,690,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
2,5,241,4,883268170,42,0,6,758,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
3,233,241,4,891033261,60,0,15,707,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
4,62,241,3,875747190,31,0,11,542,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,862,1678,3,889289491,17,0,18,461,B. Monkey (1998),06-Feb-1998,...,0,0,0,0,0,1,0,1,0,0
99996,862,1677,1,889289570,17,0,18,461,Mat' i syn (1997),06-Feb-1998,...,0,0,0,0,0,0,0,0,0,0
99997,862,1679,2,889289570,17,0,18,461,Sliding Doors (1998),01-Jan-1998,...,0,0,0,0,0,1,0,0,0,0
99998,895,1680,3,887160722,28,0,20,653,You So Crazy (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0


In [3]:
import numpy as np
import pandas as pd
import os

# -------------------------------
# Assume 'data' is already loaded as your merged DataFrame.
# The DataFrame contains columns like "user_id", "movie_id", "rating",
# "gender", "age", "occupation", "zip_code", and genre columns.
# -------------------------------

# -------------------------------
# Prepare Train and Test Data
# -------------------------------
from sklearn.model_selection import train_test_split

def prepare_train_test_data(data):
    """Split the merged DataFrame into train and test partitions.

    The rows of ``data`` are split 80/20 with ``train_test_split`` using a
    fixed ``random_state`` of 42, so repeated calls on the same input give
    the same partition. No columns are selected or dropped here.

    Args:
        data: The merged DataFrame to split.

    Returns:
        A ``(train_data, test_data)`` tuple.
    """
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    return train_data, test_data

# Only the held-out split is used below; the training split is discarded.
_, test_data = prepare_train_test_data(data)

# -------------------------------
# Test results frame with random recommendation scores
# -------------------------------
# test_data is assumed to hold one row per (user, movie, rating).
# The genre columns are listed here because they are kept for evaluation.
feature_genre_columns = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Seed NumPy's global generator first so both score columns are reproducible.
np.random.seed(50)

# For this demo the test split itself is the candidate recommendation set:
# every row is a movie the user rated. Work on a copy and, in place of model
# predictions, attach two independently drawn random score columns. Keyword
# arguments are evaluated left to right, so the "ground truth gender" scores
# are drawn before the "flipped gender" scores.
test_results = test_data.copy().assign(
    predicted_rating_correct_gender=np.random.rand(len(test_data)),
    predicted_rating_flipped_gender=np.random.rand(len(test_data)),
)

# -------------------------------
# DCG and NDCG Calculation Functions
# -------------------------------
def dcg_at_k(relevances, k=None):
    """Compute the Discounted Cumulative Gain (DCG) of a ranked relevance list.

    Each position contributes an exponential gain ``2**rel - 1`` divided by a
    ``log2(rank + 1)`` discount, with ranks starting at 1.

    Args:
        relevances: Sequence of relevance scores, already in ranked order.
        k: Number of leading positions to include. If None, use all elements.

    Returns:
        The DCG as a float (0.0 when there are no elements to score).
    """
    # Work in floating point: on an integer array ``2 ** relevances`` is
    # integer arithmetic, which silently wraps around for large grades and
    # raises for negative ones.
    relevances = np.asarray(relevances, dtype=float)
    if k is None:
        k = len(relevances)
    relevances = relevances[:k]
    gains = 2.0 ** relevances - 1.0
    # Positions 1..n map to log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(2, 2 + len(relevances)))
    return np.sum(gains / discounts)

def ndcg_for_user(user_df, predicted_col, k=None):
    """
    Compute NDCG for one user's rows.

    The rows are ranked by ``predicted_col`` (highest first) and the values of
    the "rating" column in that order are scored with ``dcg_at_k``. The result
    is normalised by the DCG of the same ratings sorted from highest to lowest.
    Returns 0.0 when the ideal DCG is not positive.
    """
    ratings = user_df["rating"].values

    # Ratings in the order induced by the predicted scores, best prediction first.
    ranked_ratings = user_df.sort_values(by=predicted_col, ascending=False)["rating"].values
    # Best achievable ordering: the ratings themselves in descending order.
    ideal_ratings = np.sort(ratings)[::-1]

    achieved = dcg_at_k(ranked_ratings, k)
    best_possible = dcg_at_k(ideal_ratings, k)

    if best_possible > 0:
        return achieved / best_possible
    return 0.0

# -------------------------------
# Genre-based Evaluation: Calculate Per-User NDCG
# -------------------------------
# Genre indicator columns evaluated one at a time below.
genre_columns = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# Map each genre name to the test rows whose indicator for that genre equals 1.
genre_test_results = dict()
for genre in genre_columns:
    genre_test_results[genre] = test_results.loc[test_results[genre].eq(1)]

# Make sure the folder that receives the per-genre CSV files exists.
output_dir = "results_top_n_genre/Random_2"
os.makedirs(output_dir, exist_ok=True)

# Loop through each genre and compute per-user NDCG for both conditions.
for genre, genre_df in genre_test_results.items():
    user_ndcg = {}
    # groupby hands over each user's rows in a single pass instead of
    # re-filtering the whole genre frame once per user; sort=False keeps the
    # users in order of first appearance.
    for user_id, user_df in genre_df.groupby("user_id", sort=False):
        ndcg_correct = ndcg_for_user(user_df, "predicted_rating_correct_gender")
        ndcg_flipped = ndcg_for_user(user_df, "predicted_rating_flipped_gender")
        user_ndcg[user_id] = {
            "ndcg_correct": ndcg_correct,
            "ndcg_flipped": ndcg_flipped
        }
    # Build the table from records with explicit columns so that a genre with
    # no test rows still yields a CSV carrying the full header.
    ndcg_df = pd.DataFrame(
        [{"user_id": u_id, **scores} for u_id, scores in user_ndcg.items()],
        columns=["user_id", "ndcg_correct", "ndcg_flipped"],
    )
    output_file = os.path.join(output_dir, f"per_user_ndcg_scores_random_{genre.lower()}.csv")
    ndcg_df.to_csv(output_file, index=False)
    print(f"Per-user NDCG scores saved for '{genre}' to: {output_file}")
    print(f"--- NDCG for {genre} ---")
    for u_id, scores in user_ndcg.items():
        print(f"User {u_id}: "
              f"NDCG (ground truth gender) = {scores['ndcg_correct']:.4f}, "
              f"NDCG (flipped gender) = {scores['ndcg_flipped']:.4f}")
    print("------\n")


Per-user NDCG scores saved for 'Action' to: results_top_n_genre/Random_2\per_user_ndcg_scores_random_action.csv
--- NDCG for Action ---
User 258: NDCG (ground truth gender) = 0.8340, NDCG (flipped gender) = 0.8340
User 652: NDCG (ground truth gender) = 0.7459, NDCG (flipped gender) = 0.6889
User 599: NDCG (ground truth gender) = 0.8630, NDCG (flipped gender) = 0.8978
User 372: NDCG (ground truth gender) = 0.8551, NDCG (flipped gender) = 0.9150
User 486: NDCG (ground truth gender) = 0.6570, NDCG (flipped gender) = 0.8547
User 386: NDCG (ground truth gender) = 0.8821, NDCG (flipped gender) = 0.7507
User 935: NDCG (ground truth gender) = 0.8064, NDCG (flipped gender) = 0.8108
User 216: NDCG (ground truth gender) = 0.6771, NDCG (flipped gender) = 0.7139
User 269: NDCG (ground truth gender) = 0.8367, NDCG (flipped gender) = 0.7428
User 301: NDCG (ground truth gender) = 1.0000, NDCG (flipped gender) = 1.0000
User 895: NDCG (ground truth gender) = 0.6707, NDCG (flipped gender) = 0.7581
User 4