In [1]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from irt import Beta3

# ------------------ 1) Configuration ------------------
# Every genre that the pipeline iterates over; each one is used to build
# the glob pattern for the per-user NDCG score files.
genres = [
    "action",
    "adventure",
    "animation",
    "comedy",
    "crime",
    "documentary",
    "drama",
    "family",
    "fantasy",
    "history",
    "horror",
    "music",
    "mystery",
    "romance",
    "sci-fi",
    "thriller",
    "war",
    "western",
]

# Root folder holding one subfolder per algorithm (inputs).
base_dir = os.path.abspath("../data/results_top_n_genre")

# Output locations: plots, merged per-user CSVs, and fitted abilities.
output_plot_dir = os.path.abspath("../data/plots")
output_csv_dir = os.path.abspath("../data/genre_user_info")
output_ability_dir = os.path.abspath("../data/abilityies")

# Make sure every output folder exists before anything is written to it.
for _out_dir in (output_plot_dir, output_csv_dir, output_ability_dir):
    os.makedirs(_out_dir, exist_ok=True)

# ------------------ 2) Function Definitions ------------------
def compute_sts_matrix_from_csv(csv_path, scale=10.0):
    """
    Read a per-user NDCG CSV and compute the STS score for every row.

    The CSV must contain the columns:
      - user_id
      - ndcg_correct
      - ndcg_flipped

    The score is computed element-wise in float32 as:
      STS = 1 - scale * abs(ndcg_correct - ndcg_flipped)

    Note: with the default ``scale`` of 10, any row whose two NDCG values
    differ by more than 0.1 produces a negative STS value; no clipping is
    applied here.

    Args:
        csv_path: Path of the CSV file to read.
        scale: Multiplier applied to the absolute NDCG difference. The
            default (10) reproduces the historical behaviour of this
            function.

    Returns:
        A tuple ``(sts_array, user_ids)`` where ``sts_array`` is a 1-D
        float32 NumPy array (one value per CSV row) and ``user_ids`` is the
        ``user_id`` column as a pandas Series, in the same row order.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Convert NDCG scores to float32 torch tensors
    ndcg_correct = torch.tensor(df["ndcg_correct"].values, dtype=torch.float32)
    ndcg_flipped = torch.tensor(df["ndcg_flipped"].values, dtype=torch.float32)

    user_ids_array = df["user_id"]

    # Compute STS: 1 - scale * abs(ndcg_correct - ndcg_flipped)
    sts_array = 1 - scale * torch.abs(ndcg_correct - ndcg_flipped)

    # STS values as a NumPy array, plus the user IDs in matching row order
    return sts_array.numpy(), user_ids_array


def loss_function(b4, df_matrix):
    """
    Mean absolute error between the ICC prediction and the observed value.

    For every cell (i, j) of ``df_matrix`` the prediction is
    ``ICC_function(b4.abilities[j], b4.difficulties[i], b4.discriminations[i])``,
    i.e. rows index items/users and columns index respondents/models.
    ``df_matrix`` is accessed through ``.iloc``, so it must be a pandas
    DataFrame (or offer the same indexer).

    Returns the mean of all absolute differences.
    """
    abs_errors = [
        abs(
            ICC_function(
                b4.abilities[col],        # ability of respondent/model `col`
                b4.difficulties[row],     # difficulty of item/user `row`
                b4.discriminations[row],  # discrimination of item/user `row`
            )
            - df_matrix.iloc[row, col]    # observed STS value
        )
        for row in range(df_matrix.shape[0])
        for col in range(df_matrix.shape[1])
    ]
    return np.mean(abs_errors)


def ICC_function(abilities, difficulties, discriminations):
    """
    Item characteristic curve (ICC) evaluated for the given parameters.

    Computes
        1 / (1 + (((1 - ability) / ability) * (difficulty / (1 - difficulty))) ** discrimination)

    According to the original note this is the curve used by Beta3; that is
    not verifiable from this file. The arguments are only combined with
    arithmetic operators, so scalars or element-wise array types both work.
    No guarding is done for ``abilities == 0`` or ``difficulties == 1``.
    """
    ability_ratio = (1 - abilities) / abilities
    difficulty_ratio = difficulties / (1 - difficulties)
    powered = (ability_ratio * difficulty_ratio) ** discriminations
    return 1 / (powered + 1)


# ------------------ 3) Process Each Genre ------------------
# Needed only for the legend patches; imported once instead of on every
# loop iteration.
import matplotlib.patches as mpatches

# Colour per gender value. Both the numeric encoding (0 = male, 1 = female)
# and the raw "M"/"F" labels are accepted, because the numeric mapping of
# the user-info file is not applied in this script.
gender_colors = {0: "red", "M": "red", 1: "blue", "F": "blue"}
unknown_gender_color = "gray"  # users without a (recognised) gender value

# User metadata is identical for every genre, so it is loaded lazily on
# first use and then reused.
user_info = None

for target_genre in genres:
    print(f"\nProcessing genre: {target_genre}")

    # ------------------ 4) Build the File Pattern ------------------
    pattern = os.path.join(base_dir, "*", f"per_user_ndcg_scores_*_{target_genre.lower()}.csv")
    # glob returns files in arbitrary, platform-dependent order. Sorting
    # (case-insensitively) fixes the column order of the STS matrix and
    # therefore the row order of the saved abilities.
    csv_files = sorted(glob.glob(pattern), key=str.lower)

    if not csv_files:
        print(f"⚠️ No CSV files found for genre '{target_genre}', skipping...")
        continue

    print(f"Found {len(csv_files)} CSV files for genre '{target_genre}'.")

    # ------------------ 5) Compute STS from CSV ------------------
    list_of_sts_arrays = []
    # User IDs of the first file; every later file must list exactly the
    # same users in the same order. `None` means "no file read yet".
    user_ids_array = None

    for csv_file in csv_files:
        sts_array, current_user_ids = compute_sts_matrix_from_csv(csv_file)

        if user_ids_array is None:
            user_ids_array = current_user_ids
        elif not user_ids_array.equals(current_user_ids):
            raise ValueError(f"User ID mismatch in {csv_file}")

        list_of_sts_arrays.append(sts_array)
        print(f"Processed {csv_file}, STS array shape: {sts_array.shape}")

    # ------------------ 6) Combine STS Arrays into One Matrix ------------------
    # One row per user, one column per CSV file (in `csv_files` order).
    final_matrix = np.vstack(list_of_sts_arrays).T  # shape: (num_users, num_models)
    print(f"Final STS matrix shape for '{target_genre}': {final_matrix.shape}")

    normalized_df = pd.DataFrame(final_matrix)

    # ------------------ 7) Beta3 IRT Pipeline ------------------
    subjects = normalized_df.shape[1]  # number of models/respondents
    items = normalized_df.shape[0]  # number of users/items

    # Initialize and run Beta3
    b4 = Beta3(
        learning_rate=10,
        epochs=5000,
        n_respondents=subjects,
        n_items=items,
        n_workers=-1,
        random_seed=1,
    )

    print(f"Fitting Beta3 model for '{target_genre}'...")
    b4.fit(normalized_df.values)
    print(f"Model fitting complete for '{target_genre}'.")

    loss = loss_function(b4, normalized_df)
    print(f"Final loss for '{target_genre}': {loss}")

    # ------------------ 8) Load and Merge User Data ------------------
    user_ids_ordered = user_ids_array.tolist()  # Extract user IDs in order
    if user_info is None:
        # This file must contain at least the columns `user_id` and `gender`.
        user_info = pd.read_csv(os.path.abspath("../data/user_info_existing.csv"))

    user_beta3_results = pd.DataFrame({
        "user_id": user_ids_ordered,
        "discrimination": b4.discriminations,
        "difficulty": b4.difficulties
    })

    # Left merge keeps every user of the STS matrix; users missing from
    # `user_info` end up with NaN in the metadata columns.
    merged_df = user_beta3_results.merge(user_info, on="user_id", how="left")

    # ------------------ 9) Plot Results ------------------
    disc_values = np.array(merged_df["discrimination"])
    difficulty_values = np.array(merged_df["difficulty"])
    # Male = Red, Female = Blue, anything else (e.g. NaN) = Gray
    colors = [
        gender_colors.get(gender, unknown_gender_color)
        for gender in merged_df["gender"]
    ]

    plt.figure(figsize=(12, 7))
    plt.scatter(disc_values, difficulty_values, c=colors, alpha=0.7)
    plt.title(f"Discrimination vs. Difficulty ({target_genre.capitalize()}, Colored by Gender)")
    plt.xlabel("Discrimination")
    plt.ylabel("Difficulty")
    plt.grid(alpha=0.3)

    legend_handles = [
        mpatches.Patch(color="red", label="Male"),
        mpatches.Patch(color="blue", label="Female"),
    ]
    # Only mention the fallback colour when it actually appears in the plot.
    if unknown_gender_color in colors:
        legend_handles.append(mpatches.Patch(color=unknown_gender_color, label="Unknown"))
    plt.legend(handles=legend_handles)

    output_plot_path = os.path.join(output_plot_dir, f"discrimination_difficulty_plot_{target_genre}.png")
    plt.savefig(output_plot_path, dpi=300, bbox_inches="tight")
    plt.close()

    # ------------------ 10) Save Merged Data ------------------
    output_csv_path = os.path.join(output_csv_dir, f"genre_user_info_merged_{target_genre}.csv")
    merged_df.to_csv(output_csv_path, index=False)

    output_ability_path = os.path.join(output_ability_dir, f"ability_{target_genre}.csv")

    # One ability per model/respondent; rows follow the order of `csv_files`.
    df_abilities = pd.DataFrame(b4.abilities)
    df_abilities.to_csv(output_ability_path, index=False)

    print(f"✅ Processed '{target_genre}': Plot saved at {output_plot_path}, Data saved at {output_csv_path}")

print("\n🎉 All genres processed successfully!")



Processing genre: sci-fi
Found 7 CSV files for genre 'sci-fi'.
Processed D:\master_reserch\data\results_top_n_genre\algo_1\per_user_ndcg_scores_algo_1_sci-fi.csv, STS array shape: (699,)
Processed D:\master_reserch\data\results_top_n_genre\algo_2\per_user_ndcg_scores_algo_2_sci-fi.csv, STS array shape: (699,)
Processed D:\master_reserch\data\results_top_n_genre\CKE\per_user_ndcg_scores_CKE_sci-fi.csv, STS array shape: (699,)
Processed D:\master_reserch\data\results_top_n_genre\KGAT\per_user_ndcg_scores_KGAT_sci-fi.csv, STS array shape: (699,)


  user_ids_array = pd.Series()


Processed D:\master_reserch\data\results_top_n_genre\KGCN\per_user_ndcg_scores_KGCN_sci-fi.csv, STS array shape: (699,)
Processed D:\master_reserch\data\results_top_n_genre\KGIN\per_user_ndcg_scores_KGIN_sci-fi.csv, STS array shape: (699,)
Processed D:\master_reserch\data\results_top_n_genre\NCFKG\per_user_ndcg_scores_NCFKG_sci-fi.csv, STS array shape: (699,)
Final STS matrix shape for 'sci-fi': (699, 7)
Fitting Beta3 model for 'sci-fi'...
Model fitting complete for 'sci-fi'.
Final loss for 'sci-fi': 0.059869773347893375
✅ Processed 'sci-fi': Plot saved at D:\master_reserch\data\plots\discrimination_difficulty_plot_sci-fi.png, Data saved at D:\master_reserch\data\genre_user_info\genre_user_info_merged_sci-fi.csv

🎉 All genres processed successfully!
