In [None]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from irt import Beta3

# ------------------ 1) Configuration ------------------
# Recommender algorithms to process; each name is also the name of a
# sub-folder under `base_dir`.
algos = [
    "Random",
    "Random_2",
    "CKE",
    "KGAT",
    "KGCN",
    "KGIN",
    "NCFKG",
]

# Lower-case genre keywords; a result file is used when its name contains
# one of them.  A smaller subset tried earlier was:
#   romance, thriller, action, comedy, drama, mystery, war, sci-fi
genres_to_use = [
    "action",
    "adventure",
    "animation",
    "children",
    "comedy",
    "crime",
    "documentary",
    "drama",
    "fantasy",
    "filmnoir",
    "horror",
    "musical",
    "mystery",
    "romance",
    "sci-fi",
    "thriller",
    "war",
    "western",
]

# Folder that holds one sub-folder of per-genre result CSVs per algorithm.
base_dir = os.path.abspath("../data/genre_wise/results_top_n_genre")

# Root folder for the per-algorithm outputs; created up front if missing.
_data_root = os.path.abspath("../data")
algo_wise_dir = os.path.join(_data_root, "algo_wise/run_2_version2")
os.makedirs(algo_wise_dir, exist_ok=True)

# ------------------ 2) Function Definitions ------------------
def compute_sts_matrix_from_csv(csv_path, user_ids_order=None, alpha=0.9):
    """
    Compute per-user STS values from one per-user NDCG CSV file.

    The CSV must contain the columns:
      - user_id
      - ndcg_correct
      - ndcg_flipped

    The score is computed in float32 as:
      STS = (1 - abs(ndcg_correct - ndcg_flipped)) ** alpha

    Args:
      csv_path: path of the CSV file to read.
      user_ids_order: optional sequence of user_ids. When given, only those
        users are kept and the rows are re-ordered to follow this sequence.
        A user_id that is absent from the file raises KeyError.
      alpha: exponent applied to the similarity term (default 0.9, the value
        previously hard-coded here).

    Returns:
      sts_array: the STS values as a float32 NumPy array, one per kept row.
      user_ids: the "user_id" column (pandas Series) of the kept rows, in the
        same order as sts_array.
    """
    df = pd.read_csv(csv_path)
    if user_ids_order is not None:
        # .loc with a list both filters to the requested users and imposes
        # their order, so no separate isin() pass is needed.
        df = df.set_index("user_id").loc[list(user_ids_order)].reset_index()

    ndcg_correct = torch.tensor(df["ndcg_correct"].values, dtype=torch.float32)
    ndcg_flipped = torch.tensor(df["ndcg_flipped"].values, dtype=torch.float32)

    # For NDCG values in [0, 1] the base is already non-negative; the clamp
    # only guards against out-of-range input, where a fractional exponent
    # would otherwise produce NaN.
    similarity = torch.clamp(1 - torch.abs(ndcg_correct - ndcg_flipped), min=0.0)
    sts_array = similarity ** alpha

    return sts_array.numpy(), df["user_id"]

def loss_function(b4, df_matrix):
    """
    Mean absolute difference between the predicted P(i, j) and the observed
    value in df_matrix.

    Args:
      b4: fitted model exposing `abilities` (indexed by column / respondent)
        and `difficulties` / `discriminations` (indexed by row / item).
      df_matrix: 2-D table of observed values, rows = items, columns =
        respondents. A pandas DataFrame or a NumPy array is accepted.

    Returns:
      The mean of |ICC_function(...) - observed| over every cell.
    """
    observed = np.asarray(df_matrix, dtype=float)
    n_items, n_respondents = observed.shape

    # Lay the parameters out so that broadcasting yields an
    # (n_items, n_respondents) grid: abilities vary along columns, item
    # parameters along rows.  This replaces the per-cell Python loop.
    abilities = np.asarray(b4.abilities).reshape(-1)[:n_respondents].reshape(1, -1)
    difficulties = np.asarray(b4.difficulties).reshape(-1)[:n_items].reshape(-1, 1)
    discriminations = (
        np.asarray(b4.discriminations).reshape(-1)[:n_items].reshape(-1, 1)
    )

    predicted = ICC_function(abilities, difficulties, discriminations)
    return np.mean(np.abs(predicted - observed))

def ICC_function(abilities, difficulties, discriminations):
    """
    Item characteristic curve (ICC) of the Beta3 model.

    Evaluates
      1 / (1 + (((1 - ability) / ability) * (difficulty / (1 - difficulty)))
               ** discrimination)
    using plain arithmetic operators, so scalars and array-like operands that
    support those operators both work.
    """
    ability_term = (1 - abilities) / abilities
    difficulty_term = difficulties / (1 - difficulties)
    powered = (ability_term * difficulty_term) ** discriminations
    return 1 / (powered + 1)

# ------------------ 3) Process Each Algorithm ------------------
# For every algorithm folder: collect the per-genre NDCG CSVs, build the
# (common users x genre files) STS matrix, fit a Beta3 IRT model on it, then
# save a discrimination/difficulty scatter plot, the merged per-user table and
# the per-file ability estimates.

# Needed for the legend of every plot; resolved once rather than per iteration.
import matplotlib.patches as mpatches

# The user-info table does not depend on the algorithm, so load it only once.
user_info_path = os.path.abspath("../data/user_info_existing.csv")
user_info = pd.read_csv(user_info_path) if os.path.exists(user_info_path) else None

for algo_name in algos:
    algo_path = os.path.join(base_dir, algo_name)
    if not os.path.isdir(algo_path):
        print(f"⚠️ Directory not found: {algo_path}")
        continue

    print(f"\nProcessing algorithm: {algo_name}")

    # Create subdirectories for this algorithm under algo_wise
    algo_base_dir = os.path.join(algo_wise_dir, algo_name)
    algo_plot_dir = os.path.join(algo_base_dir, "plots")
    algo_csv_dir = os.path.join(algo_base_dir, "genre_user_info")
    # NOTE: "abilityies" is misspelled, but the name is kept so that output
    # paths produced by earlier runs stay valid.
    algo_ability_dir = os.path.join(algo_base_dir, "abilityies")
    for out_dir in (algo_plot_dir, algo_csv_dir, algo_ability_dir):
        os.makedirs(out_dir, exist_ok=True)

    # ------------------ Load and Filter CSV Files ------------------
    pattern = os.path.join(algo_path, "per_user_ndcg_scores_*.csv")
    all_csv_files = glob.glob(pattern)

    # Keep files whose *file name* contains a target genre.  Matching against
    # the full path would let a directory name (e.g. one containing "war")
    # select every file.  Sorting fixes the respondent order, which glob does
    # not guarantee.
    csv_files = sorted(
        f for f in all_csv_files
        if any(genre in os.path.basename(f).lower() for genre in genres_to_use)
    )

    if not csv_files:
        print(f"  ⚠️ No CSV files found in algorithm '{algo_name}' for target genres, skipping...")
        continue

    print(f"  Found {len(csv_files)} CSV files in algorithm '{algo_name}' for genres {genres_to_use}.")

    # ------------------ Determine the Intersection of User IDs ------------------
    # Only the user_id column is needed here, so skip parsing the rest.
    user_id_sets = [
        set(pd.read_csv(csv_file, usecols=["user_id"])["user_id"])
        for csv_file in csv_files
    ]
    user_ids_intersection = set.intersection(*user_id_sets)

    if not user_ids_intersection:
        print(f"  ⚠️ No common user_ids found across files in algorithm '{algo_name}', skipping...")
        continue

    common_user_ids = sorted(user_ids_intersection)
    print(f"  Common users across files: {len(common_user_ids)} users")

    # ------------------ Compute STS from Each CSV (Filtered) ------------------
    list_of_sts_arrays = []
    for csv_file in csv_files:
        sts_array, _ = compute_sts_matrix_from_csv(csv_file, user_ids_order=common_user_ids)
        list_of_sts_arrays.append(sts_array)
        print(f"    Processed {csv_file}, STS array shape: {sts_array.shape}")

    # ------------------ Combine STS Arrays into One Matrix ------------------
    final_matrix = np.vstack(list_of_sts_arrays).T  # shape: (num_common_users, num_files)

    print(final_matrix)
    print(f"  Final STS matrix shape in algorithm '{algo_name}': {final_matrix.shape}")

    normalized_df = pd.DataFrame(final_matrix)

    # ------------------ Beta3 IRT Pipeline ------------------
    subjects = normalized_df.shape[1]  # number of "respondents" (one per CSV file)
    items = normalized_df.shape[0]     # number of "items" (common users)

    b4 = Beta3(
        learning_rate=10,
        epochs=5000,
        n_respondents=subjects,
        n_items=items,
        n_workers=-1,
        random_seed=1,
    )

    print(f"  Fitting Beta3 model in algorithm '{algo_name}'...")
    b4.fit(normalized_df.values)
    print(f"  Model fitting complete in algorithm '{algo_name}'.")

    loss = loss_function(b4, normalized_df)
    print(f"  Final loss in algorithm '{algo_name}': {loss}")

    # ------------------ Load and Merge User Data ------------------
    if user_info is None:
        print(f"  ⚠️ user_info_existing.csv not found at {user_info_path}, skipping merge.")
        continue

    user_beta3_results = pd.DataFrame({
        "user_id": common_user_ids,
        "discrimination": b4.discriminations,
        "difficulty": b4.difficulties
    })

    merged_df = user_beta3_results.merge(user_info, on="user_id", how="left")

    # ------------------ Plot Results ------------------
    disc_values = np.array(merged_df["discrimination"])
    difficulty_values = np.array(merged_df["difficulty"])
    user_genders = np.array(merged_df["gender"])
    # assuming 0=Male, 1=Female.  Users without a row in user_info get NaN
    # from the left merge; draw them gray instead of labelling them "Female".
    gender_missing = np.array(merged_df["gender"].isna())
    colors = np.select(
        [gender_missing, user_genders == 0],
        ["gray", "red"],
        default="blue",
    )

    plt.figure(figsize=(12, 7))
    plt.scatter(disc_values, difficulty_values, c=colors, alpha=0.7)
    plt.title(f"Discrimination vs. Difficulty (Algorithm: {algo_name}, Selected Genres)")
    plt.xlabel("Discrimination")
    plt.ylabel("Difficulty")
    plt.grid(alpha=0.3)

    legend_handles = [
        mpatches.Patch(color="red", label="Male"),
        mpatches.Patch(color="blue", label="Female"),
    ]
    if gender_missing.any():
        legend_handles.append(mpatches.Patch(color="gray", label="Unknown"))
    plt.legend(handles=legend_handles)

    output_plot_path = os.path.join(algo_plot_dir, "discrimination_difficulty_plot_selected.png")
    plt.savefig(output_plot_path, dpi=300, bbox_inches="tight")
    plt.close()

    # ------------------ Save Merged Data and Ability Scores ------------------
    output_csv_path = os.path.join(algo_csv_dir, "genre_user_info_merged_selected.csv")
    merged_df.to_csv(output_csv_path, index=False)

    output_ability_path = os.path.join(algo_ability_dir, "ability_selected.csv")
    df_abilities = pd.DataFrame(b4.abilities)
    # Label each ability with the file it was fitted on so rows can be mapped
    # back to genres; appended after the existing column(s) so readers of the
    # old layout keep working.
    if len(df_abilities) == len(csv_files):
        df_abilities["source_file"] = [os.path.basename(f) for f in csv_files]
    df_abilities.to_csv(output_ability_path, index=False)

    print(f"  ✅ Processed algorithm '{algo_name}':")
    print(f"     Plot saved at {output_plot_path}")
    print(f"     Merged data saved at {output_csv_path}")
    print(f"     Ability scores saved at {output_ability_path}")

print("\n🎉 All algorithms processed successfully!")



Processing algorithm: Random
  Found 17 CSV files in algorithm 'Random' for genres ['action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'filmnoir', 'horror', 'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western'].
  Common users across files: 20 users
    Processed D:\master_reserch\data\genre_wise\results_top_n_genre\Random\per_user_ndcg_scores_random_action.csv, STS array shape: (20,)
    Processed D:\master_reserch\data\genre_wise\results_top_n_genre\Random\per_user_ndcg_scores_random_adventure.csv, STS array shape: (20,)
    Processed D:\master_reserch\data\genre_wise\results_top_n_genre\Random\per_user_ndcg_scores_random_animation.csv, STS array shape: (20,)
    Processed D:\master_reserch\data\genre_wise\results_top_n_genre\Random\per_user_ndcg_scores_random_children's.csv, STS array shape: (20,)
    Processed D:\master_reserch\data\genre_wise\results_top_n_genre\Random\per_user_ndcg_scores_random_comedy.csv, ST

In [None]:
# Show the `abilities` attribute of `b4` as this cell's output.
# NOTE(review): `b4` is reassigned on every loop iteration above, so this is
# presumably the model of the last algorithm that was fitted - confirm.
b4.abilities