In [19]:
import os
import pandas as pd
from scipy.stats import ttest_rel

In [9]:
# Define paths to the folders
path_similar_pair = 'data/statistical_test/video_software_conet'
path_dissimilar_pair = 'data/statistical_test/music_software_conet'

# Define the file name
file_user_id_mapping = 'user_id_mapping.txt'

# Construct full paths to the files
file_path_similar = os.path.join(path_similar_pair, file_user_id_mapping)
file_path_dissimilar = os.path.join(path_dissimilar_pair, file_user_id_mapping)

# Read the user_id_mapping.txt files into DataFrames
df_similar = pd.read_csv(file_path_similar, header=None, names=['new_id', 'original_id'])
df_dissimilar = pd.read_csv(file_path_dissimilar, header=None, names=['new_id', 'original_id'])

# Drop duplicates
df_similar.drop_duplicates(inplace=True)
df_dissimilar.drop_duplicates(inplace=True)

# Filter for overlapping original_ids
df_similar_filtered = df_similar[df_similar['original_id'].isin(df_dissimilar['original_id'])]
df_dissimilar_filtered = df_dissimilar[df_dissimilar['original_id'].isin(df_similar['original_id'])]

In [21]:
# Define filenames to HR, MRR, and NDCG per user
file_hr = 'user_hits.txt'
file_mrr = 'user_mrrs.txt'
file_ndcg = 'user_ndcg_at_10.txt'

# Construct full paths to the files
file_path_hr_similar = os.path.join(path_similar_pair, file_hr)
file_path_mrr_similar = os.path.join(path_similar_pair, file_mrr)
file_path_ndcg_similar = os.path.join(path_similar_pair, file_ndcg)

file_path_hr_dissimilar = os.path.join(path_dissimilar_pair, file_hr)
file_path_mrr_dissimilar = os.path.join(path_dissimilar_pair, file_mrr)
file_path_ndcg_dissimilar = os.path.join(path_dissimilar_pair, file_ndcg)

# Read the HR, MRR, and NDCG files into DataFrames
df_hr_similar = pd.read_csv(file_path_hr_similar, header=None, names=['hits'])
df_mrr_similar = pd.read_csv(file_path_mrr_similar, header=None, names=['mrr'])
df_ndcg_similar = pd.read_csv(file_path_ndcg_similar, header=None, names=['ndcg'])

df_hr_dissimilar = pd.read_csv(file_path_hr_dissimilar, header=None, names=['hits'])
df_mrr_dissimilar = pd.read_csv(file_path_mrr_dissimilar, header=None, names=['mrr'])
df_ndcg_dissimilar = pd.read_csv(file_path_ndcg_dissimilar, header=None, names=['ndcg'])

# Add new_id column based on the index
df_hr_similar['new_id'] = df_hr_similar.index + 1
df_mrr_similar['new_id'] = df_mrr_similar.index + 1
df_ndcg_similar['new_id'] = df_ndcg_similar.index + 1

df_hr_dissimilar['new_id'] = df_hr_dissimilar.index + 1
df_mrr_dissimilar['new_id'] = df_mrr_dissimilar.index + 1
df_ndcg_dissimilar['new_id'] = df_ndcg_dissimilar.index + 1

# Filter for overlapping new_ids
df_hr_similar_filtered = df_hr_similar[df_hr_similar['new_id'].isin(df_similar_filtered['new_id'])]
df_mrr_similar_filtered = df_mrr_similar[df_mrr_similar['new_id'].isin(df_similar_filtered['new_id'])]
df_ndcg_similar_filtered = df_ndcg_similar[df_ndcg_similar['new_id'].isin(df_similar_filtered['new_id'])]

df_hr_dissimilar_filtered = df_hr_dissimilar[df_hr_dissimilar['new_id'].isin(df_dissimilar_filtered['new_id'])]
df_mrr_dissimilar_filtered = df_mrr_dissimilar[df_mrr_dissimilar['new_id'].isin(df_dissimilar_filtered['new_id'])]
df_ndcg_dissimilar_filtered = df_ndcg_dissimilar[df_ndcg_dissimilar['new_id'].isin(df_dissimilar_filtered['new_id'])]

# Turn new_id into original_id
df_hr_similar_filtered = df_hr_similar_filtered.merge(df_similar_filtered, on='new_id', how='left')
df_mrr_similar_filtered = df_mrr_similar_filtered.merge(df_similar_filtered, on='new_id', how='left')
df_ndcg_similar_filtered = df_ndcg_similar_filtered.merge(df_similar_filtered, on='new_id', how='left')

df_hr_dissimilar_filtered = df_hr_dissimilar_filtered.merge(df_dissimilar_filtered, on='new_id', how='left')
df_mrr_dissimilar_filtered = df_mrr_dissimilar_filtered.merge(df_dissimilar_filtered, on='new_id', how='left')
df_ndcg_dissimilar_filtered = df_ndcg_dissimilar_filtered.merge(df_dissimilar_filtered, on='new_id', how='left')

# Ensure alignment by user_id
df_hr_similar_filtered.set_index('original_id', inplace=True)
df_hr_dissimilar_filtered.set_index('original_id', inplace=True)

df_mrr_similar_filtered.set_index('original_id', inplace=True)
df_mrr_dissimilar_filtered.set_index('original_id', inplace=True)

df_ndcg_similar_filtered.set_index('original_id', inplace=True)
df_ndcg_dissimilar_filtered.set_index('original_id', inplace=True)

In [22]:

# Perform paired t-tests
t_stat_hr, p_value_hr = ttest_rel(df_hr_similar_filtered['hits'], df_hr_dissimilar_filtered['hits'])
t_stat_mrr, p_value_mrr = ttest_rel(df_mrr_similar_filtered['mrr'], df_mrr_dissimilar_filtered['mrr'])
t_stat_ndcg, p_value_ndcg = ttest_rel(df_ndcg_similar_filtered['ndcg'], df_ndcg_dissimilar_filtered['ndcg'])

# Print the results
print(f"HR Paired t-test: t-statistic = {t_stat_hr}, p-value = {p_value_hr}")
print(f"MRR Paired t-test: t-statistic = {t_stat_mrr}, p-value = {p_value_mrr}")
print(f"NDCG Paired t-test: t-statistic = {t_stat_ndcg}, p-value = {p_value_ndcg}")

HR Paired t-test: t-statistic = -1.6101552808487936, p-value = 0.10736987980745498
MRR Paired t-test: t-statistic = -2.0717928443720592, p-value = 0.03828955995992485
NDCG Paired t-test: t-statistic = -2.056392303283855, p-value = 0.039749572302553146
