# Statistical Test Analysis

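This notebook performs two-tailed paired t-tests that compare per-user performance metrics (HR, MRR, NDCG) obtained with a more similar source domain against those obtained with a less similar source domain. Each test is paired on the overlapping users, i.e. the users that appear in the results of both source domains, and the resulting p-values are adjusted with the Benjamini-Hochberg procedure.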

## 1. Setup

### 1.1 Import Libraries

In [None]:
import os
import pandas as pd
from scipy.stats import ttest_rel
import numpy as np

## 2. Perform two-tailed paired t-test

### 2.1 Approach 1 (Reproduction):

In [None]:
# Parent directory whose immediate subfolders are scanned below
parent_directory = 'data/statistical_test'

# Paths of all directories found directly inside the parent directory
subfolders = [
    entry.path
    for entry in os.scandir(parent_directory)
    if entry.is_dir()
]

# Metric name -> file (inside each subfolder) holding that metric
metrics_files = dict(
    HR='user_hits.txt',
    MRR='user_mrrs.txt',
    NDCG='user_ndcg_at_10.txt',
)

# Target domain -> [more similar source, less similar source]
target_to_sources = dict(
    software=['videogames', 'musicalinstruments'],
    musicalinstruments=['videogames', 'software'],
    videogames=['musicalinstruments', 'software'],
)

# Function to apply Benjamini-Hochberg correction
def benjamini_hochberg_correction(p_values, alpha=0.05):
    """Compute Benjamini-Hochberg adjusted p-values.

    Parameters
    ----------
    p_values : array-like of float
        Raw p-values. NaN entries (e.g. from a degenerate t-test) are left
        out of the correction and are returned as NaN, so a single NaN no
        longer turns every adjusted value into NaN.
    alpha : float, optional
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    adjusted_p_values : numpy.ndarray
        Adjusted p-values ordered by ascending raw p-value (NaN entries
        last), capped at 1.
    sorted_indices : numpy.ndarray
        Indices that sort the input. Indexing the adjusted values with
        ``np.argsort(sorted_indices)`` restores the input order.
    """
    p_values = np.asarray(p_values, dtype=float)
    n = len(p_values)

    # argsort places NaN last, so the valid p-values form a prefix.
    sorted_indices = np.argsort(p_values)
    sorted_p_values = p_values[sorted_indices]
    n_valid = n - int(np.isnan(sorted_p_values).sum())

    adjusted_p_values = np.full(n, np.nan)
    if n_valid:
        ranks = np.arange(1, n_valid + 1)
        scaled = sorted_p_values[:n_valid] * n_valid / ranks
        # Running minimum taken from the largest p-value downwards keeps the
        # adjusted values monotone in the raw p-values.
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted_p_values[:n_valid] = np.minimum(monotone, 1.0)
    return adjusted_p_values, sorted_indices

# Initialize a list to store results
results = []

# Group folders as grouped_folders[target_domain][model][source_domain] = path
grouped_folders = {}
for folder in subfolders:
    folder_name = os.path.basename(folder)
    parts = folder_name.split('_')
    # Indices 2, 3 and 4 are read below, so at least five parts are required
    if len(parts) < 5:
        continue  # Skip invalid folder names
    source_domain, target_domain, model = parts[2], parts[3], parts[4]
    grouped_folders.setdefault(target_domain, {}).setdefault(model, {})[source_domain] = folder

# Name of the user-id mapping file read from each folder
file_user_id_mapping = 'user_id_mapping.txt'

# Iterate over target domains and models
for target_domain, models in grouped_folders.items():
    if target_domain not in target_to_sources:
        continue

    more_similar_source, less_similar_source = target_to_sources[target_domain]

    for model, source_folders in models.items():
        # Ensure both source folders are available
        if more_similar_source not in source_folders or less_similar_source not in source_folders:
            continue

        similar_folder = source_folders[more_similar_source]
        dissimilar_folder = source_folders[less_similar_source]

        # Read the user_id_mapping files (columns: new_id, original_id)
        df_similar = pd.read_csv(
            os.path.join(similar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()
        df_dissimilar = pd.read_csv(
            os.path.join(dissimilar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()

        # Keep only users whose original_id occurs in both mappings
        df_similar_filtered = df_similar[df_similar['original_id'].isin(df_dissimilar['original_id'])]
        df_dissimilar_filtered = df_dissimilar[df_dissimilar['original_id'].isin(df_similar['original_id'])]

        # Read metric files and perform paired t-tests
        for metric, file_name in metrics_files.items():
            column = metric.lower()

            # Read metric data (one value per row)
            df_metric_similar = pd.read_csv(
                os.path.join(similar_folder, file_name), header=None, names=[column])
            df_metric_dissimilar = pd.read_csv(
                os.path.join(dissimilar_folder, file_name), header=None, names=[column])

            # new_id is assigned as the 1-based row position in the metric file
            df_metric_similar['new_id'] = df_metric_similar.index + 1
            df_metric_dissimilar['new_id'] = df_metric_dissimilar.index + 1

            # Attach original_id to each score; the inner join drops users
            # that are not in the overlapping set
            scores_similar = df_metric_similar.merge(df_similar_filtered, on='new_id', how='inner')
            scores_dissimilar = df_metric_dissimilar.merge(df_dissimilar_filtered, on='new_id', how='inner')

            # ttest_rel pairs observations by position and ignores the pandas
            # index, so the two runs are joined on original_id to guarantee
            # that each row compares the same user.
            paired = scores_similar.merge(
                scores_dissimilar, on='original_id', how='inner',
                suffixes=('_similar', '_dissimilar'),
            )

            # Perform paired t-test
            t_stat, p_value = ttest_rel(
                paired[f'{column}_similar'],
                paired[f'{column}_dissimilar']
            )
            results.append({
                'model': model,
                'target': target_domain,
                'metric': metric,
                'more_similar_source': more_similar_source,
                'less_similar_source': less_similar_source,
                't_statistic': t_stat,
                'p_value': p_value
            })

# Convert results to a DataFrame; explicit columns keep the schema (and the
# 'p_value' lookup below) valid even when no test was run
results_df = pd.DataFrame(results, columns=[
    'model', 'target', 'metric',
    'more_similar_source', 'less_similar_source',
    't_statistic', 'p_value',
])

# Apply Benjamini-Hochberg correction
p_values = results_df['p_value'].to_numpy(dtype=float)
adjusted_p_values, sorted_indices = benjamini_hochberg_correction(p_values)

# Add adjusted p-values to the DataFrame (argsort of the sort order maps the
# sorted adjusted values back to the original row order)
results_df['adjusted_p_value'] = adjusted_p_values[np.argsort(sorted_indices)]

# Save the results to a CSV file
# NOTE: the other analysis cells in this notebook write to this same filename,
# so running them overwrites this output.
results_df.to_csv('paired_t_test_results_with_bh.csv', index=False)

# Display the results
print(results_df)


### 2.2 Approach 2 (Semantic Similarity):

#### Full splits

In [None]:

# Parent directory whose immediate subfolders are scanned below
parent_directory = 'data/statistical_test/own_research/full_splits'

# Paths of all directories found directly inside the parent directory
subfolders = [
    entry.path
    for entry in os.scandir(parent_directory)
    if entry.is_dir()
]

# Metric name -> file (inside each subfolder) holding that metric
metrics_files = dict(
    HR='user_hits.txt',
    MRR='user_mrrs.txt',
    NDCG='user_ndcg_at_10.txt',
)

# (target domain, [more similar source, less similar source]) comparisons
target_to_sources = [
    ('books', ['electronics', 'house']),
    ('books', ['movies', 'electronics']),
    ('books', ['movies', 'house']),
]

# Function to apply Benjamini-Hochberg correction
def benjamini_hochberg_correction(p_values, alpha=0.05):
    """Compute Benjamini-Hochberg adjusted p-values.

    Parameters
    ----------
    p_values : array-like of float
        Raw p-values. NaN entries (e.g. from a degenerate t-test) are left
        out of the correction and are returned as NaN, so a single NaN no
        longer turns every adjusted value into NaN.
    alpha : float, optional
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    adjusted_p_values : numpy.ndarray
        Adjusted p-values ordered by ascending raw p-value (NaN entries
        last), capped at 1.
    sorted_indices : numpy.ndarray
        Indices that sort the input. Indexing the adjusted values with
        ``np.argsort(sorted_indices)`` restores the input order.
    """
    p_values = np.asarray(p_values, dtype=float)
    n = len(p_values)

    # argsort places NaN last, so the valid p-values form a prefix.
    sorted_indices = np.argsort(p_values)
    sorted_p_values = p_values[sorted_indices]
    n_valid = n - int(np.isnan(sorted_p_values).sum())

    adjusted_p_values = np.full(n, np.nan)
    if n_valid:
        ranks = np.arange(1, n_valid + 1)
        scaled = sorted_p_values[:n_valid] * n_valid / ranks
        # Running minimum taken from the largest p-value downwards keeps the
        # adjusted values monotone in the raw p-values.
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted_p_values[:n_valid] = np.minimum(monotone, 1.0)
    return adjusted_p_values, sorted_indices

# Initialize a list to store results
results = []

# Group folders as grouped_folders[target_domain][model][source_domain] = path
grouped_folders = {}
for folder in subfolders:
    folder_name = os.path.basename(folder)
    parts = folder_name.split('_')
    # Indices 2, 3 and 4 are read below, so at least five parts are required
    if len(parts) < 5:
        continue  # Skip invalid folder names
    source_domain, target_domain, model = parts[3], parts[2], parts[4]
    grouped_folders.setdefault(target_domain, {}).setdefault(model, {})[source_domain] = folder

# Name of the user-id mapping file read from each folder
file_user_id_mapping = 'user_id_mapping.txt'

# Iterate over each target-source pair combination
for target_domain, sources in target_to_sources:
    if target_domain not in grouped_folders:
        continue

    more_similar_source, less_similar_source = sources

    for model, source_folders in grouped_folders[target_domain].items():
        # Ensure both source folders are available
        if more_similar_source not in source_folders or less_similar_source not in source_folders:
            print(f"Skipping model {model} for target {target_domain} due to missing sources.")
            continue

        similar_folder = source_folders[more_similar_source]
        dissimilar_folder = source_folders[less_similar_source]

        # Read the user_id_mapping files (columns: new_id, original_id)
        df_similar = pd.read_csv(
            os.path.join(similar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()
        df_dissimilar = pd.read_csv(
            os.path.join(dissimilar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()

        # Keep only users whose original_id occurs in both mappings
        df_similar_filtered = df_similar[df_similar['original_id'].isin(df_dissimilar['original_id'])]
        df_dissimilar_filtered = df_dissimilar[df_dissimilar['original_id'].isin(df_similar['original_id'])]

        # Read metric files and perform paired t-tests
        for metric, file_name in metrics_files.items():
            column = metric.lower()

            # Read metric data (one value per row)
            df_metric_similar = pd.read_csv(
                os.path.join(similar_folder, file_name), header=None, names=[column])
            df_metric_dissimilar = pd.read_csv(
                os.path.join(dissimilar_folder, file_name), header=None, names=[column])

            # new_id is assigned as the 1-based row position in the metric file
            df_metric_similar['new_id'] = df_metric_similar.index + 1
            df_metric_dissimilar['new_id'] = df_metric_dissimilar.index + 1

            # Attach original_id to each score; the inner join drops users
            # that are not in the overlapping set
            scores_similar = df_metric_similar.merge(df_similar_filtered, on='new_id', how='inner')
            scores_dissimilar = df_metric_dissimilar.merge(df_dissimilar_filtered, on='new_id', how='inner')

            # ttest_rel pairs observations by position and ignores the pandas
            # index, so the two runs are joined on original_id to guarantee
            # that each row compares the same user.
            paired = scores_similar.merge(
                scores_dissimilar, on='original_id', how='inner',
                suffixes=('_similar', '_dissimilar'),
            )

            # Perform paired t-test
            t_stat, p_value = ttest_rel(
                paired[f'{column}_similar'],
                paired[f'{column}_dissimilar']
            )
            results.append({
                'model': model,
                'target': target_domain,
                'metric': metric,
                'more_similar_source': more_similar_source,
                'less_similar_source': less_similar_source,
                't_statistic': t_stat,
                'p_value': p_value
            })

# Convert results to a DataFrame; explicit columns keep the schema (and the
# 'p_value' lookup below) valid even when no test was run
results_df = pd.DataFrame(results, columns=[
    'model', 'target', 'metric',
    'more_similar_source', 'less_similar_source',
    't_statistic', 'p_value',
])

# Apply Benjamini-Hochberg correction
p_values = results_df['p_value'].to_numpy(dtype=float)
adjusted_p_values, sorted_indices = benjamini_hochberg_correction(p_values)

# Add adjusted p-values to the DataFrame (argsort of the sort order maps the
# sorted adjusted values back to the original row order)
results_df['adjusted_p_value'] = adjusted_p_values[np.argsort(sorted_indices)]

# Save the results to a CSV file
# NOTE: the other analysis cells in this notebook write to this same filename,
# so running them overwrites this output.
results_df.to_csv('paired_t_test_results_with_bh.csv', index=False)

# Display the results
print(results_df)


#### Top/Bottom Splits

In [None]:
# Parent directory whose immediate subfolders are scanned below
parent_directory = 'data/statistical_test/own_research/top_bottom'

# Paths of all directories found directly inside the parent directory
subfolders = [
    entry.path
    for entry in os.scandir(parent_directory)
    if entry.is_dir()
]

# Metric name -> file (inside each subfolder) holding that metric
metrics_files = dict(
    HR='user_hits.txt',
    MRR='user_mrrs.txt',
    NDCG='user_ndcg_at_10.txt',
)

# (target domain, [more similar source, less similar source]) comparisons
target_to_sources = [
    ('books', ['elt30', 'elb30']),
    ('books', ['hot30', 'hob30']),
    ('books', ['mot30', 'mob30']),
]

# Function to apply Benjamini-Hochberg correction
def benjamini_hochberg_correction(p_values, alpha=0.05):
    """Compute Benjamini-Hochberg adjusted p-values.

    Parameters
    ----------
    p_values : array-like of float
        Raw p-values. NaN entries (e.g. from a degenerate t-test) are left
        out of the correction and are returned as NaN, so a single NaN no
        longer turns every adjusted value into NaN.
    alpha : float, optional
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    adjusted_p_values : numpy.ndarray
        Adjusted p-values ordered by ascending raw p-value (NaN entries
        last), capped at 1.
    sorted_indices : numpy.ndarray
        Indices that sort the input. Indexing the adjusted values with
        ``np.argsort(sorted_indices)`` restores the input order.
    """
    p_values = np.asarray(p_values, dtype=float)
    n = len(p_values)

    # argsort places NaN last, so the valid p-values form a prefix.
    sorted_indices = np.argsort(p_values)
    sorted_p_values = p_values[sorted_indices]
    n_valid = n - int(np.isnan(sorted_p_values).sum())

    adjusted_p_values = np.full(n, np.nan)
    if n_valid:
        ranks = np.arange(1, n_valid + 1)
        scaled = sorted_p_values[:n_valid] * n_valid / ranks
        # Running minimum taken from the largest p-value downwards keeps the
        # adjusted values monotone in the raw p-values.
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted_p_values[:n_valid] = np.minimum(monotone, 1.0)
    return adjusted_p_values, sorted_indices

# Initialize a list to store results
results = []

# Group folders as grouped_folders[target_domain][model][source_domain] = path
grouped_folders = {}
for folder in subfolders:
    folder_name = os.path.basename(folder)
    parts = folder_name.split('_')
    # Indices 2, 3 and 4 are read below, so at least five parts are required
    if len(parts) < 5:
        continue  # Skip invalid folder names
    source_domain, target_domain, model = parts[3], parts[2], parts[4]
    grouped_folders.setdefault(target_domain, {}).setdefault(model, {})[source_domain] = folder

# Name of the user-id mapping file read from each folder
file_user_id_mapping = 'user_id_mapping.txt'

# Iterate over each target-source pair combination
for target_domain, sources in target_to_sources:
    if target_domain not in grouped_folders:
        continue

    more_similar_source, less_similar_source = sources

    for model, source_folders in grouped_folders[target_domain].items():
        # Ensure both source folders are available
        if more_similar_source not in source_folders or less_similar_source not in source_folders:
            print(f"Skipping model {model} for target {target_domain} due to missing sources.")
            continue

        similar_folder = source_folders[more_similar_source]
        dissimilar_folder = source_folders[less_similar_source]

        # Read the user_id_mapping files (columns: new_id, original_id)
        df_similar = pd.read_csv(
            os.path.join(similar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()
        df_dissimilar = pd.read_csv(
            os.path.join(dissimilar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()

        # Keep only users whose original_id occurs in both mappings
        df_similar_filtered = df_similar[df_similar['original_id'].isin(df_dissimilar['original_id'])]
        df_dissimilar_filtered = df_dissimilar[df_dissimilar['original_id'].isin(df_similar['original_id'])]

        # Read metric files and perform paired t-tests
        for metric, file_name in metrics_files.items():
            column = metric.lower()

            # Read metric data (one value per row)
            df_metric_similar = pd.read_csv(
                os.path.join(similar_folder, file_name), header=None, names=[column])
            df_metric_dissimilar = pd.read_csv(
                os.path.join(dissimilar_folder, file_name), header=None, names=[column])

            # new_id is assigned as the 1-based row position in the metric file
            df_metric_similar['new_id'] = df_metric_similar.index + 1
            df_metric_dissimilar['new_id'] = df_metric_dissimilar.index + 1

            # Attach original_id to each score; the inner join drops users
            # that are not in the overlapping set
            scores_similar = df_metric_similar.merge(df_similar_filtered, on='new_id', how='inner')
            scores_dissimilar = df_metric_dissimilar.merge(df_dissimilar_filtered, on='new_id', how='inner')

            # ttest_rel pairs observations by position and ignores the pandas
            # index, so the two runs are joined on original_id to guarantee
            # that each row compares the same user.
            paired = scores_similar.merge(
                scores_dissimilar, on='original_id', how='inner',
                suffixes=('_similar', '_dissimilar'),
            )

            # Perform paired t-test
            t_stat, p_value = ttest_rel(
                paired[f'{column}_similar'],
                paired[f'{column}_dissimilar']
            )
            results.append({
                'model': model,
                'target': target_domain,
                'metric': metric,
                'more_similar_source': more_similar_source,
                'less_similar_source': less_similar_source,
                't_statistic': t_stat,
                'p_value': p_value
            })

# Convert results to a DataFrame; explicit columns keep the schema (and the
# 'p_value' lookup below) valid even when no test was run
results_df = pd.DataFrame(results, columns=[
    'model', 'target', 'metric',
    'more_similar_source', 'less_similar_source',
    't_statistic', 'p_value',
])

# Apply Benjamini-Hochberg correction
p_values = results_df['p_value'].to_numpy(dtype=float)
adjusted_p_values, sorted_indices = benjamini_hochberg_correction(p_values)

# Add adjusted p-values to the DataFrame (argsort of the sort order maps the
# sorted adjusted values back to the original row order)
results_df['adjusted_p_value'] = adjusted_p_values[np.argsort(sorted_indices)]

# Save the results to a CSV file
# NOTE: the other analysis cells in this notebook write to this same filename,
# so running them overwrites this output.
results_df.to_csv('paired_t_test_results_with_bh.csv', index=False)

# Display the results
print(results_df)


### 2.3 Approach 3 (Pattern Similarity):

In [None]:
# Parent directory whose immediate subfolders are scanned below
parent_directory = 'data/statistical_test/own_research/pattern_similarity'

# Paths of all directories found directly inside the parent directory
subfolders = [
    entry.path
    for entry in os.scandir(parent_directory)
    if entry.is_dir()
]

# Metric name -> file (inside each subfolder) holding that metric
metrics_files = dict(
    HR='user_hits.txt',
    MRR='user_mrrs.txt',
    NDCG='user_ndcg_at_10.txt',
)

# (target domain, [more similar source, less similar source]) comparisons
target_to_sources = [
    ('books', ['elt25', 'elb25']),
    ('books', ['hot25', 'hob25']),
    ('books', ['mot25', 'mob25']),
]

# Function to apply Benjamini-Hochberg correction
def benjamini_hochberg_correction(p_values, alpha=0.05):
    """Compute Benjamini-Hochberg adjusted p-values.

    Parameters
    ----------
    p_values : array-like of float
        Raw p-values. NaN entries (e.g. from a degenerate t-test) are left
        out of the correction and are returned as NaN, so a single NaN no
        longer turns every adjusted value into NaN.
    alpha : float, optional
        Not used by the computation; kept so existing calls keep working.

    Returns
    -------
    adjusted_p_values : numpy.ndarray
        Adjusted p-values ordered by ascending raw p-value (NaN entries
        last), capped at 1.
    sorted_indices : numpy.ndarray
        Indices that sort the input. Indexing the adjusted values with
        ``np.argsort(sorted_indices)`` restores the input order.
    """
    p_values = np.asarray(p_values, dtype=float)
    n = len(p_values)

    # argsort places NaN last, so the valid p-values form a prefix.
    sorted_indices = np.argsort(p_values)
    sorted_p_values = p_values[sorted_indices]
    n_valid = n - int(np.isnan(sorted_p_values).sum())

    adjusted_p_values = np.full(n, np.nan)
    if n_valid:
        ranks = np.arange(1, n_valid + 1)
        scaled = sorted_p_values[:n_valid] * n_valid / ranks
        # Running minimum taken from the largest p-value downwards keeps the
        # adjusted values monotone in the raw p-values.
        monotone = np.minimum.accumulate(scaled[::-1])[::-1]
        adjusted_p_values[:n_valid] = np.minimum(monotone, 1.0)
    return adjusted_p_values, sorted_indices

# Initialize a list to store results
results = []

# Group folders as grouped_folders[target_domain][model][source_domain] = path
grouped_folders = {}
for folder in subfolders:
    folder_name = os.path.basename(folder)
    parts = folder_name.split('_')
    # Indices 2, 3 and 4 are read below, so at least five parts are required
    if len(parts) < 5:
        continue  # Skip invalid folder names
    source_domain, target_domain, model = parts[3], parts[2], parts[4]
    grouped_folders.setdefault(target_domain, {}).setdefault(model, {})[source_domain] = folder

# Name of the user-id mapping file read from each folder
file_user_id_mapping = 'user_id_mapping.txt'

# Iterate over each target-source pair combination
for target_domain, sources in target_to_sources:
    if target_domain not in grouped_folders:
        continue

    more_similar_source, less_similar_source = sources

    for model, source_folders in grouped_folders[target_domain].items():
        # Ensure both source folders are available
        if more_similar_source not in source_folders or less_similar_source not in source_folders:
            print(f"Skipping model {model} for target {target_domain} due to missing sources.")
            continue

        similar_folder = source_folders[more_similar_source]
        dissimilar_folder = source_folders[less_similar_source]

        # Read the user_id_mapping files (columns: new_id, original_id)
        df_similar = pd.read_csv(
            os.path.join(similar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()
        df_dissimilar = pd.read_csv(
            os.path.join(dissimilar_folder, file_user_id_mapping),
            header=None, names=['new_id', 'original_id'],
        ).drop_duplicates()

        # Keep only users whose original_id occurs in both mappings
        df_similar_filtered = df_similar[df_similar['original_id'].isin(df_dissimilar['original_id'])]
        df_dissimilar_filtered = df_dissimilar[df_dissimilar['original_id'].isin(df_similar['original_id'])]

        # Read metric files and perform paired t-tests
        for metric, file_name in metrics_files.items():
            column = metric.lower()

            # Read metric data (one value per row)
            df_metric_similar = pd.read_csv(
                os.path.join(similar_folder, file_name), header=None, names=[column])
            df_metric_dissimilar = pd.read_csv(
                os.path.join(dissimilar_folder, file_name), header=None, names=[column])

            # new_id is assigned as the 1-based row position in the metric file
            df_metric_similar['new_id'] = df_metric_similar.index + 1
            df_metric_dissimilar['new_id'] = df_metric_dissimilar.index + 1

            # Attach original_id to each score; the inner join drops users
            # that are not in the overlapping set
            scores_similar = df_metric_similar.merge(df_similar_filtered, on='new_id', how='inner')
            scores_dissimilar = df_metric_dissimilar.merge(df_dissimilar_filtered, on='new_id', how='inner')

            # ttest_rel pairs observations by position and ignores the pandas
            # index, so the two runs are joined on original_id to guarantee
            # that each row compares the same user.
            paired = scores_similar.merge(
                scores_dissimilar, on='original_id', how='inner',
                suffixes=('_similar', '_dissimilar'),
            )

            # Perform paired t-test
            t_stat, p_value = ttest_rel(
                paired[f'{column}_similar'],
                paired[f'{column}_dissimilar']
            )
            results.append({
                'model': model,
                'target': target_domain,
                'metric': metric,
                'more_similar_source': more_similar_source,
                'less_similar_source': less_similar_source,
                't_statistic': t_stat,
                'p_value': p_value
            })

# Convert results to a DataFrame; explicit columns keep the schema (and the
# 'p_value' lookup below) valid even when no test was run
results_df = pd.DataFrame(results, columns=[
    'model', 'target', 'metric',
    'more_similar_source', 'less_similar_source',
    't_statistic', 'p_value',
])

# Apply Benjamini-Hochberg correction
p_values = results_df['p_value'].to_numpy(dtype=float)
adjusted_p_values, sorted_indices = benjamini_hochberg_correction(p_values)

# Add adjusted p-values to the DataFrame (argsort of the sort order maps the
# sorted adjusted values back to the original row order)
results_df['adjusted_p_value'] = adjusted_p_values[np.argsort(sorted_indices)]

# Save the results to a CSV file
# NOTE: the other analysis cells in this notebook write to this same filename,
# so running them overwrites this output.
results_df.to_csv('paired_t_test_results_with_bh.csv', index=False)

# Display the results
print(results_df)
