In [1]:
import pandas as pd
import ast
import json
import os

In [2]:
# Select which result set to analyse: "and" or "or".
strategy = "and"  # change to "or" to analyse the other result set

# Fail fast on an unsupported value: the seed-budget selection further down treats
# every value other than "and" as "or" (e.g. "AND" may still resolve to the same
# folder on a case-insensitive file system but would pick the "or" seed budgets).
if strategy not in ("and", "or"):
    raise ValueError(f"strategy must be 'and' or 'or', got {strategy!r}")

# Folders holding the raw results for the selected strategy.
# NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.
base_mds_path = os.path.join(
    r'C:\Users\Mingshan\PycharmProjects\infmax-mds-ltm-mln\raw_results_2\data\raw_results\mds_based',
    strategy,
    'small_real'
)

base_rank_path = os.path.join(
    r'C:\Users\Mingshan\PycharmProjects\infmax-mds-ltm-mln\raw_results_2\data\raw_results\rank_based',
    strategy
)

# Load the MDS-based results (already averaged) and the rank-based results.
mds_df = pd.read_csv(
    os.path.join(base_mds_path, 'averaged_results.csv'),
    sep=','
)

normal_df = pd.read_csv(
    os.path.join(base_rank_path, 'results--ver-43_1.csv'),
    sep=','
)

# Quick visual check of the first rows of both frames.
print("MDS DataFrame:", mds_df.head())
print("Normal DataFrame:", normal_df.head())

MDS DataFrame:   network protocol  seed_budget  mi_value  ss_method       gain  \
0    aucs      AND           15       0.1    d^deg_c  56.862745   
1    aucs      AND           15       0.1   d^deg_cd  56.862745   
2    aucs      AND           15       0.1  d^nghb_sd  62.745098   
3    aucs      AND           15       0.1    d^p_rnk  70.588235   
4    aucs      AND           15       0.1  d^p_rnk_m  60.784314   

   simulation_length  seed_nb  exposed_nb  unexposed_nb  \
0                3.0     10.0        39.0          22.0   
1                3.0     10.0        39.0          22.0   
2                3.0     10.0        42.0          19.0   
3                4.0     10.0        46.0          15.0   
4                3.0     10.0        41.0          20.0   

                                     expositions_rec  
0  [10.0, 22.0, 5.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0...  
1  [10.0, 22.0, 5.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0...  
2  [10.0, 21.0, 10.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0....  
3  [10.

In [3]:
def parse_expositions(rec_str):
    """
    Convert a semicolon-separated string into a list of numbers.
    Example: "1;23;35;2" -> [1, 23, 35, 2]

    Tokens are stripped of surrounding whitespace, and empty tokens (such as
    the one produced by a trailing ';') are skipped. A token is returned as
    ``int`` when it is an integer literal and as ``float`` otherwise, so a
    record like "10.0;2.5" no longer raises.

    Parameters
    ----------
    rec_str : str, scalar, None or NaN
        The raw cell value. ``None``/NaN and blank strings give an empty
        list. Any other non-string scalar (for example a cell that pandas
        parsed as a single number) is converted with ``str()`` first.

    Returns
    -------
    list
        The parsed values in their original order.

    Raises
    ------
    ValueError
        If a non-empty token is neither an integer nor a float literal.
    """
    if not isinstance(rec_str, str):
        if pd.isna(rec_str):
            return []
        # A lone numeric cell has no ';' and may arrive as a number, not text.
        rec_str = str(rec_str)

    values = []
    for token in rec_str.split(';'):
        token = token.strip()
        if not token:
            # Blank input or a stray/trailing separator.
            continue
        try:
            values.append(int(token))
        except ValueError:
            values.append(float(token))
    return values

In [4]:

# Networks included in the comparison
networks = [
    'aucs',
    'ckm_physicians',
    'eu_transportation',
    'eu_transport_klm',
    'lazega',
]

# Rank-based methods; each MDS-based counterpart has the same name prefixed with 'd^',
# so both lists stay aligned position by position
normal_methods = ['deg_c', 'deg_cd', 'nghb_sd', 'p_rnk', 'p_rnk_m', 'v_rnk', 'v_rnk_m']
mds_methods = ['d^' + method_name for method_name in normal_methods]

# mi values 0.1, 0.2, ..., 0.9
mi_values = [round(0.1 * step, 1) for step in range(1, 10)]

# Seed budgets evaluated for each strategy
seed_budgets_or = list(range(1, 11)) + [15, 20, 25, 30]
seed_budgets_and = [15, 20, 25, 30] + list(range(31, 41))
# Pick the list matching the active strategy
if strategy == "and":
    seed_budgets = seed_budgets_and
else:
    seed_budgets = seed_budgets_or

# Normalise the dtypes of the columns used as filter keys in both frames
for frame in (mds_df, normal_df):
    frame['mi_value'] = frame['mi_value'].astype(float)
    frame['seed_budget'] = frame['seed_budget'].astype(int)


In [6]:
# Turn the textual list stored in each 'expositions_rec' cell into a real Python list.
# Only string cells are parsed; anything else (e.g. a list produced by an earlier run
# of this cell) is passed through unchanged, so re-running the cell no longer raises.
mds_df['expositions_rec'] = mds_df['expositions_rec'].apply(
    lambda rec: ast.literal_eval(rec) if isinstance(rec, str) else rec
)

In [7]:
# Empty scaffold: results[network][mds_method] starts as its own fresh dict
results = dict(
    (network, dict((mds_method, {}) for mds_method in mds_methods))
    for network in networks
)

In [8]:
# Display the nested results structure (every method entry is still an empty dict here).
results

{'aucs': {'d^deg_c': {},
  'd^deg_cd': {},
  'd^nghb_sd': {},
  'd^p_rnk': {},
  'd^p_rnk_m': {},
  'd^v_rnk': {},
  'd^v_rnk_m': {}},
 'ckm_physicians': {'d^deg_c': {},
  'd^deg_cd': {},
  'd^nghb_sd': {},
  'd^p_rnk': {},
  'd^p_rnk_m': {},
  'd^v_rnk': {},
  'd^v_rnk_m': {}},
 'eu_transportation': {'d^deg_c': {},
  'd^deg_cd': {},
  'd^nghb_sd': {},
  'd^p_rnk': {},
  'd^p_rnk_m': {},
  'd^v_rnk': {},
  'd^v_rnk_m': {}},
 'eu_transport_klm': {'d^deg_c': {},
  'd^deg_cd': {},
  'd^nghb_sd': {},
  'd^p_rnk': {},
  'd^p_rnk_m': {},
  'd^v_rnk': {},
  'd^v_rnk_m': {}},
 'lazega': {'d^deg_c': {},
  'd^deg_cd': {},
  'd^nghb_sd': {},
  'd^p_rnk': {},
  'd^p_rnk_m': {},
  'd^v_rnk': {},
  'd^v_rnk_m': {}}}

In [9]:
# Compare every MDS-based method with its rank-based counterpart, round by round.
#
# For each (network, method pair, mi_value, seed_budget) combination that has a row
# in BOTH data sets, the per-round difference "MDS - rank-based" is stored under
# results[network][mds_method][mi_value][seed_budget]. Combinations missing from
# either data set are collected in `missing_combinations` and reported at the end
# instead of being dropped silently.


def _per_round_differences(mds_rec, normal_rec):
    """Return the element-wise difference ``mds_rec - normal_rec``.

    The shorter record is padded with zeros so that both cover the same number
    of spread rounds.
    """
    n_rounds = max(len(mds_rec), len(normal_rec))
    mds_padded = list(mds_rec) + [0] * (n_rounds - len(mds_rec))
    normal_padded = list(normal_rec) + [0] * (n_rounds - len(normal_rec))
    return [mds_val - normal_val for mds_val, normal_val in zip(mds_padded, normal_padded)]


# (network, mds_method, mi_value, seed_budget) tuples skipped for lack of data
missing_combinations = []

for network in networks:
    print(f"Processing network: {network}")
    # Narrow both frames once per network instead of once per innermost iteration
    mds_network_rows = mds_df[mds_df['network'] == network]
    normal_network_rows = normal_df[normal_df['network'] == network]

    # The two method lists are aligned position by position
    for mds_method, normal_method in zip(mds_methods, normal_methods):
        print(f"  Comparing method pair: MDS='{mds_method}' vs Rank-Based='{normal_method}'")
        mds_method_rows = mds_network_rows[mds_network_rows['ss_method'] == mds_method]
        normal_method_rows = normal_network_rows[normal_network_rows['ss_method'] == normal_method]

        for mi_value in mi_values:
            for seed_budget in seed_budgets:
                # Rows for the current combination in the MDS data set
                mds_row = mds_method_rows[
                    (mds_method_rows['mi_value'] == mi_value) &
                    (mds_method_rows['seed_budget'] == seed_budget)
                ]
                # Rows for the current combination in the rank-based data set
                normal_row = normal_method_rows[
                    (normal_method_rows['mi_value'] == mi_value) &
                    (normal_method_rows['seed_budget'] == seed_budget)
                ]

                if mds_row.empty or normal_row.empty:
                    missing_combinations.append((network, mds_method, mi_value, seed_budget))
                    continue

                # If several rows match, the first one is used
                mds_expositions = mds_row.iloc[0]['expositions_rec']
                normal_expositions = parse_expositions(normal_row.iloc[0]['expositions_rec'])

                # Always overwrite, so re-running this cell refreshes earlier values
                results[network][mds_method].setdefault(mi_value, {})[seed_budget] = (
                    _per_round_differences(mds_expositions, normal_expositions)
                )

if missing_combinations:
    print(
        f"Skipped {len(missing_combinations)} combination(s) with no matching row "
        "in at least one data set (see `missing_combinations`)"
    )


Processing network: aucs
  Comparing method pair: MDS='d^deg_c' vs Rank-Based='deg_c'
  Comparing method pair: MDS='d^deg_cd' vs Rank-Based='deg_cd'
  Comparing method pair: MDS='d^nghb_sd' vs Rank-Based='nghb_sd'
  Comparing method pair: MDS='d^p_rnk' vs Rank-Based='p_rnk'
  Comparing method pair: MDS='d^p_rnk_m' vs Rank-Based='p_rnk_m'
  Comparing method pair: MDS='d^v_rnk' vs Rank-Based='v_rnk'
  Comparing method pair: MDS='d^v_rnk_m' vs Rank-Based='v_rnk_m'
Processing network: ckm_physicians
  Comparing method pair: MDS='d^deg_c' vs Rank-Based='deg_c'
  Comparing method pair: MDS='d^deg_cd' vs Rank-Based='deg_cd'
  Comparing method pair: MDS='d^nghb_sd' vs Rank-Based='nghb_sd'
  Comparing method pair: MDS='d^p_rnk' vs Rank-Based='p_rnk'
  Comparing method pair: MDS='d^p_rnk_m' vs Rank-Based='p_rnk_m'
  Comparing method pair: MDS='d^v_rnk' vs Rank-Based='v_rnk'
  Comparing method pair: MDS='d^v_rnk_m' vs Rank-Based='v_rnk_m'
Processing network: eu_transportation
  Comparing method p

In [12]:
# The target file name carries the active strategy
output_file = "per_round_average_{}.json".format(strategy)

# Write the nested per-round differences as indented JSON
with open(output_file, mode='w') as out_fh:
    json.dump(results, out_fh, indent=4)

print("Results saved to {}".format(output_file))

Results saved to per_round_average_and.json


In [26]:
# Collapse the seed-budget level: for every network / method / mi_value, average the
# per-round lists of all seed budgets position by position.
average_over_seed_budget = {}

for network_name, methods_dict in results.items():
    per_network = {}
    average_over_seed_budget[network_name] = per_network

    for method, mi_values_dict in methods_dict.items():
        per_method = {}
        per_network[method] = per_method

        for mi_value, seed_budgets_dict in mi_values_dict.items():
            rows = list(seed_budgets_dict.values())

            # Nothing recorded for this mi_value: keep an empty placeholder
            if len(rows) == 0:
                per_method[mi_value] = []
                continue

            # Zero-fill every list up to the length of the longest one
            width = max(map(len, rows))
            rectangular = [row + [0] * (width - len(row)) for row in rows]
            count = len(rectangular)

            # Walk the columns (one per round) and average each of them
            means = []
            for column in zip(*rectangular):
                total = 0
                for value in column:
                    total += value
                means.append(total / count)

            per_method[mi_value] = means

In [27]:
# The target file name carries the active strategy
output_file = "average_over_seed_budget_{}.json".format(strategy)

# Write the seed-budget averages as indented JSON
with open(output_file, mode='w') as out_fh:
    json.dump(average_over_seed_budget, out_fh, indent=4)

print("Results saved to {}".format(output_file))

Results saved to average_over_seed_budget_and.json


In [28]:
# Collapse the mi_value level: for every network / method, average the lists obtained
# for the individual mi_values position by position.
average_over_mi_value = {}

for network_name, methods_dict in average_over_seed_budget.items():
    per_network = {}
    average_over_mi_value[network_name] = per_network

    for method, mi_values_dict in methods_dict.items():
        rows = list(mi_values_dict.values())

        # No mi_value recorded for this method: keep an empty placeholder
        if len(rows) == 0:
            per_network[method] = []
            continue

        # Zero-fill every list up to the length of the longest one
        width = max(map(len, rows))
        rectangular = [row + [0] * (width - len(row)) for row in rows]
        count = len(rectangular)

        # Walk the columns (one per round) and average each of them
        means = []
        for column in zip(*rectangular):
            total = 0
            for value in column:
                total += value
            means.append(total / count)

        per_network[method] = means

In [29]:
# Write the averages over seed budget and mi_value as indented JSON
output_file = "average_over_seed_budget_and_mi_value_{}.json".format(strategy)
with open(output_file, mode='w') as out_fh:
    json.dump(average_over_mi_value, out_fh, indent=4)

print("Results saved to {}".format(output_file))

Results saved to average_over_seed_budget_and_mi_value_and.json


In [30]:
# Final averaging over methods: one per-round list per network.
average_over_methods = {}

for network_name, methods_dict in average_over_mi_value.items():
    # A method without any data is stored as an empty list by the previous stage.
    # Leave such methods out: zero-padding them and counting them in the divisor
    # would treat "no data" as "zero difference" and pull the mean towards 0.
    method_lists = [lst for lst in methods_dict.values() if lst]

    if not method_lists:
        # No method has data for this network
        average_over_methods[network_name] = []
        continue

    # Determine the maximum length among all lists
    max_length = max(len(lst) for lst in method_lists)

    # Pad all lists with zeros to match the maximum length
    padded_lists = [
        lst + [0] * (max_length - len(lst)) for lst in method_lists
    ]

    # Perform element-wise averaging over the methods that have data
    n = len(padded_lists)
    summed_elements = [0] * max_length
    for lst in padded_lists:
        for idx, value in enumerate(lst):
            summed_elements[idx] += value

    averaged_list = [s / n for s in summed_elements]

    # Store the final averaged list for this network
    average_over_methods[network_name] = averaged_list

In [31]:
# Write the per-network averages (over seed budget, mi_value and method) as indented JSON
output_file = "average_over_seed_budget_mi_value_and_method_{}.json".format(strategy)
with open(output_file, mode='w') as out_fh:
    json.dump(average_over_methods, out_fh, indent=4)

print("Results saved to {}".format(output_file))

Results saved to average_over_seed_budget_mi_value_and_method_and.json


In [22]:
# Inspect the averages per network / method / mi_value (one value per round).
average_over_seed_budget

{'aucs': {'d^deg_c': {0.1: [0.0,
    -4.071428571428571,
    0.6428571428571429,
    0.7142857142857143,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0],
   0.2: [0.0,
    -2.2142857142857144,
    -0.5714285714285714,
    0.5,
    0.0,
    -0.07142857142857142,
    -0.07142857142857142,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0],
   0.3: [0.0,
    -1.1428571428571428,
    -1.6428571428571428,
    1.1428571428571428,
    -0.35714285714285715,
    -0.14285714285714285,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0,
    0.0

In [23]:
# Inspect the averages per network / method (one value per round).
average_over_mi_value

{'aucs': {'d^deg_c': [0.0,
   -1.047619047619048,
   -0.8412698412698413,
   0.3571428571428571,
   0.25396825396825395,
   0.015873015873015876,
   -0.007936507936507936,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0],
  'd^deg_cd': [0.0,
   -1.7063492063492065,
   -0.4365079365079365,
   0.11111111111111113,
   0.24603174603174605,
   0.23809523809523814,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0],
  'd^nghb_sd': [0.0,
   -1.5793650793650793,
   -0.3571428571428572,
   -0.2619047619047619,
   -0.03968253968253968,
   0.0873015873015873,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
 

In [24]:
# Inspect the final per-network averages (one value per round).
average_over_methods

{'aucs': [0.0,
  -1.6950113378684806,
  -0.6031746031746031,
  -0.06802721088435373,
  0.021541950113378682,
  0.0634920634920635,
  -0.02040816326530612,
  -0.005668934240362812,
  -0.0011337868480725624,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'ckm_physicians': [0.0,
  -4.343537414965986,
  -0.8956916099773243,
  -0.7777777777777779,
  0.4308390022675737,
  0.43650793650793657,
  0.44104308390022684,
  0.1326530612244898,
  0.09183673469387757,
  0.05442176870748299,
  0.051020408163265314,
  0.0022675736961451248,
  0.0022675736961451248,
  0.005668934240362812,
  0.003401360544217687,
  -0.003401360544217687,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'eu_transportation': [0.0,
  -0.22108843537414966,
  0.02494331065759637,
  0.010204081632653062,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  

In [25]:
# NOTE(review): same expression as the preceding cell - one of the two can be removed.
average_over_methods

{'aucs': [0.0,
  -1.6950113378684806,
  -0.6031746031746031,
  -0.06802721088435373,
  0.021541950113378682,
  0.0634920634920635,
  -0.02040816326530612,
  -0.005668934240362812,
  -0.0011337868480725624,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'ckm_physicians': [0.0,
  -4.343537414965986,
  -0.8956916099773243,
  -0.7777777777777779,
  0.4308390022675737,
  0.43650793650793657,
  0.44104308390022684,
  0.1326530612244898,
  0.09183673469387757,
  0.05442176870748299,
  0.051020408163265314,
  0.0022675736961451248,
  0.0022675736961451248,
  0.005668934240362812,
  0.003401360544217687,
  -0.003401360544217687,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 'eu_transportation': [0.0,
  -0.22108843537414966,
  0.02494331065759637,
  0.010204081632653062,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  