In [1]:
import os
import sys
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
# Load the human-microbiome table. Per the preview below, the columns are the
# `X<id>` features followed by a final `real_y` column (presumably the label --
# confirm against the data source).
df_data = pd.read_csv('DeepLINK/Real_data_analyses/human_microbiome/data/microbiome_data_common.csv')
# Show the first rows as this cell's output.
df_data.head()

Unnamed: 0,X195,X197,X199,X287,X294,X316,X470,X539,X545,X546,...,X1905288,X1907578,X1912856,X1915310,X1934254,X1944646,X2025876,X2026240,X2051905,real_y
0,0.000899,0.0,2.83e-06,0.0,0.0,0.0,0.0,0.0,0.0,6.5e-05,...,8.85e-111,0.0,9.5e-05,0.0,0.0,1.7e-05,0.018672,0.0,0.0,0
1,7.5e-05,1.47e-45,0.0,1.68e-07,2.18e-08,0.0,0.0,0.0,5e-06,0.0,...,2.89e-33,0.0,4e-05,5.27e-249,2e-06,2e-06,0.025775,0.0,2.79e-196,0
2,0.000275,3.56e-05,0.0,0.0,0.0,0.0,4.7e-05,2e-06,0.0,0.0,...,0.0,0.0,9.4e-05,0.0,0.0,2.6e-05,0.014592,0.0,0.0,0
3,0.000133,2.9e-190,6.77e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.7e-05,0.0,0.0,2e-06,0.006446,0.0,1.3899999999999998e-64,0
4,0.001387,0.00028571,0.0,1.51e-06,0.0,0.0,2e-06,2e-06,0.0,0.000873,...,7.44e-06,2e-06,6.2e-05,1.72e-05,6e-06,1.4e-05,0.028903,4.41e-07,1.55e-05,0


In [3]:
# Every column except the last one is treated as a feature name.
# NOTE: `summarize` indexes this Index by position, so the integer feature ids
# found in the explanation files are taken to be 0-based positions in it.
feature_names = df_data.columns[:-1]
# Display the Index as this cell's output.
feature_names

Index(['X195', 'X197', 'X199', 'X287', 'X294', 'X316', 'X470', 'X539', 'X545',
       'X546',
       ...
       'X1902136', 'X1905288', 'X1907578', 'X1912856', 'X1915310', 'X1934254',
       'X1944646', 'X2025876', 'X2026240', 'X2051905'],
      dtype='object', length=274)

In [4]:
def summarize(explanation_res_path, names=None):
    """Aggregate per-sample explanation rankings into two summary tables.

    The explanation file is a header-less CSV with three columns:
    ``sample_id``, ``feature_id`` and ``importance``. ``feature_id`` is either
    a single integer (one feature) or several integers joined by ``'-'``
    (an interaction). The rank of a row is its 1-based position among the rows
    of the same sample, in file order; the ``importance`` column itself is
    never read (presumably the file is already sorted by importance within
    each sample -- confirm against the producer of the file).

    Parameters
    ----------
    explanation_res_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    names : sequence of str, optional
        Feature names, indexed by the integer ids used in the file. Defaults
        to the notebook-level ``feature_names`` so existing calls keep working.

    Returns
    -------
    (df_single, df_interaction) : tuple of pandas.DataFrame
        ``df_single`` has one row per feature (``feature``, ``avg_rank``,
        ``h_rank`` = best/smallest rank, ``l_rank`` = worst/largest rank),
        sorted by ``avg_rank``. Features that never appear get ``avg_rank``
        NaN (sorted last) and ``h_rank``/``l_rank`` equal to the number of
        features.
        ``df_interaction`` has one row per feature pair (``feature``,
        ``avg_rank``, ``count``, ``h_rank``, ``l_rank``), sorted by a
        penalised mean rank in which every sample where the pair is absent
        counts as rank ``len(names) + 1``.

    Notes
    -----
    Sample ids are expected to be integers starting at 0: the number of
    samples is taken to be ``max(sample_id) + 1``.
    """
    if names is None:
        # Fall back to the notebook global for backward compatibility.
        names = feature_names
    n_features = len(names)

    # Force the id column to text. Left to type inference, a file containing
    # only single features is parsed as integers and `.split('-')` would fail.
    df_exp = pd.read_csv(explanation_res_path, header=None, dtype={1: str})
    df_exp.columns = ['sample_id', 'feature_id', 'importance']

    sample_num = df_exp['sample_id'].max() + 1
    # 1-based position of each row inside its own sample, computed in one pass
    # instead of re-filtering the whole frame once per sample.
    ranks_in_sample = df_exp.groupby('sample_id').cumcount() + 1

    single_ranks = [[] for _ in range(n_features)]
    interaction_ranks = defaultdict(list)
    for raw_id, rank in zip(df_exp['feature_id'], ranks_in_sample):
        ids = sorted(int(part) for part in raw_id.split('-'))
        if len(ids) == 1:
            single_ranks[ids[0]].append(rank)
            continue
        # An interaction of k features contributes its rank to each of its
        # k*(k-1)/2 unordered pairs, keyed as (smaller_id, larger_id).
        for p in range(len(ids)):
            for q in range(p + 1, len(ids)):
                interaction_ranks[(ids[p], ids[q])].append(rank)

    # NaN (without numpy's "mean of empty slice" warning) for unseen features.
    avg_single_ranks = [np.mean(ranks) if ranks else np.nan for ranks in single_ranks]
    # Stable sort keeps tied features in their original column order; NaNs go last.
    single_sorted_idx = np.argsort(avg_single_ranks, kind='stable')
    df_single = pd.DataFrame(
        {
            'feature': [names[i] for i in single_sorted_idx],
            'avg_rank': [avg_single_ranks[i] for i in single_sorted_idx],
            'h_rank': [np.min(single_ranks[i]) if single_ranks[i] else n_features for i in single_sorted_idx],
            'l_rank': [np.max(single_ranks[i]) if single_ranks[i] else n_features for i in single_sorted_idx],
        }
    )

    # Sorting key for pairs: mean rank over ALL samples, where a sample that
    # does not contain the pair is charged the penalty rank n_features + 1.
    # A pair can be recorded more than once per sample (e.g. via a pair row and
    # a triple row), in which case len(ranks) may exceed sample_num.
    missing_rank = n_features + 1
    metric_interaction_ranks = {
        pair: (np.sum(ranks) + missing_rank * (sample_num - len(ranks))) / sample_num
        for pair, ranks in interaction_ranks.items()
    }
    sorted_pairs = sorted(metric_interaction_ranks, key=metric_interaction_ranks.get)
    df_interaction = pd.DataFrame(
        {
            'feature': [f'{names[a]} - {names[b]}' for a, b in sorted_pairs],
            'avg_rank': [np.mean(interaction_ranks[pair]) for pair in sorted_pairs],
            'count': [len(interaction_ranks[pair]) for pair in sorted_pairs],
            'h_rank': [np.min(interaction_ranks[pair]) for pair in sorted_pairs],
            'l_rank': [np.max(interaction_ranks[pair]) for pair in sorted_pairs],
        }
    )

    return df_single, df_interaction


def run_summarize(explanation_res_path):
    """Summarize one explanation file and save both ranking tables beside it.

    Writes ``sorted_single_features.csv`` and
    ``sorted_interaction_features.csv`` (without the index column) into the
    directory that contains ``explanation_res_path``.
    """
    out_dir = os.path.dirname(explanation_res_path)
    tables = summarize(explanation_res_path)
    file_names = ('sorted_single_features.csv', 'sorted_interaction_features.csv')
    # `summarize` returns (single, interaction), matching the order of file_names.
    for table, file_name in zip(tables, file_names):
        table.to_csv(os.path.join(out_dir, file_name), index=False)


# Summarize each explanation run; the outputs are written next to each input file.
explanation_paths = (
    'explanations/human_microbiome_best/top_3/explanation.csv',
    'explanations/human_microbiome_best/top_5/explanation.csv',
    'explanations/human_microbiome_best/top_10/explanation.csv',
)
for explanation_path in explanation_paths:
    run_summarize(explanation_path)