### For simulated data, we run the same clustering and tree inference algorithm as MACHINA so that the comparison is fair. Their tree inference algorithm sometimes drops clusters. In those instances, we fix up the TSVs so that the character index to character label mapping matches the pruned tree, and the inferred trees therefore correspond to the correct clusters.

In [1]:
import os

# The repository root is taken to be the parent of the current working directory.
repo_dir = os.path.join(os.getcwd(), "../")
# Every simulated MACHINA input/output used below lives under <repo>/data/machina_sims.
machina_sims_data_dir = os.path.join(os.path.join(repo_dir, 'data'), 'machina_sims')

In [63]:
import fnmatch
from metient.util import data_extraction_util as dutil
import pandas as pd

sites = ["m8", "m5"]
mig_types = ["M", "mS", "R", "S"]

# For every (site, migration type, seed) and every mutation tree returned for it,
# write a per-tree copy of the clustered read-count TSV that
#   * keeps only the rows whose cluster label is present in that tree, and
#   * rewrites `character_index` to the index the tree uses for that label.
# Any (site, mig_type, seed, tree_num) whose TSV lost rows is printed.
for site in sites:
    clustered_input_dir = os.path.join(machina_sims_data_dir, f"{site}_clustered_input")
    corrected_dir = os.path.join(machina_sims_data_dir, f"{site}_clustered_input_corrected")
    # DataFrame.to_csv does not create missing directories.
    os.makedirs(corrected_dir, exist_ok=True)

    for mig_type in mig_types:
        site_mig_data_dir = os.path.join(machina_sims_data_dir, site, mig_type)
        seeds = fnmatch.filter(os.listdir(site_mig_data_dir), 'reads_seed*.tsv')
        # "reads_seed<seed>.tsv" -> "<seed>"
        seeds = [s.replace(".tsv", "").replace("reads_seed", "") for s in seeds]
        for seed in seeds:
            cluster_fn = os.path.join(clustered_input_dir, f"cluster_{mig_type}_seed{seed}.txt")
            all_mut_trees_fn = os.path.join(machina_sims_data_dir, f"{site}_mut_trees", f"mut_trees_{mig_type}_seed{seed}.txt")
            ref_var_fn = os.path.join(clustered_input_dir, f"cluster_{mig_type}_seed{seed}.tsv")
            idx_to_cluster_label = dutil.get_idx_to_cluster_label(cluster_fn, ignore_polytomies=True)
            data = dutil.get_adj_matrices_from_spruce_mutation_trees(all_mut_trees_fn, idx_to_cluster_label, is_sim_data=True)

            # The input table and the columns derived from it depend only on the
            # seed, not on the tree, so build them once instead of once per tree.
            # The first three lines of the file are skipped; the fourth is used
            # as the header (presumably MACHINA's count preamble -- confirm).
            base_df = pd.read_csv(ref_var_fn, sep="\t", skiprows=3)
            base_df['var_read_prob'] = 0.5
            base_df['site_category'] = base_df['anatomical_site_label'].eq("P").map({True: 'primary', False: 'metastasis'})
            # A cluster label is a ";"-separated list; its length is the mutation count.
            base_df['num_mutations'] = base_df['character_label'].str.split(";").str.len()
            num_rows_before = len(base_df)

            for tree_num, (_adj_matrix, pruned_idx_to_label) in enumerate(data):
                pruned_label_to_idx = {v: k for k, v in pruned_idx_to_label.items()}
                # take out any mutations not used in the adjacency matrix;
                # .copy() so the assignment below modifies an independent frame
                # rather than a slice of base_df
                keep = base_df['character_label'].isin(list(pruned_label_to_idx))
                df = base_df[keep].copy()
                df['character_index'] = df['character_label'].map(pruned_label_to_idx)
                # The DataFrame index is deliberately still written (pandas default)
                # so the output file format is unchanged.
                df.to_csv(os.path.join(corrected_dir, f"cluster_{mig_type}_seed{seed}_tree{tree_num}.tsv"), sep="\t")
                if len(df) != num_rows_before:
                    print(site, mig_type, seed, tree_num)
                

m8 mS 4 0
m8 mS 4 1
m8 mS 4 2
m8 mS 4 3
m8 mS 4 4
m8 mS 4 5
m8 mS 4 6
m8 mS 4 7
m8 S 0 0
m8 S 0 1
m8 S 0 2
m8 S 0 3
m8 S 0 4
m8 S 0 5
m8 S 0 6
m8 S 0 7
m8 S 0 8
m8 S 0 9
m8 S 0 10
m8 S 0 11
m8 S 0 12
m8 S 0 13
m8 S 0 14
m8 S 0 15
m8 S 0 16
m8 S 0 17
m8 S 0 18
m8 S 0 19
m8 S 0 20
m8 S 0 21
m8 S 0 22
m8 S 0 23
m8 S 0 24
m8 S 0 25
m8 S 0 26
m8 S 0 27
m8 S 0 28
m8 S 0 29
m8 S 0 30
m8 S 0 31
m8 S 0 32
m8 S 0 33
m8 S 0 34
m8 S 0 35
m8 S 0 36
m8 S 0 37
m8 S 0 38
m8 S 0 39
m8 S 0 40
m8 S 0 41
m8 S 0 42
m8 S 0 43
m8 S 0 44
m8 S 0 45
m8 S 0 46
m8 S 0 47
m8 S 0 48
m8 S 0 49
m8 S 0 50
m8 S 0 51
m8 S 0 52
m8 S 0 53
m8 S 0 54
m8 S 0 55
m8 S 0 56
m8 S 0 57
m8 S 0 58
m8 S 0 59
m8 S 0 60
m8 S 0 61
m8 S 0 62
m8 S 0 63
m8 S 0 64
m8 S 0 65
m8 S 0 66
m8 S 0 67
m8 S 0 68
m8 S 0 69
m8 S 0 70
m8 S 0 71
m8 S 0 72
m8 S 0 73
m8 S 0 74
m8 S 0 75
m8 S 0 76
m8 S 0 77
m8 S 0 78
m8 S 0 79
m8 S 0 80
m8 S 0 81
m8 S 0 82
m8 S 0 83
m8 S 0 84
m8 S 0 85
m8 S 0 86
m8 S 0 87
m8 S 0 88
m8 S 0 89
m8 S 0 90
m8 S 0 91
m8 S 0 92


In [72]:
def get_index_to_cluster_label_from_corrected_sim_tsv(ref_var_fn):
    """Build the ``character_index -> character_label`` mapping stored in a TSV.

    Args:
        ref_var_fn: Path to a tab-separated file whose header row contains
            ``character_index`` and ``character_label`` columns.

    Returns:
        dict mapping each distinct ``character_index`` value to its
        ``character_label``. If several labels share one index, the label
        that appears last in the file wins.

    Raises:
        ValueError: If a ``character_label`` is paired with more than one
            distinct ``character_index``.
    """
    df = pd.read_csv(ref_var_fn, sep="\t")
    # One pass over the table: collapse it to its distinct (label, index) pairs
    # instead of re-filtering the whole frame once per label.
    pairs = df[['character_label', 'character_index']].drop_duplicates()
    # After de-duplication a label occurring twice must have two different indices.
    is_conflicting = pairs['character_label'].duplicated(keep=False)
    if is_conflicting.any():
        bad_labels = pairs.loc[is_conflicting, 'character_label'].unique().tolist()
        raise ValueError(
            f"character_label(s) {bad_labels} map to more than one "
            f"character_index in {ref_var_fn}"
        )
    # .tolist() yields native Python scalars for the keys and values.
    return dict(zip(pairs['character_index'].tolist(), pairs['character_label'].tolist()))
        
# Sanity check of the corrected per-tree TSVs: for each one, rebuild the
# index -> label mapping from the file, pass it to the mutation-tree parser, and
# assert that the mapping the parser returns for that tree equals the one in the file.
for site in sites:
    for mig_type in mig_types:
        site_mig_data_dir = os.path.join(machina_sims_data_dir, site, mig_type)
        # "reads_seed<seed>.tsv" -> "<seed>"
        seeds = [
            reads_fn.replace(".tsv", "").replace("reads_seed", "")
            for reads_fn in fnmatch.filter(os.listdir(site_mig_data_dir), 'reads_seed*.tsv')
        ]
        for seed in seeds:
            all_mut_trees_fn = os.path.join(machina_sims_data_dir, f"{site}_mut_trees", f"mut_trees_{mig_type}_seed{seed}.txt")
            print(site,mig_type,seed)
            # Count the corrected TSVs that exist for this seed (one per tree).
            corrected_listing = os.listdir(os.path.join(machina_sims_data_dir, f"{site}_clustered_input_corrected"))
            trees = fnmatch.filter(corrected_listing, f"cluster_{mig_type}_seed{seed}_tree*.tsv")
            print(len(trees))

            tree_num = 0
            while tree_num < len(trees):
                ref_var_fn = os.path.join(machina_sims_data_dir, f"{site}_clustered_input_corrected", f"cluster_{mig_type}_seed{seed}_tree{tree_num}.tsv")
                idx_to_cluster_label = get_index_to_cluster_label_from_corrected_sim_tsv(ref_var_fn)
                data = dutil.get_adj_matrices_from_spruce_mutation_trees(all_mut_trees_fn, idx_to_cluster_label, is_sim_data=True)
                # Each entry of `data` is (adjacency matrix, index -> label mapping).
                tree, returned_idx_to_label = data[tree_num][0], data[tree_num][1]
                assert returned_idx_to_label == idx_to_cluster_label
                # Spot-check one specific case by printing its tree and mapping.
                if (site, mig_type, seed, tree_num) == ('m8', 'mS', '4', 0):
                    print(tree)
                    print(idx_to_cluster_label)
                tree_num += 1
                
                
                

m8 M 19
48
m8 M 35
48
m8 M 172
180
m8 M 76
4
m8 M 216
6
m8 M 45
2
m8 M 7
6
m8 M 239
2
m8 M 241
6
m8 M 243
12
m8 mS 5
1
m8 mS 8
2
m8 mS 2
2
m8 mS 3
8
m8 mS 10
4
m8 mS 4
8
[[0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 