### Test how migration histories change when 1) only using SNVs in copy number neutral regions, 2) using MACHINA's clsutering, and 3) SPRUCE generated trees

In [None]:
import sys
import pandas as pd
import os
from metient.metient import *
from metient.util import data_extraction_util as dutil

repo_dir = os.path.join(os.getcwd(), "../")
data_dir = os.path.join(repo_dir, 'data', 'sanborn_melanoma_2015')
TREE_DIR = os.path.join(data_dir, 'spruce_mutation_trees')    
TSV_DIR = os.path.join(data_dir, 'machina_clustering')    

print_config = PrintConfig(visualize=True, k_best_trees=20)
OUTPUT_DIR = os.path.join(data_dir, "metient_outputs_no_cna_mach_cluster_spruce_trees")
COLORS = ["#6aa84fff","#c27ba0ff", "#e69138ff", "#be5742e1", "#2496c8ff", "#674ea7ff"]
PATIENTS = ["A", "B", "C", "D", "E", "F", "G"]

idx_to_labels = []
# Prep inputs for Metient
for patient_id in PATIENTS:
    tsv_fn = os.path.join(TSV_DIR, f"{patient_id}_0.95.tsv")
    df = pd.read_csv(tsv_fn, sep="\t",skiprows=3)
    print(patient_id)
    df['num_mutations'] = df.apply(lambda row: len(row['character_label'].split("_")), axis=1)
    df['site_category'] = df.apply(lambda row: 'primary' if row['sample_label']=='primary' else 'metastasis', axis=1)
    df['var_read_prob'] = 0.5
    
    # This is to fix the fact the SPRUCE will filter out some of the clustered mutations. We fix this by
    # removing those mutation clusters from the input TSVs
    unpruned_idx_to_label = pd.Series(df['character_label'].values, index=df['character_index']).to_dict()
    mut_tree_fn = os.path.join(TREE_DIR, f"{patient_id}_mut_trees.txt")
    _, pruned_idx_to_label = dutil.get_adj_matrices_from_spruce_mutation_trees(mut_tree_fn, unpruned_idx_to_label)[0]
    pruned_label_to_idx = {v:k for k,v in pruned_idx_to_label.items()}
    print("pruned", len(unpruned_idx_to_label)-len(pruned_idx_to_label), "mutation clusters")
    df = df[df['character_label'].isin(list(pruned_idx_to_label.values()))]
    df['character_index'] = df.apply(lambda row:pruned_label_to_idx[row['character_label']], axis=1)

    idx_to_labels.append(pruned_idx_to_label)
    df['full_label'] = df['character_label']
    df['character_label'] = df.apply(lambda row:dutil.get_pruned_mut_label(row['character_label'], "_", ":"), axis=1)
    df.to_csv(os.path.join(TSV_DIR,f"{patient_id}_metient_input.tsv"), sep="\t")
    

In [5]:
mut_trees_fns = [os.path.join(TREE_DIR, f"{patient_id}_mut_trees.txt") for patient_id in PATIENTS]
ref_var_fns = [os.path.join(TSV_DIR, f"{patient_id}_metient_input.tsv") for patient_id in PATIENTS]

trees = []
for mut_tree_fn,idx_to_label in zip(mut_trees_fns, idx_to_labels):
    tree, pruned_idx_to_label = dutil.get_adj_matrices_from_spruce_mutation_trees(mut_tree_fn, idx_to_label)[0]
    print(tree)
    assert(idx_to_label==pruned_idx_to_label)
    trees.append(tree)
print(trees)
run_names = [f"{pid}_calibrate" for pid in PATIENTS]
calibrate(trees, ref_var_fns, print_config, OUTPUT_DIR, run_names, bias_weights=True, custom_colors=COLORS, solve_polytomies=False)


[]


ValueError: Inputs Ts, tsv_fns, and run_names must have equal length (length = number of patients in cohort