### Setup paths

In [1]:
import sys
import os
import seaborn as sns
import pandas as pd
import torch
import matplotlib.pyplot as plt
from src.lib import vertex_labeling
from src.util import data_extraction_util as data_util
from src.util import pairtree_data_extraction_util as pt_util
from src.util import vertex_labeling_util as vert_util
from src.util import plotting_util as plot_util

# Resolve the repository root (two directory levels above the current working
# directory) and make it the working directory.
# NOTE: this is relative to the cwd at execution time, so re-running this cell
# in the same kernel moves the cwd up two more levels.
REPO_DIR = os.path.join(os.getcwd(), "../../")
os.chdir(REPO_DIR)

plt.rcParams['figure.figsize'] = [3, 3]

# Input data locations. Uses REPO_DIR; the previous lowercase `repo_dir` was never
# defined in this notebook and raised NameError on a fresh kernel.
TRACERX_DATA_DIR = os.path.join(REPO_DIR, 'src/data/tracerx_nsclc/')
PT_CLUSTERED_TSVS_DIR = os.path.join(TRACERX_DATA_DIR, 'patient_data', 'pairtree_clustered')
PYCLONE_CLUSTERED_TSVS_DIR = os.path.join(TRACERX_DATA_DIR, 'patient_data', 'pyclone_clustered')

# Tree inputs read by the run cells below.
ORCHARD_TREES_DIR = os.path.join(TRACERX_DATA_DIR, 'orchard_trees')
CONIPHER_TREES_DIR = os.path.join(TRACERX_DATA_DIR, 'conipher_outputs', 'TreeBuilding')

# Hand-picked hex colors first, followed by seaborn's "Paired" palette.
custom_colors = ["#6aa84fff","#c27ba0ff", "#e69138ff", "#be5742e1", "#2496c8ff", "#674ea7ff"] + sns.color_palette("Paired").as_hex()


CUDA GPU: False


### Set up metient hyperparameters and the labeling helper

In [2]:
def find_labeling(ref_var_fn, tree, custom_colors, primary_site, patient_name):
    """Run metient's migration-history inference for one tree and one primary site.

    Args:
        ref_var_fn: Path handed to `data_util.get_ref_var_matrices_from_real_data`.
        tree: Tree passed unchanged to `vertex_labeling.get_migration_history`.
        custom_colors: Colors forwarded as the `custom_colors` keyword argument.
        primary_site: Site label that must be present in the sites read from
            `ref_var_fn`; `unique_sites.index` fails otherwise (ValueError if
            `unique_sites` is a list -- TODO confirm its type).
        patient_name: Run name forwarded to `get_migration_history`.

    Results are written under `<TRACERX_DATA_DIR>/metient_outputs`; nothing is returned.
    """
    ref_matrix, var_matrix, unique_sites, idx_to_cluster_label = (
        data_util.get_ref_var_matrices_from_real_data(ref_var_fn)
    )

    # Shorten every cluster label to the gene names of (at most) its first three
    # mutations, e.g. "CUL3:2:225371655:T;TRPM6:9:77431650:C" -> "CUL3_TRPM6".
    for ix in idx_to_cluster_label:
        leading_muts = idx_to_cluster_label[ix].split(';')[:3]
        idx_to_cluster_label[ix] = "_".join(mut.split(":")[0] for mut in leading_muts)

    print(f"Anatomical sites: {unique_sites}")

    # One-hot column vector marking which anatomical site is the primary.
    site_idx = unique_sites.index(primary_site)
    n_sites = len(unique_sites)
    p = torch.nn.functional.one_hot(torch.tensor([site_idx]), num_classes=n_sites).T

    weights = vertex_labeling.Weights(
        data_fit=1.0, mig=3.0, comig=2.0, seed_site=1.0, reg=2.0, gen_dist=0.0
    )
    print_config = plot_util.PrintConfig(
        visualize=False, verbose=False, viz_intermeds=False, k_best_trees=3
    )
    output_dir = os.path.join(TRACERX_DATA_DIR, "metient_outputs")

    vertex_labeling.get_migration_history(
        tree, ref_matrix, var_matrix, unique_sites, p, idx_to_cluster_label, weights,
        print_config, output_dir, patient_name,
        custom_colors=custom_colors, batch_size=32, max_iter=100,
    )
        

### Run metient with PyClone clustering + CONIPHER trees

In [3]:
def run_patient(patient):
    """Run metient for one patient using PyClone clusters and a CONIPHER tree.

    Reads `<patient>_clustered_SNVs.tsv` from PYCLONE_CLUSTERED_TSVS_DIR, collects
    the distinct `anatomical_site_label` values of rows whose `sample_type` is
    'primary', and calls `find_labeling` once per such site. Only the first tree
    returned for the patient's CONIPHER tree file is used.

    Args:
        patient: Patient identifier used to build the input file names.
    """
    space = "x"*44
    tsv_fn = os.path.join(PYCLONE_CLUSTERED_TSVS_DIR, f"{patient}_clustered_SNVs.tsv")
    # The tree file depends only on the patient, so build its path once.
    tree_fn = os.path.join(CONIPHER_TREES_DIR, f"{patient}_conipher_SNVsallTrees.txt")
    print(f"{space} PATIENT {patient} {space}")
    df = pd.read_csv(tsv_fn, delimiter="\t")
    primary_sites = list(df[df['sample_type']=='primary']['anatomical_site_label'].unique())
    if len(primary_sites) > 1:
        print("*Multiple primary samples, running metient once for each possible primary*")
    for primary_site in primary_sites:
        print(f"Primary site: {primary_site}")
        run_name = f"{patient}_{primary_site}"
        # Trees are re-read for every primary site so each run gets a freshly
        # parsed tree (whether downstream code mutates it is not visible here).
        trees = data_util.get_adj_matrices_from_all_conipher_trees(tree_fn)
        find_labeling(tsv_fn, trees[0], custom_colors, primary_site, run_name)
    
def run_patients(patients):
    """Call `run_patient` once for each entry of `patients`, in iteration order."""
    for patient_id in patients:
        run_patient(patient_id)

# Discover patient IDs: the part of each clustered TSV's file name before the
# first underscore.
patients = set()
for file in os.listdir(PYCLONE_CLUSTERED_TSVS_DIR):
    # Test the extension itself; a substring test would also accept names such
    # as "x.tsv.bak".
    if file.endswith(".tsv"):
        patients.add(file.split("_")[0])
print(f"{len(patients)} patients, {patients}")

# NOTE: the discovered set is overridden here, so only this single patient is run.
patients=['CRUK0022']
run_patients(list(patients))


126 patients, {'CRUK0497', 'CRUK0343', 'CRUK0636', 'CRUK0596', 'CRUK0589', 'CRUK0013', 'CRUK0745', 'CRUK0361', 'CRUK0872', 'CRUK0041', 'CRUK0097', 'CRUK0748', 'CRUK0537', 'CRUK0733', 'CRUK0035', 'CRUK0027', 'CRUK0516', 'CRUK0178', 'CRUK0063', 'CRUK0698', 'CRUK0625', 'CRUK0530', 'CRUK0092', 'CRUK0691', 'CRUK0242', 'CRUK0495', 'CRUK0666', 'CRUK0766', 'CRUK0422', 'CRUK0510', 'CRUK0476', 'CRUK0514', 'CRUK0087', 'CRUK0742', 'CRUK0737', 'CRUK0083', 'CRUK0736', 'CRUK0722', 'CRUK0730', 'CRUK0410', 'CRUK0256', 'CRUK0707', 'CRUK0719', 'CRUK0074', 'CRUK0004', 'CRUK0010', 'CRUK0301', 'CRUK0487', 'CRUK0003', 'CRUK0418', 'CRUK0465', 'CRUK0557', 'CRUK0325', 'CRUK0640', 'CRUK0485', 'CRUK0372', 'CRUK0810', 'CRUK0044', 'CRUK0311', 'CRUK0598', 'CRUK0084', 'CRUK0467', 'CRUK0718', 'CRUK0762', 'CRUK0587', 'CRUK0245', 'CRUK0368', 'CRUK0250', 'CRUK0444', 'CRUK0453', 'CRUK0799', 'CRUK0609', 'CRUK0052', 'CRUK0496', 'CRUK0559', 'CRUK0572', 'CRUK0227', 'CRUK0048', 'CRUK0693', 'CRUK0567', 'CRUK0468', 'CRUK0617', '

ValueError: Number of mutations/mutation clusters should be consistent (ref_matrix.shape[1] == var_matrix.shape[1] == T.shape[0])

### Run metient with pairtree clustering + orchard trees

In [2]:
def run_patient(patient):
    """Run metient for one patient using pairtree clusters and an Orchard tree.

    Reads `<patient>_SNVs.tsv` from PT_CLUSTERED_TSVS_DIR, collects the distinct
    `anatomical_site_label` values of rows whose `sample_type` is 'primary', and
    calls `find_labeling` once per such site with the first tree found in
    `<patient>.results.npz` under ORCHARD_TREES_DIR.

    Args:
        patient: Patient identifier used to build the input file names.
    """
    space = "x" * 44
    tsv_fn = os.path.join(PT_CLUSTERED_TSVS_DIR, f"{patient}_SNVs.tsv")
    tree_fn = os.path.join(ORCHARD_TREES_DIR, f"{patient}.results.npz")
    print(f"{space} PATIENT {patient} {space}")

    df = pd.read_csv(tsv_fn, sep="\t")
    is_primary = df['sample_type'] == 'primary'
    primary_sites = list(df.loc[is_primary, 'anatomical_site_label'].unique())
    if len(primary_sites) > 1:
        print("*Multiple primary samples, running metient once for each possible primary*")

    for primary_site in primary_sites:
        print(f"Primary site: {primary_site}")
        run_name = f"{patient}_{primary_site}"
        # Results are re-read for every primary site; the first entry is treated
        # as the best tree and its second element is ignored.
        results = pt_util.get_adj_matrices_from_pairtree_results(tree_fn)
        best_tree, _ = results[0]
        find_labeling(tsv_fn, best_tree, custom_colors, primary_site, run_name)
    
def run_patients(patients):
    """Invoke `run_patient` on every element of `patients`, one after another."""
    for current_patient in patients:
        run_patient(current_patient)

# Discover patient IDs from the Orchard result files. The ID is everything before
# the first ".", so it keeps the "_clustered" suffix (e.g. "CRUK0344_clustered").
patients = set()
for file in os.listdir(ORCHARD_TREES_DIR):
    # Test the suffix itself; a substring test would also accept names such as
    # "x_clustered.results.npz.bak".
    if file.endswith("_clustered.results.npz"):
        patients.add(file.split(".")[0])
print(f"{len(patients)} patients, {patients}")

run_patients(list(patients))

126 patients, {'CRUK0344_clustered', 'CRUK0620_clustered', 'CRUK0444_clustered', 'CRUK0799_clustered', 'CRUK0468_clustered', 'CRUK0035_clustered', 'CRUK0625_clustered', 'CRUK0666_clustered', 'CRUK0227_clustered', 'CRUK0598_clustered', 'CRUK0485_clustered', 'CRUK0698_clustered', 'CRUK0084_clustered', 'CRUK0872_clustered', 'CRUK0467_clustered', 'CRUK0794_clustered', 'CRUK0487_clustered', 'CRUK0290_clustered', 'CRUK0083_clustered', 'CRUK0497_clustered', 'CRUK0009_clustered', 'CRUK0004_clustered', 'CRUK0745_clustered', 'CRUK0516_clustered', 'CRUK0496_clustered', 'CRUK0730_clustered', 'CRUK0572_clustered', 'CRUK0719_clustered', 'CRUK0609_clustered', 'CRUK0284_clustered', 'CRUK0097_clustered', 'CRUK0530_clustered', 'CRUK0183_clustered', 'CRUK0810_clustered', 'CRUK0714_clustered', 'CRUK0510_clustered', 'CRUK0451_clustered', 'CRUK0590_clustered', 'CRUK0736_clustered', 'CRUK0722_clustered', 'CRUK0048_clustered', 'CRUK0388_clustered', 'CRUK0769_clustered', 'CRUK0528_clustered', 'CRUK0707_cluster

NameError: name 'find_labeling' is not defined