## How to run Metient-calibrate on a subset of real metastatic NSCLC patients (TRACERx)

### To run this notebook, you'll need metient installed:

```bash
mamba create -n "met" python=3.8.8 ipython
mamba activate met
pip install metient
```

### Import libraries and setup paths

In [None]:
import os
from metient import metient as met

### Setup paths
# Root of the tutorial data, resolved relative to the current working directory
tutorial_dir = os.path.join(os.path.join(os.getcwd(), "metient", "data"), "tutorial")
# input_dir: where the input clone trees and tsvs live
#            (clustered with pyclone -> clone tree from pairtree)
# output_dir: where results are saved
input_dir, output_dir = (
    os.path.join(tutorial_dir, leaf) for leaf in ("inputs", "outputs")
)

In [11]:
from metient.util import data_extraction_util as dutil
import numpy as np

def get(df, clusters_tsv_fn):
    """Map each mutation in `df` to its tree cluster and name every cluster.

    Args:
        df: DataFrame with a 'character_label' column. Each label is split on
            ":" and fields 1, 2 and 3 are matched against the 'CHR', 'POS'
            and 'REF' columns of the clusters table (field 0 is not used for
            matching).
        clusters_tsv_fn: Path to a tab-separated file with 'CHR', 'POS',
            'REF' and 'treeCLUSTER' columns.

    Returns:
        A tuple (cluster_id_to_cluster_name, mut_name_to_cluster_id):
        cluster_id_to_cluster_name maps each cluster id to the ";"-joined
        labels of all its mutations, in order of first appearance in `df`;
        mut_name_to_cluster_id maps each matched mutation label to its integer
        cluster id. Mutations with no matching row are left out of both.

    Raises:
        AssertionError: If a mutation matches more than one distinct
            'treeCLUSTER' value.
    """
    clusters_df = pd.read_csv(clusters_tsv_fn, delimiter="\t")
    mut_name_to_cluster_id = dict()
    cluster_id_to_mut_names = dict()
    # 1. Get mapping between mutation names and cluster ids.
    # `df` may repeat a label on several rows; looking each distinct label up
    # once gives the same mapping without redundant table scans.
    for label in df['character_label'].unique():
        mut_items = label.split(":")
        # NOTE(review): int() assumes numeric chromosome fields; a label such
        # as "GENE:X:..." would raise here -- confirm against the input data.
        is_match = (
            (clusters_df['CHR'] == int(mut_items[1]))
            & (clusters_df['POS'] == int(mut_items[2]))
            & (clusters_df['REF'] == mut_items[3])
        )
        matched_ids = clusters_df[is_match]['treeCLUSTER'].unique()
        assert len(matched_ids) <= 1, f"{label} maps to multiple clusters: {matched_ids}"
        if len(matched_ids) == 1:
            cluster_id = int(matched_ids.item())
            mut_name_to_cluster_id[label] = cluster_id
            # Always record the label, including the first one seen for a
            # cluster (the previous if/else skipped that first member).
            cluster_id_to_mut_names.setdefault(cluster_id, []).append(label)

    # 2. Set new names for clustered mutations
    cluster_id_to_cluster_name = {k: ";".join(v) for k, v in cluster_id_to_mut_names.items()}
    return cluster_id_to_cluster_name, mut_name_to_cluster_id
    
import pandas as pd
# Convert the raw TRACERx SNV tables into the tsv format used by the tutorial:
# attach a cluster index to every mutation, derive var_read_prob and
# site_category, and write one tsv per patient into the tutorial inputs folder.
patients = ["CRUK0003", "CRUK0010", "CRUK0013", "CRUK0029" ]
tracerx_dir = os.path.join(os.getcwd(), "metient", "data", "tracerx_nsclc")
for patient in patients:
    df = pd.read_csv(os.path.join(tracerx_dir, "patient_data", f"{patient}_SNVs.tsv"), delimiter="\t", index_col=0)
    cluster_id_to_cluster_name, mut_name_to_cluster_id = get(df, os.path.join(tracerx_dir, 'conipher_outputs', 'TreeBuilding', f"{patient}_conipher_SNVstreeTable_cleaned.tsv"))
    df['var_read_prob'] = df.apply(lambda row: dutil.calc_var_read_prob(row['major_cn'], row['minor_cn'], row['purity']), axis=1)
    df['site_category'] = df.apply(lambda row: 'primary' if 'primary' in row['anatomical_site_label'] else 'metastasis', axis=1)
    # Mutations without a cluster get NaN here and are dropped just below
    df['cluster_index'] = df.apply(lambda row: mut_name_to_cluster_id[row['character_label']] if row['character_label'] in mut_name_to_cluster_id else np.nan, axis=1)
    df = df.dropna(subset=['cluster_index'])
    # Explicit copy: the column assignment below must operate on an
    # independent frame, not on a selection of the filtered one (this is the
    # pattern that triggers pandas' SettingWithCopyWarning).
    df = df[['anatomical_site_index', 'anatomical_site_label', 'cluster_index', 'character_label',
             'ref', 'var', 'var_read_prob', 'site_category']].copy()
    # Show the cluster ids before and after the cast to int
    print(df['cluster_index'].unique())
    df['cluster_index'] = df['cluster_index'].astype(int)
    print(df['cluster_index'].unique())
    # input_dir (defined in the setup cell) is this same tutorial inputs
    # folder; reuse it instead of rebuilding the path by hand.
    df.to_csv(os.path.join(input_dir, f"{patient}_SNVs.tsv"), sep="\t", index=False)
    

[ 1.  0.  6.  9.  3. 17. 10. 12. 13.  4.  7. 11.  5. 15. 14.  8.  2. 18.
 16.]
[ 1  0  6  9  3 17 10 12 13  4  7 11  5 15 14  8  2 18 16]
[ 0.  1.  3.  8.  5. 11. 10.  7. 13.  9.  4.  2.  6. 12.]
[ 0  1  3  8  5 11 10  7 13  9  4  2  6 12]
[0. 1. 5. 2. 7. 4. 6. 3.]
[0 1 5 2 7 4 6 3]
[ 0.  5.  2.  4.  9.  1. 17. 12. 14.  7. 10. 15.  3.  8. 16. 11. 13.  6.]
[ 0  5  2  4  9  1 17 12 14  7 10 15  3  8 16 11 13  6]


### An example of the expected tsv file format for patient CRUK0010
**The required columns are:**
`anatomical_site_index, anatomical_site_label, cluster_index, character_label, ref, var, var_read_prob, site_category`

In [12]:
import pandas as pd
# Preview the tutorial input tsv for patient CRUK0010
example_tsv_fn = os.path.join(input_dir, "CRUK0010_SNVs.tsv")
pd.read_csv(example_tsv_fn, sep="\t")

Unnamed: 0,anatomical_site_index,anatomical_site_label,cluster_index,character_label,ref,var,var_read_prob,site_category
0,0,metachronousMet_BR_LN1,0,SETD2:3:47103798:G,128,40,0.250,metastasis
1,1,metachronousMet_BR_LN2,0,SETD2:3:47103798:G,212,64,0.200,metastasis
2,2,metachronousMet_BR_LN3,0,SETD2:3:47103798:G,188,47,0.220,metastasis
3,3,LN_SU_FLN1,0,SETD2:3:47103798:G,422,96,0.250,metastasis
4,4,primary_SU_T1,0,SETD2:3:47103798:G,81,236,0.701,primary
...,...,...,...,...,...,...,...,...
1159,1,metachronousMet_BR_LN2,6,MAN2C1:15:75651122:C,528,0,0.100,metastasis
1160,2,metachronousMet_BR_LN3,6,MAN2C1:15:75651122:C,532,0,0.110,metastasis
1161,3,LN_SU_FLN1,6,MAN2C1:15:75651122:C,494,11,0.125,metastasis
1162,4,primary_SU_T1,6,MAN2C1:15:75651122:C,624,0,0.305,primary


## Step 1: Load filepaths to clone trees and tsv files for each patient

In [13]:
# One clone tree file and one SNV tsv per patient, listed in the same order as `patients`
patients = ["CRUK0003", "CRUK0010", "CRUK0013", "CRUK0029"]
clone_tree_fns = list(map(lambda pid: os.path.join(input_dir, pid + "_tree.txt"), patients))
ref_var_fns = list(map(lambda pid: os.path.join(input_dir, pid + "_SNVs.tsv"), patients))

## Step 2: Run Metient-calibrate

In [14]:
# NOTE(review): `weights` is constructed here but is not passed to
# met.calibrate below -- confirm whether it is still needed.
weights = met.Weights(gen_dist=0.1)
# Printing/visualization options for the run.
# presumably k_best_trees is the number of top solutions to report -- TODO confirm
print_config = met.PrintConfig(visualize=True, verbose=False, k_best_trees=5)
# Run calibration over all patients' clone trees and tsvs together.
# The recorded output of this cell shows results being saved under
# <output_dir>/calibrate (an existing directory there is overwritten).
met.calibrate(clone_tree_fns, ref_var_fns, print_config, 
              output_dir, patients)


Saving results to /lila/data/morrisq/divyak/projects/metient/metient/data/tutorial/outputs/calibrate
Overwriting existing directory at /lila/data/morrisq/divyak/projects/metient/metient/data/tutorial/outputs/calibrate
Calibrating for patient: CRUK0003


ValueError: Input tsv needs required columns: ['anatomical_site_index', 'anatomical_site_label', 'character_label', 'ref', 'var', 'var_read_prob', 'site_category']

## Step 3: Use the pickle file outputs for downstream analysis

### In addition to the visualizations that Metient provides, we also save pkl.gz files for each Metient run that contain all the results of the run.

In [9]:
import gzip
import pickle

# Open the gzipped pickle saved for CRUK0003 in the calibrate output folder
results_fn = os.path.join(output_dir, "calibrate", "CRUK0003_primary_SU_T1.pkl.gz")
with gzip.open(results_fn, "rb") as f:
    pckl = pickle.load(f)
print(pckl.keys())

# First entry of each result list:
#   V -- the best ancestral labeling
#   A -- the adjacency matrix (input clone tree + inferred leaf nodes)
V = pckl['ancestral_labelings'][0]
A = pckl['full_adjacency_matrices'][0]

# G -- the migration graph derived from V and A
G = met.get_migration_graph(V, A)
print("\nmigration graph:\n", G)

# Seeding pattern for this patient (e.g. "polyclonal single-source seeding")
seeding_pattern = met.get_verbose_seeding_pattern(V, A)
print("\ninferred seeding pattern:\n", seeding_pattern)

dict_keys(['ancestral_labelings', 'losses', 'full_tree_node_idx_to_labels', 'full_adjacency_matrices', 'ordered_anatomical_sites', 'loss_dict', 'primary_site', 'subclonal_presence_matrices', 'pars_weights', 'soft_v', 'gen_dist_matrices'])

migration graph:
 tensor([[0., 0.],
        [2., 0.]])

inferred seeding pattern:
 polyclonal primary single-source seeding
