In [1]:
import pyreadr
import os
import pandas as pd
import glob

REPO_DIR = os.path.join(os.getcwd(), "../../")

TRACERX_DATA_DIR = "/data/morrisq/divyak/data/tracerx_nsclc_2023/tracerx_full_tree_output_extracted"
OUTPUT_DIR = os.path.join(REPO_DIR, "data/tracerx_nsclc/tracerx_provided_patient_data")

# Get all patient ids
pattern, suffix = "*_ccf_table_pyclone_clean.csv", "_ccf_table_pyclone_clean.csv"
file_paths = glob.glob(os.path.join(TRACERX_DATA_DIR, pattern))
pids = [os.path.basename(file_path).replace(suffix, "") for file_path in file_paths]
len(pids)

126

### Extract TRACERx provided information on trees and phyloCCFs

In [2]:
CORRECTED_TREE_STR = "$graph_pyclone$Corrected_tree"
CCF_TABLE_STR = "$nested_pyclone$ccf_cluster_table"
CLONALITY_TABLE_STR = "$clonality_out$clonality_table_corrected"
PYCLONE_CLSTR_STR = "PycloneCluster"
GEN_DIST_STR = "$graph_pyclone$edgelength"
MAX_REACHED_STR = "[ reached 'max' "
SKIP_STRINGS = ["i", "last_ancestor terminal_node"]

import json 
from pyensembl import EnsemblRelease
ensembl = EnsemblRelease(75) # hg19

def collect_data(fn, matching_str):
    with open(fn, "r") as f:
        data = []
        collect = False

        for line in f:
            if matching_str in line:
                collect = True
            elif collect:
                line = line.strip()
                if not line or MAX_REACHED_STR in line:  # Empty line or newline character encountered
                    collect = False
                else:
                    items = line.split()
                    data.append(items)
        return data
    
def get_gene_name(chromosome, position):
    # Fetch the gene that overlaps with the specified position
    gene = ensembl.genes_at_locus(chromosome, position)
    if gene:
        return gene[0].gene_name
    else:
        return f"{chromosome}:{position}"
    
def write_best_tree(pid):
    with open(os.path.join(TRACERX_DATA_DIR, f"{pid}.txt"), "r") as f:
        edges = []
        unique_clusters = set()
        collect = False
        for line in f:
            if CORRECTED_TREE_STR in line:
                collect = True
            elif collect:
                line = line.strip()
                if line:  # If the line is not empty
                    if line not in SKIP_STRINGS:
                        items = line.replace('\"', "").split()
                        edges.append((int(items[1]), int(items[2]))) 
                        unique_clusters.add(int(items[1]))
                        unique_clusters.add(int(items[2]))
                else:  # Empty line or newline character encountered
                    collect = False
    unique_clusters = sorted(list(unique_clusters)) 
    # Make it zero-indexed, and remove any clusters not used in trees
    old_clust_to_new_clust = {unique_clusters[i]:i for i in range(len(unique_clusters))}
    with open(os.path.join(OUTPUT_DIR, f"{pid}_tree.txt"), 'w') as f:
        for edge in edges:
            i = old_clust_to_new_clust[edge[0]]
            j = old_clust_to_new_clust[edge[1]]
            f.write(f"{i} {j}")
            f.write("\n")
    return old_clust_to_new_clust

def write_gen_dist(pid, old_clust_to_new_clust):
    fn = os.path.join(TRACERX_DATA_DIR, f"{pid}.txt")
    with open(fn, 'r') as file:
        lines = file.readlines()
    
        # Initialize variables
        edge_lengths_started = False
        edge_lengths_dict = {}
        keys, values = [], []
        for line in lines:
            # Check for the starting line of edge lengths
            if "$graph_pyclone$edgelength" in line:
                edge_lengths_started = True
                ctr = 0
                continue
            # Stop collecting edge lengths if the trunk section starts
            if "$graph_pyclone$trunk" in line:
                break
            # Collect edge lengths
            if edge_lengths_started:
                ctr += 1
                # Split line by spaces and filter out empty strings
                parts = list(filter(None, line.strip().split(" ")))
                # If parts contain integers, they are the keys
                if ctr % 2 == 0:
                    keys.extend([int(part) for part in parts])
                # If parts contain integers, they are the values
                else:
                    values.extend([int(part) for part in parts])
                    # Build dictionary from keys and values
    edge_lengths_dict = {old_clust_to_new_clust[k]:v for k,v in zip(keys,values) if k in old_clust_to_new_clust}
    with open(os.path.join(OUTPUT_DIR, f'{pid}_edge_lengths.json'), 'w') as f:
        json.dump(edge_lengths_dict, f, indent=4)
    
def write_cluster_to_mutation_names(pid, old_clust_to_new_clust):
    fn = os.path.join(TRACERX_DATA_DIR, f"{pid}.txt")
    data = collect_data(fn, PYCLONE_CLSTR_STR)
    if len(data[0]) == 3:
        print(pid, "3 lines in cluster info")
            
    clstr_to_mut_names = dict()
    for entry in data:
        clstr_idx = 2 if len(entry) == 3 else 1
            
        # Only keep data for clusters actually used in trees
        if int(entry[clstr_idx]) not in old_clust_to_new_clust:
            continue
        clstr = old_clust_to_new_clust[int(entry[clstr_idx])]
        mut_items = entry[0].split(":")
        gene_name = get_gene_name(mut_items[1], int(mut_items[2]))
        if clstr not in clstr_to_mut_names:
            clstr_to_mut_names[clstr] = []
        clstr_to_mut_names[clstr].append(gene_name)
    #print(clstr_to_mut_names)
    missing_clstr_labels = set(old_clust_to_new_clust.values()) - set(clstr_to_mut_names.keys())
    print("missing_clstr_labels", missing_clstr_labels)
    if len(missing_clstr_labels) > 0:
        for missing_clstr in missing_clstr_labels:
            clstr_to_mut_names[missing_clstr] = [missing_clstr]
    assert(len(clstr_to_mut_names.keys())==len(old_clust_to_new_clust.values()))
    with open(os.path.join(OUTPUT_DIR, f'{pid}_cluster_id_to_mut_names.json'), 'w') as f:
        json.dump(clstr_to_mut_names, f, indent=4)
    
def format_df_data(data, old_clust_to_new_clust):
    sample_names,df_data = [],{}
    for d in data:
        if "CRUK" in d[0]:
            sample_names.extend(d)
        else:
            # Only keep data for clusters actually used in trees
            if int(d[0]) in old_clust_to_new_clust:
                cluster_num = old_clust_to_new_clust[int(d[0])]
                if cluster_num not in df_data:
                    df_data[cluster_num] = []
                df_data[cluster_num].extend(d[1:])
    df = pd.DataFrame.from_dict(df_data, columns=sample_names, orient='index')
    df = df.rename_axis('cluster').reset_index()
    return df
    
def write_clonalities(pid, old_clust_to_new_clust):
    fn = os.path.join(TRACERX_DATA_DIR, f"{pid}.txt")
    data = collect_data(fn, CLONALITY_TABLE_STR)
    df = format_df_data(data, old_clust_to_new_clust)
    df.to_csv(os.path.join(OUTPUT_DIR, f"{pid}_clonalities.csv"), index=False)
    
def write_clone_phylo_ccfs(pid, old_clust_to_new_clust):
    fn = os.path.join(TRACERX_DATA_DIR, f"{pid}.txt")
    data = collect_data(fn, CCF_TABLE_STR)
    df = format_df_data(data, old_clust_to_new_clust)
    df.to_csv(os.path.join(OUTPUT_DIR, f"{pid}_phyloccfs.csv"), index=False)

pid_to_old_clust_to_new_clust = dict()
for pid in pids:
    print(pid)
    old_clust_to_new_clust = write_best_tree(pid)
    pid_to_old_clust_to_new_clust[pid] = old_clust_to_new_clust
    write_clone_phylo_ccfs(pid, old_clust_to_new_clust)
    write_clonalities(pid, old_clust_to_new_clust)
    write_cluster_to_mutation_names(pid, old_clust_to_new_clust)
    write_gen_dist(pid, old_clust_to_new_clust)


CRUK0468
missing_clstr_labels set()
CRUK0299
CRUK0299 3 lines in cluster info
missing_clstr_labels set()
CRUK0296
CRUK0296 3 lines in cluster info
missing_clstr_labels set()
CRUK0422
CRUK0422 3 lines in cluster info
missing_clstr_labels set()
CRUK0667
missing_clstr_labels set()
CRUK0036
missing_clstr_labels set()
CRUK0290
CRUK0290 3 lines in cluster info
missing_clstr_labels set()
CRUK0022
missing_clstr_labels set()
CRUK0236
missing_clstr_labels set()
CRUK0497
missing_clstr_labels set()
CRUK0337
CRUK0337 3 lines in cluster info
missing_clstr_labels set()
CRUK0087
missing_clstr_labels set()
CRUK0745
CRUK0745 3 lines in cluster info
missing_clstr_labels set()
CRUK0625
CRUK0625 3 lines in cluster info
missing_clstr_labels set()
CRUK0707
missing_clstr_labels set()
CRUK0044
CRUK0044 3 lines in cluster info
missing_clstr_labels set()
CRUK0552
missing_clstr_labels set()
CRUK0412
missing_clstr_labels set()
CRUK0596
CRUK0596 3 lines in cluster info
missing_clstr_labels set()
CRUK0730
CRUK0730 3

In [3]:
with open(os.path.join(TRACERX_DATA_DIR, f'tracerx_clust_to_metient_cluster.json'), 'w') as f:
        json.dump(pid_to_old_clust_to_new_clust, f, indent=4)

### Get clone presences

In [4]:
import numpy as np
import torch
from metient.util import vertex_labeling_util as vutil

    
def get_node_to_sample_to_ccf(pid):
    # Convert phyloCCF info to dict
    ccf_fn = os.path.join(OUTPUT_DIR, f"{pid}_phyloccfs.csv")
    ccf_df = pd.read_csv(ccf_fn, index_col=False)
    ccf_df.set_index('cluster', inplace=True)
    node_to_sample_to_phyloccf = ccf_df.to_dict(orient='index')
    
    # Convert clonality info to dict
    clonality_fn = os.path.join(OUTPUT_DIR, f"{pid}_clonalities.csv")
    clonality_df = pd.read_csv(clonality_fn, index_col=False)
    clonality_df.set_index('cluster', inplace=True)
    node_to_sample_to_clonality = clonality_df.to_dict(orient='index')

    for node in node_to_sample_to_clonality:
        for sample in node_to_sample_to_clonality[node]:
            node_to_sample_to_phyloccf[node][sample] /= 100.0
            if node_to_sample_to_clonality[node][sample] == 'clonal':
                node_to_sample_to_phyloccf[node][sample] = 1.0

    return node_to_sample_to_phyloccf

def get_node_to_sample_to_clone_proportion(tree, samples, node_to_sample_to_phyloccf):
    node_to_sample_to_clone_proportion = {}
    leaves = vutil.get_leaves(tree)
    #print("leaves", leaves)
    for leaf in leaves:
        node_to_sample_to_clone_proportion[leaf] = {}
        for sample in samples:
            node_to_sample_to_clone_proportion[leaf][sample] = node_to_sample_to_phyloccf[leaf][sample]
    
    
    reverse_bfs = vutil.reverse_bfs_order(tree)
    #print("reverse_bfs", reverse_bfs) 
    for sample in samples:
        for node in reverse_bfs:
            if node not in node_to_sample_to_clone_proportion:
                node_to_sample_to_clone_proportion[node] = {}

            descendants = vutil.get_descendants(tree, node)
#             descendant_ccf_sum = sum([node_to_sample_to_phyloccf[d][sample] for d in descendants])
            descendant_clone_prop_sum = sum([node_to_sample_to_clone_proportion[d][sample] for d in descendants])

            node_to_sample_to_clone_proportion[node][sample] = node_to_sample_to_phyloccf[node][sample]-descendant_clone_prop_sum
    
    
    return node_to_sample_to_clone_proportion



### Get sample information and mutation information

In [5]:
import re
from metient.util import plotting_util as plutil

sample_info_df = pd.read_csv(os.path.join(REPO_DIR, "data/tracerx_nsclc/","sample_overview_original.txt"), sep="\t")

def remove_region_suffix(input_string):
    pattern = r'\.R\d+$'
    modified_string = re.sub(pattern, '', input_string)
    return modified_string

def get_site_to_samples(pid):
    site_to_samples = {}
    site_to_category = {} # primary or metstasis
    patient_samples = sample_info_df[sample_info_df['tumour_id']==pid]
    for _,row in patient_samples.iterrows():
        region = row['region']
        sample_type = row['sampleTypeDetail']
        site = remove_region_suffix(region).replace(pid, sample_type)
        if site not in site_to_samples:
            site_to_samples[site] = []
        site_to_samples[site].append(region)
        site_to_category[site] = row['sampleType']
    return site_to_samples, site_to_category

def get_clstr_to_muts(pid):
    with open(os.path.join(OUTPUT_DIR, f'{pid}_cluster_id_to_mut_names.json'), 'r') as f:
        mut_names_data = json.load(f)
    with open(os.path.join(OUTPUT_DIR, f'{pid}_edge_lengths.json'), 'r') as f:
        edge_data = json.load(f)
        
    clstr_to_muts_info = {}
    for clstr in mut_names_data:
        mutations = mut_names_data[clstr]
        shortened_label = plutil.pruned_mut_label(mutations, True, True)
        clstr_to_muts_info[int(clstr)] = (mutations, shortened_label, edge_data[clstr])
    return clstr_to_muts_info


In [6]:
sample_info_df

Unnamed: 0,patient_id,tumour_id,region,sampleType,sampleTypeDetail
0,CRUK0010,CRUK0010,CRUK0010_SU_T1.R1,primary,primary
1,CRUK0010,CRUK0010,CRUK0010_SU_T1.R2,primary,primary
2,CRUK0010,CRUK0010,CRUK0010_SU_FLN1,metastasis,LN
3,CRUK0010,CRUK0010,CRUK0010_BR_LN1,metastasis,metachronousMet
4,CRUK0010,CRUK0010,CRUK0010_BR_LN2,metastasis,metachronousMet
...,...,...,...,...,...
689,CRUK0872,CRUK0872,CRUK0872_SU_T1.R1,primary,primary
690,CRUK0872,CRUK0872,CRUK0872_SU_T1.R2,primary,primary
691,CRUK0872,CRUK0872,CRUK0872_SU_T1.R3,primary,primary
692,CRUK0872,CRUK0872,CRUK0872_SU_T1.R4,primary,primary


In [7]:
sample_info_df[sample_info_df['patient_id']=='CRUK0620']

Unnamed: 0,patient_id,tumour_id,region,sampleType,sampleTypeDetail
186,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_LN1,metastasis,LN
187,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_T1.R5,primary,primary
188,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_T2.R1,primary,primary
189,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_T2.R2,primary,primary
190,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_T2.R3,primary,primary
191,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_T2.R4,primary,primary
192,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_FLN3,metastasis,LN
193,CRUK0620,CRUK0620_Tumour1,CRUK0620_SU_FLN2,metastasis,LN


### Write all inputs for Metient to file

In [8]:
import csv
import pprint
from metient.util import data_extraction_util as dutil

def process_patient(pid):
    print(pid)
    
    tree_fn = os.path.join(OUTPUT_DIR, f"{pid}_tree.txt")
    tree = dutil.get_adjacency_matrix_from_txt_edge_list(tree_fn)
    num_nodes = tree.shape[0]
    
    node_to_sample_to_phyloccf = get_node_to_sample_to_ccf(pid)

    samples = set(node_to_sample_to_phyloccf[0].keys())
    node_to_sample_to_clone_proportion = get_node_to_sample_to_clone_proportion(tree, samples, node_to_sample_to_phyloccf)
    #pprint.pprint(node_to_sample_to_clone_proportion)
    site_to_samples, site_to_category = get_site_to_samples(pid)
    clstr_to_muts_info = get_clstr_to_muts(pid)
    data = []
    for node in range(num_nodes):
        for site_idx, site_label in enumerate(site_to_samples.keys()):
            associated_samples = site_to_samples[site_label]
            presence = 0
            for sample in associated_samples:
                if node_to_sample_to_clone_proportion[node][sample] > 0.0:
                    presence = 1
            site_category = site_to_category[site_label]
            mutations = clstr_to_muts_info[node][0]
            character_label = clstr_to_muts_info[node][1]
            num_mutations = clstr_to_muts_info[node][2]
            
            data.append([site_idx, site_label, node, character_label, presence, site_category, num_mutations])
   
    new_snvs_df = pd.DataFrame(data, columns=['anatomical_site_index', 'anatomical_site_label', 'cluster_index',
                                              'cluster_label', 'present', 'site_category', 'num_mutations'])
    if set(new_snvs_df['site_category']) != set(['primary', 'metastasis']):
        print(new_snvs_df['site_category'].unique())
    output_fn = os.path.join(OUTPUT_DIR, f"{pid}_SNVs.tsv")
    with open(output_fn, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(new_snvs_df.columns)
        for _, row in new_snvs_df.iterrows():
            writer.writerow(row)

for pid in pids:
    process_patient(pid)
        

CUDA GPU: False
CRUK0468
CRUK0299
CRUK0296
CRUK0422
CRUK0667
CRUK0036
CRUK0290
CRUK0022
CRUK0236
CRUK0497
CRUK0337
CRUK0087
CRUK0745
CRUK0625
CRUK0707
CRUK0044
CRUK0552
CRUK0412
CRUK0596
CRUK0730
CRUK0691
CRUK0719
CRUK0178
CRUK0372_Tumour1
CRUK0256
CRUK0584
CRUK0666
CRUK0451
CRUK0284
CRUK0589
CRUK0587
CRUK0476
CRUK0567
CRUK0010
CRUK0004
CRUK0702
CRUK0352
CRUK0572
CRUK0227
CRUK0487
CRUK0636
CRUK0013
CRUK0496
CRUK0084_Tumour2
CRUK0736
CRUK0242
CRUK0557
CRUK0410
CRUK0530
CRUK0519
CRUK0100
CRUK0543
CRUK0465
CRUK0029
CRUK0742
CRUK0721_Tumour1
CRUK0872
CRUK0343
CRUK0722
CRUK0598
CRUK0478
CRUK0444
CRUK0063
CRUK0286
CRUK0485
CRUK0817
CRUK0495
CRUK0099
CRUK0041
CRUK0003
CRUK0514
CRUK0467
CRUK0620_Tumour1
CRUK0737
CRUK0083
CRUK0074
CRUK0484
CRUK0810
CRUK0415
CRUK0325
CRUK0510
CRUK0407
CRUK0092
CRUK0528
CRUK0344
CRUK0089
CRUK0559
CRUK0250
CRUK0762
CRUK0766
CRUK0311
CRUK0590
CRUK0183
CRUK0368
CRUK0609
CRUK0733
CRUK0418
CRUK0035
CRUK0769
CRUK0748
CRUK0369
CRUK0090
CRUK0301_Tumour1
CRUK0452
CRUK0472