### Load paths

In [1]:
import pyreadr
import os
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

REPO_DIR = os.path.join(os.getcwd(), "../")

TRACERX_DATA_DIR = os.path.join(REPO_DIR, "data/tracerx_nsclc/")
OUTPUT_DIR = os.path.join(REPO_DIR, "data/tracerx_nsclc/patient_data")
CONIPHER_DIR = os.path.join(REPO_DIR, "data/tracerx_nsclc/conipher_inputs")

### Load sample overview information (from TRACERx publicly available data)

In [2]:
sample_info_df= pd.read_csv(os.path.join(TRACERX_DATA_DIR,"sample_overview_original.txt"), sep="\t")
print(sample_info_df['sampleType'].value_counts())
print(sample_info_df['sampleTypeDetail'].value_counts())
sample_info_df

primary       476
metastasis    218
Name: sampleType, dtype: int64
primary            476
LN                 148
metachronousMet     67
synchronousMet       3
Name: sampleTypeDetail, dtype: int64


Unnamed: 0,patient_id,tumour_id,region,sampleType,sampleTypeDetail
0,CRUK0010,CRUK0010,CRUK0010_SU_T1.R1,primary,primary
1,CRUK0010,CRUK0010,CRUK0010_SU_T1.R2,primary,primary
2,CRUK0010,CRUK0010,CRUK0010_SU_FLN1,metastasis,LN
3,CRUK0010,CRUK0010,CRUK0010_BR_LN1,metastasis,metachronousMet
4,CRUK0010,CRUK0010,CRUK0010_BR_LN2,metastasis,metachronousMet
...,...,...,...,...,...
689,CRUK0872,CRUK0872,CRUK0872_SU_T1.R1,primary,primary
690,CRUK0872,CRUK0872,CRUK0872_SU_T1.R2,primary,primary
691,CRUK0872,CRUK0872,CRUK0872_SU_T1.R3,primary,primary
692,CRUK0872,CRUK0872,CRUK0872_SU_T1.R4,primary,primary


### Load purity and ploidy information (from TRACERx publicly available data)

In [3]:
purity_ploidy_df = pd.read_csv(os.path.join(TRACERX_DATA_DIR, "20220808_purityandploidy.txt"), sep="\t")
purity_ploidy_df

Unnamed: 0,Patient,Sample,final.purity,final.ploidy
0,CRUK0090,CRUK0090_SU_T1.R2,0.54,3.15
1,CRUK0090,CRUK0090_BP_T1.R1,0.74,3.25
2,CRUK0090,CRUK0090_BR_T1.R1,0.20,3.30
3,CRUK0090,CRUK0090_SU_T1.R1,0.58,3.20
4,CRUK0090,CRUK0090_BP_T2.R1,0.23,3.40
...,...,...,...,...
689,CRUK0810,CRUK0810_SU_T1.R4,0.49,3.85
690,CRUK0810,CRUK0810_SU_T1.R1,0.25,3.70
691,CRUK0810,CRUK0810_SU_T1.R8,0.20,3.95
692,CRUK0810,CRUK0810_SU_T1.R6,0.39,3.65


### Load all patient mutation data (from TRACERx publicly available data)

In [4]:
# NOTE: not including this rda file in the repo because its big, but it can be downloaded here: https://zenodo.org/records/7649257
patient_data_all = pyreadr.read_r(os.path.join('/data/morrisq/divyak/data/tracerx_nsclc_2023/mutTableAll.cloneInfo.20220726.rda'))
patient_data_all_df = patient_data_all['mutTableAll']
print("Patient data columns:\n ", patient_data_all_df.columns)
print(patient_data_all_df[:10]['RegionSum'])

patient_data_all_df = patient_data_all_df[['mutation_id', 'patient_id', 'Hugo_Symbol','exonic.func','NucleotideChange', 'RegionSum' , 'MajorCPN_SC', 'MinorCPN_SC', 'chr', 'start', 'stop', 'ref', 'var']]
unique_patients = patient_data_all_df['patient_id'].unique()
print("Number of unique patients:", len(unique_patients))
patient_data_all_df

Patient data columns:
  Index(['mutation_id', 'patient_id', 'tumour_id', 'chr', 'start', 'stop', 'ref',
       'var', 'Hugo_Symbol', 'func', 'exonic.func', 'NucleotideChange',
       'AAChange', 'GL_VAF', 'GL_nAlt', 'GL_depth', 'ITHState', 'RegionSum',
       'Is.present', 'PyCloneClonal_SC', 'PyCloneCluster_SC',
       'cleanCluster_SC', 'PhyloCCF_SC', 'combTiming_SC', 'MutCPN_SC',
       'MajorCPN_SC', 'MinorCPN_SC', 'DriverMut', 'treeClones',
       'seedingClones', 'clonalClones', 'sharedClones', 'primaryClones',
       'metClones'],
      dtype='object')
0              BR_LN1:40/168;BR_LN2:64/276;BR_LN3:47/235;SU_FLN1:96/518;SU_T1.R1:236/317;SU_T1.R2:80/227
1               BR_LN1:14/106;BR_LN2:18/155;BR_LN3:18/138;SU_FLN1:12/102;SU_T1.R1:94/153;SU_T1.R2:27/175
2              BR_LN1:44/191;BR_LN2:55/233;BR_LN3:75/260;SU_FLN1:86/288;SU_T1.R1:148/245;SU_T1.R2:92/270
3             BR_LN1:71/525;BR_LN2:61/694;BR_LN3:60/675;SU_FLN1:57/485;SU_T1.R1:385/647;SU_T1.R2:103/654
4             

Unnamed: 0,mutation_id,patient_id,Hugo_Symbol,exonic.func,NucleotideChange,RegionSum,MajorCPN_SC,MinorCPN_SC,chr,start,stop,ref,var
0,CRUK0010:3:47103798:G,CRUK0010,SETD2,stopgain SNV,c.C6148T,BR_LN1:40/168;BR_LN2:64/276;BR_LN3:47/235;SU_FLN1:96/518;SU_T1.R1:236/317;SU_T1.R2:80/227,SU_T1.R1:3;SU_T1.R2:2;SU_FLN1:2;BR_LN1:2;BR_LN2:2;BR_LN3:2,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr3,47103798,47103798,G,A
1,CRUK0010:6:45459895:C,CRUK0010,RUNX2,,,BR_LN1:14/106;BR_LN2:18/155;BR_LN3:18/138;SU_FLN1:12/102;SU_T1.R1:94/153;SU_T1.R2:27/175,SU_T1.R1:2;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,chr6,45459895,45459895,C,G
2,CRUK0010:1:156105064:C,CRUK0010,LMNA,synonymous SNV,c.C561T,BR_LN1:44/191;BR_LN2:55/233;BR_LN3:75/260;SU_FLN1:86/288;SU_T1.R1:148/245;SU_T1.R2:92/270,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:3;BR_LN1:2;BR_LN2:2;BR_LN3:3,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr1,156105064,156105064,C,T
3,CRUK0010:6:25850091:C,CRUK0010,SLC17A3,stopgain SNV,c.G979T,BR_LN1:71/525;BR_LN2:61/694;BR_LN3:60/675;SU_FLN1:57/485;SU_T1.R1:385/647;SU_T1.R2:103/654,SU_T1.R1:2;SU_T1.R2:1;SU_FLN1:2;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,chr6,25850091,25850091,C,A
4,CRUK0010:8:16001109:G,CRUK0010,MSR1,nonsynonymous SNV,c.C991G,BR_LN1:16/116;BR_LN2:21/141;BR_LN3:27/180;SU_FLN1:49/281;SU_T1.R1:76/152;SU_T1.R2:31/139,SU_T1.R1:1;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr8,16001109,16001109,G,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136310,CRUK0872:22:40080280:C,CRUK0872,CACNA1I,,,SU_FLN1:18/668;SU_LN1:1/446;SU_T1.R1:0/602;SU_T1.R2:0/501;SU_T1.R3:0/443;SU_T1.R4:0/463,SU_T1.R1:3;SU_T1.R2:3;SU_T1.R3:2;SU_T1.R4:3;SU_FLN1:3,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:3;SU_FLN1:2,chr22,40080280,40080280,C,T
136311,CRUK0872:14:99872776:T,CRUK0872,SETD3,,,SU_FLN1:0/573;SU_LN1:0/518;SU_T1.R1:0/609;SU_T1.R2:0/625;SU_T1.R3:0/581;SU_T1.R4:13/510,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,chr14,99872776,99872776,T,C
136312,CRUK0872:8:110131566:C,CRUK0872,TRHR,stopgain SNV,c.C1079A,SU_FLN1:0/562;SU_LN1:3/616;SU_T1.R1:2/716;SU_T1.R2:0/763;SU_T1.R3:0/722;SU_T1.R4:16/650,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,chr8,110131566,110131566,C,A
136313,CRUK0872:1:51032853:G,CRUK0872,FAF1,nonsynonymous SNV,c.C1164G,SU_FLN1:0/389;SU_LN1:0/394;SU_T1.R1:0/464;SU_T1.R2:0/484;SU_T1.R3:11/449;SU_T1.R4:0/428,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:2,SU_T1.R1:2;SU_T1.R2:2;SU_T1.R3:2;SU_T1.R4:2;SU_FLN1:1,chr1,51032853,51032853,G,C


In [5]:
patient_data_all_df[patient_data_all_df['patient_id']=="CRUK0010"]

Unnamed: 0,mutation_id,patient_id,Hugo_Symbol,exonic.func,NucleotideChange,RegionSum,MajorCPN_SC,MinorCPN_SC,chr,start,stop,ref,var
0,CRUK0010:3:47103798:G,CRUK0010,SETD2,stopgain SNV,c.C6148T,BR_LN1:40/168;BR_LN2:64/276;BR_LN3:47/235;SU_FLN1:96/518;SU_T1.R1:236/317;SU_T1.R2:80/227,SU_T1.R1:3;SU_T1.R2:2;SU_FLN1:2;BR_LN1:2;BR_LN2:2;BR_LN3:2,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr3,47103798,47103798,G,A
1,CRUK0010:6:45459895:C,CRUK0010,RUNX2,,,BR_LN1:14/106;BR_LN2:18/155;BR_LN3:18/138;SU_FLN1:12/102;SU_T1.R1:94/153;SU_T1.R2:27/175,SU_T1.R1:2;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,chr6,45459895,45459895,C,G
2,CRUK0010:1:156105064:C,CRUK0010,LMNA,synonymous SNV,c.C561T,BR_LN1:44/191;BR_LN2:55/233;BR_LN3:75/260;SU_FLN1:86/288;SU_T1.R1:148/245;SU_T1.R2:92/270,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:3;BR_LN1:2;BR_LN2:2;BR_LN3:3,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr1,156105064,156105064,C,T
3,CRUK0010:6:25850091:C,CRUK0010,SLC17A3,stopgain SNV,c.G979T,BR_LN1:71/525;BR_LN2:61/694;BR_LN3:60/675;SU_FLN1:57/485;SU_T1.R1:385/647;SU_T1.R2:103/654,SU_T1.R1:2;SU_T1.R2:1;SU_FLN1:2;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,chr6,25850091,25850091,C,A
4,CRUK0010:8:16001109:G,CRUK0010,MSR1,nonsynonymous SNV,c.C991G,BR_LN1:16/116;BR_LN2:21/141;BR_LN3:27/180;SU_FLN1:49/281;SU_T1.R1:76/152;SU_T1.R2:31/139,SU_T1.R1:1;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr8,16001109,16001109,G,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,CRUK0010:16:24202576:G,CRUK0010,PRKCB,,,BR_LN1:15/599;BR_LN2:0/836;BR_LN3:0/928;SU_FLN1:0/805;SU_T1.R1:0/1274;SU_T1.R2:2/971,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:2;BR_LN1:2;BR_LN2:2;BR_LN3:2,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:2;BR_LN1:2;BR_LN2:2;BR_LN3:2,chr16,24202576,24202576,G,A
238,CRUK0010:1:151584864:G,CRUK0010,SNX27,nonsynonymous SNV,c.G187A,BR_LN1:0/273;BR_LN2:0/409;BR_LN3:0/435;SU_FLN1:0/923;SU_T1.R1:0/350;SU_T1.R2:11/446,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:3;BR_LN1:2;BR_LN2:2;BR_LN3:3,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr1,151584864,151584864,G,A
239,CRUK0010:8:623776:G,CRUK0010,ERICH1,synonymous SNV,c.C576T,BR_LN1:0/548;BR_LN2:0/756;BR_LN3:18/736;SU_FLN1:0/717;SU_T1.R1:0/609;SU_T1.R2:0/684,SU_T1.R1:1;SU_T1.R2:1;SU_FLN1:1;BR_LN1:1;BR_LN2:1;BR_LN3:1,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr8,623776,623776,G,A
240,CRUK0010:1:178310696:A,CRUK0010,RASAL2,,,BR_LN1:0/391;BR_LN2:0/569;BR_LN3:0/651;SU_FLN1:0/607;SU_T1.R1:0/550;SU_T1.R2:12/551,SU_T1.R1:2;SU_T1.R2:2;SU_FLN1:3;BR_LN1:2;BR_LN2:2;BR_LN3:3,SU_T1.R1:0;SU_T1.R2:0;SU_FLN1:0;BR_LN1:0;BR_LN2:0;BR_LN3:0,chr1,178310696,178310696,A,G


### Prep data for input into tree building methods (CONIPHER, PairTree, Orchard) and migration history methods (Metient and MACHINA)


In [8]:
'''
For Matient/MACHINA, need a tsv with minimum columns: ['#sample_index', 'sample_label', 'anatomical_site_index',
'anatomical_site_label', 'character_index', 'character_label','ref', 'var']

For CONIPHER, need a tsv with minimum columns: ['CASE_ID', 'SAMPLE', 'CHR', 'POS', 'REF', 'ALT', 
'REF_COUNT', 'VAR_COUNT', 'DEPTH', 'COPY_NUMBER_A', 'COPY_NUMBER_B', 'ACF', 'PLOIDY']
'''

import re
from metient.util import pairtree_data_extraction_util as pt_util
import numpy as np

def remove_region_suffix(input_string):
    pattern = r'\.FR\d+$'
    modified_string = re.sub(pattern, '', input_string)
    pattern = r'\.R\d+$'
    modified_string = re.sub(pattern, '', modified_string)
    return modified_string

def generate_sample_and_mut_info(row):
    '''
    Given a row in the patient_data_all_df dataframe, yields three objects:
        (1) the first for MACHINA/Matient input, 
        with columns: [sample_name, mutation_name, sample_type_detailed, ref_count,var_count, sample_type, purity, ploidy, major_cn, minor_cn]
        e.g. ["SETD3:2:14:99872776:T", "BR_LN1", "LN_SU_FLN1", 43,270, "metastasis", 0.54, 3.19]
        
        (2) the second for CONIPHER input,
        with columns: [patient_id, sample_name, chr, pos, ref, alt, ref_count, var_count, depth,
        copy_number_a, copy_number_b, purity, ploidy]
        
    '''
    mut_id = ":".join(row['mutation_id'].split(":")[1:])
    mut_name = f"{row['Hugo_Symbol']}:{mut_id}"
        
    # RegionSum  column values need to be split by sample 
    # (e.g. "SU_FLN1:0/562;SU_LN1:3/616")    
    sample_name_to_mut_counts = {x.split(":")[0]:x.split(":")[1] for x in row['RegionSum'].split(";")}
    # Don't include mutations without copy number information
    if isinstance(row['MajorCPN_SC'], float) or isinstance(row['MinorCPN_SC'], float):
        yield None
        return
    sample_name_to_major_cn = {x.split(":")[0]:x.split(":")[1] for x in row['MajorCPN_SC'].split(";")}
    sample_name_to_minor_cn = {x.split(":")[0]:x.split(":")[1] for x in row['MinorCPN_SC'].split(";")}

    for sample_name in sample_name_to_mut_counts.keys():
        mut_count = sample_name_to_mut_counts[sample_name]
        # Lookup sample info in sample_overview dataframe
        patient_sample_name = row['patient_id']+"_"+sample_name
        sample_info = sample_info_df[sample_info_df['region']==patient_sample_name]
        # Low purity samples
        if len(sample_info) == 0:
            yield None
            continue
        
        sample_type = sample_info['sampleType'].item()
        sample_type_detailed = sample_info['sampleTypeDetail'].item()+"_"+sample_name
        
        # Lookup purity and ploidy info
        purity_ploidy_info = purity_ploidy_df[purity_ploidy_df["Sample"]==patient_sample_name]
        if len(purity_ploidy_info) == 0:
            yield None
            continue
        purity = purity_ploidy_info["final.purity"].item()
        ploidy =  purity_ploidy_info["final.ploidy"].item()
        
        if purity <= 0.1:
            yield None
            continue
            
        # Get ref and var counts
        var = int(mut_count.split("/")[0])
        total = int(mut_count.split("/")[1])
        ref = total - var
        
        # According to TRACERx: muts w/o copy number calls in this dataset are muts that either do not overlap with 
        # copy number  values – (e.g. on sex chromosomes or mutations from regions with low purity), and therefore 
        # not included in clustering and phylogenetic tree building.  
        if (sample_name not in sample_name_to_major_cn) or (sample_name not in sample_name_to_minor_cn):
            yield None
            continue
        cn_a_count = sample_name_to_major_cn[sample_name]
        cn_b_count = sample_name_to_minor_cn[sample_name]

        # remove_region_suffix(sample_type_detailed) means samples from multiple regions
        # of the same primary get mapped to the same anatomical site label, e.g. primary.T1.R1 
        # and primary.T1.R2 get mapped to primary.T1. This means they get pooled together below as well.
        mig_hist_input = [mut_name, patient_sample_name, remove_region_suffix(sample_type_detailed), 
                          ref, var,  sample_type_detailed, sample_type, purity, ploidy, cn_a_count, cn_b_count]
            
        conipher_input = [ row['patient_id'], patient_sample_name, row['chr'].replace('chr', ''), row['start'], 
                          row['ref'], row['var'], ref, var, total,cn_a_count, cn_b_count, purity, ploidy ]
        
        yield mig_hist_input, conipher_input

        
def format_mig_hist_df(data):
    sample_mut_df = pd.DataFrame(data)
    sample_mut_df.columns = ["character_label", "sample_label", "anatomical_site_label", "ref",
                            "var", "original_anatomical_site_label", "sample_type", "purity", "ploidy", "major_cn", "minor_cn"]
    
    # Add indices for mutations, samples and anatomical sites as needed for input format
    sample_mut_df['sample_index'] = sample_mut_df.apply(lambda row: list(sample_mut_df['sample_label'].unique()).index(row["sample_label"]), axis=1)    
    sample_mut_df['character_index'] = sample_mut_df.apply(lambda row: list(sample_mut_df['character_label'].unique()).index(row["character_label"]), axis=1)
    sample_mut_df['anatomical_site_index'] = sample_mut_df.apply(lambda row: list(sample_mut_df['anatomical_site_label'].unique()).index(row["anatomical_site_label"]), axis=1)
    
    sample_mut_df = sample_mut_df[['sample_index', "sample_label", "anatomical_site_index", "anatomical_site_label", 
                                 'character_index', "character_label", "ref","var", "original_anatomical_site_label", 
                                   "sample_type","purity", "ploidy", "major_cn", "minor_cn"]]

    return sample_mut_df

def format_conipher_df(data):
    sample_mut_df = pd.DataFrame(data)
    sample_mut_df.columns = ['CASE_ID', 'SAMPLE', 'CHR', 'POS', 'REF', 'ALT', 'REF_COUNT', 'VAR_COUNT', 
                             'DEPTH', 'COPY_NUMBER_A', 'COPY_NUMBER_B', 'ACF', 'PLOIDY']
    sample_mut_df = sample_mut_df.dropna()
    return sample_mut_df

def write_pyclone_dfs(data, patient_id, output_dir):
    '''
    Return a dictionary of dfs, one for each sample, each with columns: 
        [mutation_id, ref_counts, var_counts, normal_cn, minor_cn, major_cn]
    '''
    patient_pyclone_dir = os.path.join(output_dir, "pyclone_analysis", patient_id)
    if not os.path.exists(patient_pyclone_dir):
        os.makedirs(patient_pyclone_dir)
    
    mig_hist_df = format_mig_hist_df(data)
    for sample in mig_hist_df['sample_label'].unique():
        sample_subset = mig_hist_df[mig_hist_df['sample_label']==sample]
        sample_subset['normal_cn'] = 2
        sample_subset.rename(columns={'character_label': 'mutation_id', 'ref': 'ref_counts', 'var': 'var_counts'}, inplace=True)
        sample_subset = sample_subset[["mutation_id", "ref_counts","var_counts","normal_cn","minor_cn","major_cn"]]
        sample_subset['major_cn'] = sample_subset['major_cn'].astype(int)
        sample_subset['minor_cn'] = sample_subset['minor_cn'].astype(int)
        sample_subset = sample_subset[sample_subset["major_cn"]!=0]
        sample_subset.to_csv(os.path.join(patient_pyclone_dir, f"{patient_id}_{sample}.tsv"), sep="\t")        
    
    # Setup PyClone command to run 
    cmd = [f"bsub -n 8 -W 100:00 -R rusage[mem=8] -o output_{patient_id}.log -e error_{patient_id}.log", "PyClone run_analysis_pipeline", "--in_files"]
    
    for sample in mig_hist_df['sample_label'].unique():
        cmd.append(os.path.join(patient_pyclone_dir,  f"{patient_id}_{sample}.tsv"))
    cmd += ["--working_dir", patient_pyclone_dir, "--tumour_contents"]
    
    # Add tumour cell proportions for each sample
    for sample in mig_hist_df['sample_label'].unique():
        purity = float(purity_ploidy_df[(purity_ploidy_df['Patient']==patient_id) &(purity_ploidy_df['Sample']==sample)]['final.purity'].item())
        cmd.append(str(purity))
    # Add sample names
    cmd.append("--samples")
    for sample in mig_hist_df['sample_label'].unique():
        cmd.append(sample)
    cmd += ["--max_clusters", "30", "--num_iters", "10000", "--burnin", "5000"]
    return " ".join(cmd)
        


### Write all tsvs/json/ssm files

In [None]:
pyclone_cmds = []
for patient_id in unique_patients:
    print(patient_id)
    output_rows = []
    subset = patient_data_all_df[patient_data_all_df['patient_id'] == patient_id]
    print(len(subset))
    
    mig_hist_data = []
    conipher_data = []
    for _, row in subset.iterrows(): # each row is one mutation
        generator = generate_sample_and_mut_info(row)
        for data in generator:
            if data == None:
                continue
            mig_hist_input, conipher_input = data
            mig_hist_data.append(mig_hist_input)
            conipher_data.append(conipher_input)
    mig_hist_df = format_mig_hist_df(mig_hist_data)
    conipher_df = format_conipher_df(conipher_data)
    # Write migration history inputs
    mig_hist_df.to_csv(os.path.join(OUTPUT_DIR, f"{patient_id}_SNVs.tsv"), sep="\t")
    # Write conipher inputs
    conipher_df.to_csv(os.path.join(CONIPHER_DIR, f"{patient_id}_conipher_SNVs.tsv"), sep="\t")
    # Write pairtree inputs
    pt_util.write_pairtree_inputs(mig_hist_df, patient_id, OUTPUT_DIR)
    # Write pyclone inputs
    pyclone_cmd = write_pyclone_dfs(mig_hist_data, patient_id, os.path.join(OUTPUT_DIR, "../"))
    pyclone_cmds.append(pyclone_cmd)

CRUK0010
242


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0013
246


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0284
1331


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0361
1484


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0497
1654


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0452
525


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0472
988


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0484
1333


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0418
11308


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0721
2856


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0702
646


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0794
366


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0004
550


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0027
1157


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0044
577


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_subset['normal_cn'] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#re

CRUK0052
1541


In [7]:
for cmd in pyclone_cmds:
    print(cmd)

NameError: name 'pyclone_cmds' is not defined

In [None]:
# Assemble conipher clustering and tree building commands
with open(os.path.join(TRACERX_DATA_DIR, 'conipher_outputs', 'TreeBuilding', "run_all_clustering_tree_building_test.sh"), "w") as f:
    f.write("#!/bin/bash")
    f.write("\n")
    for patient_id in unique_patients:
        f.write(f"bsub -J \"conipher_job_$p\" -n 8 -W 100:00 -o output%I.log -e error_%I.log ./run_single_clustering_tree_building.sh ../../conipher_inputs/ /data/morrisq/divyak/projects/CONIPHER-wrapper/ ./ {patient_id}_conipher_SNVs")
        f.write("\n")

### Above is all on individual mutations. **After clustering mutations using PyClone**, we have to prep clustered inputs for Metient

#### Take PyClone clusters and create pooled tsvs

In [None]:
import numpy as np
import os
from metient.util import pairtree_data_extraction_util as pt_util
from metient.util import data_extraction_util as dutil
import csv
CONIPHER_CLUSTERING_DIR = os.path.join(TRACERX_DATA_DIR, 'conipher_outputs', 'TreeBuilding')


final_cols = ['anatomical_site_index','anatomical_site_label', 'cluster_index', 'character_index','character_label', 'ref', 'var', 'var_read_prob', 'site_category']

for patient_id in unique_patients:
    # Load conipher pyclone clusters
    clusters_tsv_fn = os.path.join(CONIPHER_CLUSTERING_DIR, f"{patient_id}_conipher_SNVstreeTable_cleaned.tsv")
    pyclone_df = pd.read_csv(clusters_tsv_fn, delimiter="\t")
    df = pd.read_csv(os.path.join(OUTPUT_DIR, f"{patient_id}_SNVs.tsv"), delimiter="\t", index_col=0)
    print(df)
    mut_idx_to_cluster_id = dict()
    muts_no_cluster = set()
    
    # 1. Get mapping between mutation names and PyClone cluster ids
    for _, row in df.iterrows():
        mut_items = row['character_label'].split(":")
        cluster_id = pyclone_df[(pyclone_df['CHR']==int(mut_items[1]))&(pyclone_df['POS']==int(mut_items[2]))&(pyclone_df['REF']==mut_items[3])]['treeCLUSTER'].unique()
        assert(len(cluster_id) <= 1)
        if len(cluster_id) == 1:
            cluster_id = int(cluster_id.item())
            mut_idx_to_cluster_id[int(row['character_index'])] = cluster_id
        else:
            muts_no_cluster.add(int(row['character_index']))
    print("Mutations without a pyclone cluster", len(set(muts_no_cluster)))
    df = df[~df['character_index'].isin(muts_no_cluster)]
    for val in list(df['purity']):
        assert(val>0.1)
    
    df['var_read_prob'] = df.apply(lambda row: dutil.calc_var_read_prob(row['major_cn'], row['minor_cn'], row['purity']), axis=1)
    df['site_category'] = df.apply(lambda row: 'primary' if 'primary' in row['anatomical_site_label'] else 'metastasis', axis=1)
    # If there is not a primary and metastasis sample with >10% purity, don't use this patient
    if (set(df['site_category'])!=set(['primary', 'metastasis'])):
        print("Patient does not have high purity primary and met", patient_id)
        continue
    
    df['cluster_index'] = df.apply(lambda row: int(mut_idx_to_cluster_id[row['character_index']]), axis=1)
    df['character_label'] = df.apply(lambda row: row['character_label'].split(":")[0], axis=1)
    df = df[final_cols]
    output_fn = os.path.join(OUTPUT_DIR,"conipher_pyclone_clustered", f"{patient_id}_SNVs.tsv")
    with open(output_fn, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(df.columns)
        for _, row in df.iterrows():
            writer.writerow(row)


In [None]:
for patient in unique_patients:
    df = pd.read_csv(os.path.join(OUTPUT_DIR,"conipher_pyclone_clustered", f"{patient}_SNVs.tsv"), sep="\t", index_col=False)
    if (set(df['site_category'])!=set(['primary', 'metastasis'])):
        print(patient)