In [1]:
import sys
import pandas as pd
import fnmatch
import os

repo_dir = os.path.join(os.getcwd(), "../../")
sanborn_raw_data_dir = os.path.join(repo_dir, 'data', 'sanborn_melanoma_2015')
data_dir = os.path.join(repo_dir, 'data', 'sanborn_melanoma_2015', 'cn_neutral_data')

# From Sanborn et. al. supplemental files
raw_data_fn = os.path.join(sanborn_raw_data_dir, "sanborn_melanoma_raw.xlsx")
PATIENTS = ["A", "B", "C", "D", "E", "F", "G"] # H has no CN neutral SNVs

# Collect patient dataframes
raw_dfs = {}
for patient_id in PATIENTS:
    
    df = pd.read_excel(raw_data_fn, f"Patient {patient_id}")
    df['character_label'] = df.apply(lambda row: f"{row['Gene']}:{row['Chromosome']}:{row['Start position']}:{row['Alternate base']}", axis=1)
    # Drop rows where cn info is missing
    cn_cols = [x for x in df.columns if x.startswith("Copy number at base")]
    df = df.dropna(subset=cn_cols)
    raw_dfs[patient_id] = df
    print(patient_id, len(df))

  warn(msg)


A 2109


  warn(msg)


B 135


  warn(msg)


C 1450


  warn(msg)


D 1005


  warn(msg)


E 140


  warn(msg)


F 4696
G 767


  warn(msg)


In [2]:
# Load tumor purities
def format_sample_name(name):
    return "_".join(name.lower().replace(",","").split())

purities = pd.read_csv(os.path.join(sanborn_raw_data_dir, "sanborn_melanoma_purities.csv"))
purities['Site'] = purities.apply(lambda row: format_sample_name(row['Site']), axis=1)
purities

Unnamed: 0,Patient,Site,Tumor Purity,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,A,primary_forehead,76,,,
1,A,parotid_metastasis,21,,,
2,A,locoregional_skin_metastasis_1_forehead,42,,,
3,A,locoregional_skin_metastasis_2_angle_jaw,62,,,
4,B,primary_mid-left_back,89,,,
5,B,lymph_node_metastasis_left_axilla,23,,,
6,B,locoregional_skin_metastasis_1_left_back,88,,,
7,B,locoregional_skin_metastasis_2_left_axilla,51,,,
8,C,primary_right_lower_calf,54,,,
9,C,locoregional_skin_metastasis_1_right_calf,85,,,


## Prepare inputs for pyclone clustering

In [3]:
pyclone_dir = os.path.join(data_dir, "pyclone_analysis")
pyclone_vi_dir = os.path.join(data_dir, "pyclone_vi_analysis")

def get_major_minor_cn(cn_str):
    items = cn_str.replace("(","").replace(")","").split(",")
    major_cn = int(float(items[0]))
    minor_cn = int(float(items[1]))
    return major_cn, minor_cn


patient_id_to_sample_names = {patient_id:[] for patient_id in PATIENTS}
pyclone_cols = ['mutation_id', 'ref_counts', 'var_counts', 'normal_cn', 'minor_cn', 'major_cn']

for patient_id,df in raw_dfs.items():
    print()
    print(patient_id)
    cols = list(df.columns)
    sample_names = [s.strip().lower() for s in cols[1:cols.index('Patient')]]
    if not os.path.exists(os.path.join(pyclone_dir, patient_id)):
        os.makedirs(os.path.join(pyclone_dir, patient_id))
    if not os.path.exists(os.path.join(pyclone_vi_dir, patient_id)):
        os.makedirs(os.path.join(pyclone_vi_dir, patient_id))
    pyclone_vi_data = []
    for sample in sample_names:
        cols_to_keep = ['character_label', f"Total read depth ({sample})", f"Alternate base read depth ({sample})", f"Copy number at base ({sample})"]
        subset = df[cols_to_keep]
        subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
        subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)
        subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
        subset['var_counts'] = subset[cols_to_keep[2]] 
        subset['normal_cn'] = 2 # no sex chromosomes
        subset['mutation_id'] = subset[cols_to_keep[0]]
        subset = subset[pyclone_cols]
        # Only keep CN neutral SNVs!
        subset = subset[(subset['major_cn']==1)&(subset['minor_cn']==1)]
        tsv_name = f"{patient_id}_{format_sample_name(sample)}_cn_neutral.tsv"
        print(sample, len(subset), tsv_name)
        subset.to_csv(os.path.join(pyclone_dir, patient_id, tsv_name), sep="\t")
        
        # Extra values for pyclone-vi
        subset['alt_counts'] = subset['var_counts']
        fmt_sample_name = format_sample_name(sample)
        purity = float(purities[(purities['Patient']==patient_id) &(purities['Site']==fmt_sample_name)]['Tumor Purity'].item())/100
        subset['tumour_content'] = purity
        subset['sample_id'] = fmt_sample_name
        pyclone_vi_data.append(subset)
        patient_id_to_sample_names[patient_id].append(fmt_sample_name)
        
    pyclone_vi_df = pd.concat(pyclone_vi_data)
    pyclone_vi_df.to_csv(os.path.join(pyclone_vi_dir, patient_id, f"{patient_id}_input.tsv"), sep="\t")
    print(sample_names)
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)



A
primary, forehead 1746 A_primary_forehead_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['var_counts'] = subset[cols_to_keep[2]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['normal_cn'] = 2 # no sex chromosomes
A value is try

parotid metastasis 1196 A_parotid_metastasis_cn_neutral.tsv
locoregional skin metastasis 1, forehead 1739 A_locoregional_skin_metastasis_1_forehead_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['var_counts'] = subset[cols_to_keep[2]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['normal_cn'] = 2 # no sex chromosomes
A value is try

locoregional skin metastasis 2, angle jaw 135 A_locoregional_skin_metastasis_2_angle_jaw_cn_neutral.tsv
['primary, forehead', 'parotid metastasis', 'locoregional skin metastasis 1, forehead', 'locoregional skin metastasis 2, angle jaw']

B
primary, mid-left back 79 B_primary_mid-left_back_cn_neutral.tsv
lymph node metastasis, left axilla 87 B_lymph_node_metastasis_left_axilla_cn_neutral.tsv
locoregional skin metastasis 1, left back 91 B_locoregional_skin_metastasis_1_left_back_cn_neutral.tsv
locoregional skin metastasis 2, left axilla 83 B_locoregional_skin_metastasis_2_left_axilla_cn_neutral.tsv
['primary, mid-left back', 'lymph node metastasis, left axilla', 'locoregional skin metastasis 1, left back', 'locoregional skin metastasis 2, left axilla']

C
primary, right lower calf 1200 C_primary_right_lower_calf_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[

locoregional skin metastasis 1, right calf 1222 C_locoregional_skin_metastasis_1_right_calf_cn_neutral.tsv
locoregional skin metastasis 2, right mid-calf 1202 C_locoregional_skin_metastasis_2_right_mid-calf_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[

['primary, right lower calf', 'locoregional skin metastasis 1, right calf', 'locoregional skin metastasis 2, right mid-calf']

D
primary, right ankle 15 D_primary_right_ankle_cn_neutral.tsv
lymph node metastasis, right groin 58 D_lymph_node_metastasis_right_groin_cn_neutral.tsv
locoregional skin metastasis 1, right ankle 42 D_locoregional_skin_metastasis_1_right_ankle_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[

locoregional skin metastasis 2, right leg 42 D_locoregional_skin_metastasis_2_right_leg_cn_neutral.tsv
['primary, right ankle', 'lymph node metastasis, right groin', 'locoregional skin metastasis 1, right ankle', 'locoregional skin metastasis 2, right leg']

E
primary, left heel 98 E_primary_left_heel_cn_neutral.tsv
locoregional skin metastasis 1, left heel 98 E_locoregional_skin_metastasis_1_left_heel_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['var_counts'] = subset[cols_to_keep[2]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['normal_cn'] = 2 # no sex chromosomes
A value is try

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)


locoregional skin metastasis 2, left heel 95 E_locoregional_skin_metastasis_2_left_heel_cn_neutral.tsv
lymph node metastasis, left groin 91 E_lymph_node_metastasis_left_groin_cn_neutral.tsv
locoregional skin metastasis 3, left heel 96 E_locoregional_skin_metastasis_3_left_heel_cn_neutral.tsv
['primary, left heel', 'locoregional skin metastasis 1, left heel', 'locoregional skin metastasis 2, left heel', 'lymph node metastasis, left groin', 'locoregional skin metastasis 3, left heel']

F
primary, left ear 3172 F_primary_left_ear_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['var_counts'] = subset[cols_to_keep[2]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['normal_cn'] = 2 # no sex chromosomes
A value is try

lymph node metastasis, left cervical node 3316 F_lymph_node_metastasis_left_cervical_node_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[

locoregional skin metastasis, left ear 3605 F_locoregional_skin_metastasis_left_ear_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)


distant skin metastasis, back 3099 F_distant_skin_metastasis_back_cn_neutral.tsv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['var_counts'] = subset[cols_to_keep[2]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['normal_cn'] = 2 # no sex chromosomes
A value is try

['primary, left ear', 'lymph node metastasis, left cervical node', 'locoregional skin metastasis, left ear', 'distant skin metastasis, back']

G
primary, right forearm 670 G_primary_right_forearm_cn_neutral.tsv
lung metastasis 54 G_lung_metastasis_cn_neutral.tsv
locoregional skin metastasis, axilla 541 G_locoregional_skin_metastasis_axilla_cn_neutral.tsv
['primary, right forearm', 'lung metastasis', 'locoregional skin metastasis, axilla']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['ref_counts'] = subset.apply(lambda row: row[cols_to_keep[1]]-row[cols_to_keep[2]], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[

### Setup pyclone commands for each patient

In [4]:
# Pyclone commands
for patient_id,df in raw_dfs.items():
    cmd = [f"bsub -n 8 -W 100:00 -R rusage[mem=8] -o output_{patient_id}.log -e error_{patient_id}.log", "PyClone run_analysis_pipeline", "--in_files"]
    patient_dir = os.path.join(pyclone_dir, patient_id)
    
    for sample_name in patient_id_to_sample_names[patient_id]:
        cmd.append(os.path.join(patient_dir,  f"{patient_id}_{sample_name}_cn_neutral.tsv"))
    cmd += ["--working_dir", patient_dir, "--tumour_contents"]
    
    # Add tumour cell proportions for each sample
    for sample_name in patient_id_to_sample_names[patient_id]:
        purity = float(purities[(purities['Patient']==patient_id) &(purities['Site']==sample_name)]['Tumor Purity'].item())/100
        cmd.append(str(purity))
    # Add sample names
    cmd.append("--samples")
    for sample_name in patient_id_to_sample_names[patient_id]:
        cmd.append(sample_name)
    cmd += ["--burnin", "1000", "--max_clusters", "10"]
    print(" ".join(cmd))

print("\n****\n")
# Pyclone-VI commands
for patient_id,df in raw_dfs.items():
    patient_dir = os.path.join(pyclone_vi_dir, patient_id)
    cmd1 = f"pyclone-vi fit -i {patient_dir}/{patient_id}_input.tsv -o {patient_dir}/{patient_id}.h5"
    cmd2 = f"pyclone-vi write-results-file -i {patient_dir}/{patient_id}.h5 -o {patient_dir}/{patient_id}_output.tsv"
    print(cmd1)
    print(cmd2)
    print()

bsub -n 8 -W 100:00 -R rusage[mem=8] -o output_A.log -e error_A.log PyClone run_analysis_pipeline --in_files /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_analysis/A/A_primary_forehead_cn_neutral.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_analysis/A/A_parotid_metastasis_cn_neutral.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_analysis/A/A_locoregional_skin_metastasis_1_forehead_cn_neutral.tsv /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_analysis/A/A_locoregional_skin_metastasis_2_angle_jaw_cn_neutral.tsv --working_dir /lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks

## Prepare data for orchard tree inference


In [7]:
from metient.util import data_extraction_util as dutil
import json

def write_orchard_output(orchard_output_dir, cluster_dir, extension, includes_pt_name):
    for patient_id,df in raw_dfs.items():

        patient_pyclone_dir = os.path.join(cluster_dir, patient_id)
        print(patient_pyclone_dir)
        extension_name = f"{patient_id}_{extension}" if includes_pt_name else extension
        cluster_id_to_mut_names, mutation_names = dutil.load_pyclone_clusters(os.path.join(patient_pyclone_dir, extension_name))
        
        for cid in cluster_id_to_mut_names:
            print(cid, len(cluster_id_to_mut_names[cid]))
        header = ["id", "name", "var_reads", "total_reads", "var_read_prob"]
        mut_name_to_mut_id = {}
        cols = list(df.columns)
        sample_names = [s.strip().lower() for s in cols[1:cols.index('Patient')]]

        with open(os.path.join(orchard_output_dir, f"{patient_id}.ssm"), "w") as f:
            f.write("\t".join(header))
            f.write("\n")  
            for i, mut in enumerate(mutation_names):
                mut_name_to_mut_id[mut] = f"m{i}"
                row = [f"m{i}", mut]

                var_reads = []
                total_reads = []
                var_read_probs = []

                for sample in sample_names:

                    cols_to_keep = ['character_label', f"Total read depth ({sample})", f"Alternate base read depth ({sample})", f"Copy number at base ({sample})"]
                    subset = df[cols_to_keep]
                    subset = subset[subset['character_label']==mut]
                    subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
                    subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)

                    var = subset[cols_to_keep[2]].item()
                    tot = subset[cols_to_keep[1]].item()
                    major_cn = subset['major_cn'].item()
                    minor_cn = subset['minor_cn'].item()
                    formatted_sample_name = format_sample_name(sample)
                    var_reads.append(str(var))
                    total_reads.append(str(tot))
                    p = float(purities[(purities['Patient']==patient_id) &(purities['Site']==formatted_sample_name)]['Tumor Purity'].item())/100
                    var_read_prob = dutil.calc_var_read_prob(major_cn, minor_cn, p)
                    var_read_probs.append(str(var_read_prob))

                row += [",".join(var_reads), ",".join(total_reads), ",".join(var_read_probs)]
                f.write("\t".join(row))
                f.write("\n")
        json_data = {"samples": sample_names, "clusters": [], "garbage": []}
        for x in range(0,len(cluster_id_to_mut_names)):
            json_data["clusters"].append([mut_name_to_mut_id[t] for t in cluster_id_to_mut_names[x]])

        with open(os.path.join(orchard_output_dir, f"{patient_id}.params.json"), 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False)
            
pyclone_orchard_dir = os.path.join(data_dir, "pyclone_orchard_trees")
write_orchard_output(pyclone_orchard_dir, pyclone_dir, "tables/loci.tsv", False)
pyclone_vi_orchard_dir = os.path.join(data_dir, "pyclone_vi_orchard_trees")
write_orchard_output(pyclone_vi_orchard_dir, pyclone_vi_dir, "output.tsv", True)



/lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_vi_analysis/A
0 41
1 30
2 58
3 3
4 2
5 1
/lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_vi_analysis/B
0 4
1 47
2 5
3 8
4 5
/lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_vi_analysis/C
0 1
1 22
2 2
3 51
4 178
5 907
/lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_vi_analysis/D
0 13
1 1
2 1
/lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/../../data/sanborn_melanoma_2015/cn_neutral_data/pyclone_vi_analysis/E
0 6
1 46
2 8
3 12
4 5
5 5
/lila/data/morrisq/divyak/projects/metient/metient/jupyter_notebooks/data_preprocessing/

In [8]:
# Orchard commands
for patient_id in raw_dfs:
    
    cmd = f"bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_{patient_id}.log -e error_{patient_id}.log"
    cmd += f" python /data/morrisq/divyak/projects/orchard/bin/orchard -p {patient_id}.ssm {patient_id}.params.json {patient_id}.results.npz"
    print(cmd)
    

bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_A.log -e error_A.log python /data/morrisq/divyak/projects/orchard/bin/orchard -p A.ssm A.params.json A.results.npz
bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_B.log -e error_B.log python /data/morrisq/divyak/projects/orchard/bin/orchard -p B.ssm B.params.json B.results.npz
bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_C.log -e error_C.log python /data/morrisq/divyak/projects/orchard/bin/orchard -p C.ssm C.params.json C.results.npz
bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_D.log -e error_D.log python /data/morrisq/divyak/projects/orchard/bin/orchard -p D.ssm D.params.json D.results.npz
bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_E.log -e error_E.log python /data/morrisq/divyak/projects/orchard/bin/orchard -p E.ssm E.params.json E.results.npz
bsub -n 8 -W 10:00 -R rusage[mem=8] -o output_F.log -e error_F.log python /data/morrisq/divyak/projects/orchard/bin/orchard -p F.ssm F.params.json F.results.npz
bsub -n 8 -W 10:00 -R rusage[mem=8

## Take PyClone generated clusters and create csvs with ref and var counts pooled by cluster

In [9]:
# Need a tsv for each patient with ['anatomical_site_index','anatomical_site_label', 'character_index', 'character_label', 'ref', 'var', 'var_read_prob', 'site_category', 'num_mutations']
import re
import numpy as np

from metient.util import data_extraction_util as dutil

def write_metient_input(metient_output_dir, cluster_dir, extension, includes_pt_name):

    final_cols = ['anatomical_site_index','anatomical_site_label', 'cluster_index', 'character_index','character_label', 'ref', 'var', 'var_read_prob', 'site_category']

    for patient_id,df in raw_dfs.items():
        extension_name = f"{patient_id}_{extension}" if includes_pt_name else extension
        patient_pyclone_fn = os.path.join(cluster_dir, patient_id, extension_name)
        mut_id_to_clstr_id, clstr_id_to_name, mutation_names = dutil.get_mut_to_cluster_map_from_pyclone_output(patient_pyclone_fn)
        cols = list(df.columns)
        sample_names = [s.strip().lower() for s in cols[1:cols.index('Patient')]]
        print("patient", patient_id, len(clstr_id_to_name), "clusters", len(mutation_names), "mutations", len(sample_names), "samples")
        print(sample_names)

        data = []

        for midx, mut in enumerate(mutation_names):
            for sidx, sample in enumerate(sample_names):
                cols_to_keep = ['character_label', f"Total read depth ({sample})", f"Alternate base read depth ({sample})", f"Copy number at base ({sample})"]
                subset = df[cols_to_keep]
                subset = subset[subset['character_label']==mut]
                subset['major_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[0], axis=1)
                subset['minor_cn'] = subset.apply(lambda row: get_major_minor_cn(row[cols_to_keep[3]])[1], axis=1)

                var = subset[cols_to_keep[2]].item()
                tot = subset[cols_to_keep[1]].item()
                ref = tot - var
                major_cn = subset['major_cn'].item()
                minor_cn = subset['minor_cn'].item()
                formatted_sample_name = format_sample_name(sample)
                p = float(purities[(purities['Patient']==patient_id) &(purities['Site']==formatted_sample_name)]['Tumor Purity'].item())/100
                var_read_prob = dutil.calc_var_read_prob(major_cn, minor_cn, p)

                category = 'primary' if 'primary' in sample else 'metastasis'
                data.append([sidx, sample.capitalize(), mut_id_to_clstr_id[mut], midx, mut.split(":")[0], ref, var, var_read_prob, category])

        patient_df = pd.DataFrame(data, columns=final_cols)
        patient_df.to_csv(os.path.join(data_dir,metient_output_dir, f"{patient_id}_SNVs.tsv"), sep="\t", index=False)

write_metient_input("pyclone_clustered_tsvs", pyclone_dir, "tables/loci.tsv", False)

write_metient_input("pyclone_vi_clustered_tsvs", pyclone_vi_dir, "output.tsv", True)


patient A 5 clusters 135 mutations 4 samples
['primary, forehead', 'parotid metastasis', 'locoregional skin metastasis 1, forehead', 'locoregional skin metastasis 2, angle jaw']
patient B 10 clusters 69 mutations 4 samples
['primary, mid-left back', 'lymph node metastasis, left axilla', 'locoregional skin metastasis 1, left back', 'locoregional skin metastasis 2, left axilla']
patient C 10 clusters 1161 mutations 3 samples
['primary, right lower calf', 'locoregional skin metastasis 1, right calf', 'locoregional skin metastasis 2, right mid-calf']
patient D 3 clusters 15 mutations 4 samples
['primary, right ankle', 'lymph node metastasis, right groin', 'locoregional skin metastasis 1, right ankle', 'locoregional skin metastasis 2, right leg']
patient E 10 clusters 82 mutations 5 samples
['primary, left heel', 'locoregional skin metastasis 1, left heel', 'locoregional skin metastasis 2, left heel', 'lymph node metastasis, left groin', 'locoregional skin metastasis 3, left heel']
patient 