In [1]:
import sys
import pandas as pd
import fnmatch
import os

# NOTE(review): presumably the confidence level for the generated intervals.
# It is not passed to any call in the cells visible here -- confirm where it
# is consumed.
CONFIDENCE = 0.95

# Root of the local checkout of the repository. The working directory is
# switched to it before the project import below (presumably so that the
# `src` package resolves from the current directory).
repo_dir = "/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/metastatic_history_reconstruction/"
os.chdir(repo_dir)

# Keep this import after os.chdir(repo_dir); see the note above.
import src.util.create_conf_intervals_from_reads as create_conf

# Use read files (contain reference and variant read counts for all samples
# and mutation clusters) to generate confidence intervals.
# Derived from repo_dir so the checkout location is configured in one place;
# this resolves to the same path that was previously spelled out in full.
DATA_DIR = os.path.join(repo_dir, "src", "data", "sanborn_melanoma_2015")

In [2]:
# Patient IDs are taken from the read-count file names: reads_<patient>.tsv.
# Slice off the fixed prefix/suffix instead of using str.replace, which would
# also delete any "reads_" or ".tsv" occurring inside the patient ID itself.
# NOTE: os.listdir returns entries in arbitrary order, so the order of
# `patients` (and of every loop over it) can differ between machines.
reads_prefix, reads_suffix = "reads_", ".tsv"
read_files = fnmatch.filter(os.listdir(DATA_DIR), f"{reads_prefix}*{reads_suffix}")
patients = [name[len(reads_prefix):-len(reads_suffix)] for name in read_files]
patients

['F', 'G', 'E', 'D', 'A', 'C', 'B']

In [3]:
# Process cluster output produced by running cluster command from machina (https://github.com/raphael-group/machina)
# Create clustering files for each patient, where each line has the SNVs
# associated with that cluster (line number = cluster number)
for patient in patients:
    clustering_output_data = pd.read_table(os.path.join(DATA_DIR, f"machina_cluster_output_{patient}.tsv"), skiprows=3)
    cluster_nums = set(clustering_output_data['character_index'].tolist())

    # Build the index -> label lookup once instead of re-filtering the whole
    # frame for every cluster. Keeping the first row per index matches the
    # previous behaviour of taking the first unique label for that index.
    label_by_cluster = (
        clustering_output_data
        .drop_duplicates(subset='character_index')
        .set_index('character_index')['character_label']
    )

    # Line number == cluster number only holds if the indices are exactly
    # 0..n-1. Check this before opening the output file so a bad input fails
    # with a clear message and does not leave a partially written file behind.
    missing = [idx for idx in range(len(cluster_nums)) if idx not in label_by_cluster.index]
    if missing:
        raise ValueError(
            f"machina_cluster_output_{patient}.tsv: character_index values are not "
            f"contiguous from 0; no rows found for cluster indices {missing}"
        )

    with open(os.path.join(DATA_DIR, f'clustering_{patient}.txt'), 'w') as f:
        for cluster_idx in range(len(cluster_nums)):
            # str() so that a label column parsed as numeric can still be written.
            f.write(str(label_by_cluster.loc[cluster_idx]))
            f.write("\n")


In [4]:
# Create tsv for each patient with ref reads, var reads, and f upper bound and f lower bound
# pooled by all SNVs in a cluster
def cluster_split(cluster_name):
    """Split a cluster label on underscores and return the pieces as a list."""
    return cluster_name.split("_")


for patient in patients:
    print(patient)
    # Per-patient inputs: the read-count table and the clustering file.
    reads_path = os.path.join(DATA_DIR, f"reads_{patient}.tsv")
    clustering_path = os.path.join(DATA_DIR, f'clustering_{patient}.txt')
    create_conf.write(
        reads_path,
        clustering_path,
        DATA_DIR,
        cluster_split_function=cluster_split,
        use_char_idx_as_char_label=True,
    )

F
num variants: 2136
anatomical site labels: ['primary' 'lymph_node' 'locoregional' 'distant']
G
num variants: 48
anatomical site labels: ['primary' 'lung' 'locoregional']
E
num variants: 59
anatomical site labels: ['primary' 'locoregional_1' 'locoregional_2' 'lymph_node' 'locoregional_3']
D
num variants: 13
anatomical site labels: ['primary' 'lymph_node' 'locoregional_1' 'locoregional_2']
A
num variants: 124
anatomical site labels: ['primary' 'parotid_gland' 'locoregional_1' 'locoregional_2']
C
num variants: 1056
anatomical site labels: ['primary' 'locoregional_1' 'locoregional_2']
B
num variants: 52
anatomical site labels: ['primary' 'lymph_node' 'locoregional_1' 'locoregional_2']
