In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import musical
import sys
import csv
from tabulate import tabulate

# Ensure correct file paths 
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from hrdtimer import utils as HRDTimerUtils

# PCAWG Breast cancer run

In [2]:
# DEFINE PATHS ----- inputs for the PCAWG breast run.
# The defaults are the original author's local locations. To run elsewhere, set the
# environment variables named below instead of editing this cell; an unset or empty
# variable falls back to the default.
PCAWG_timing_vcfs = os.environ.get("HRDTIMER_PCAWG_TIMING_VCFS") or (
    "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_PCAWG_Apr25_v2/Breast/timing"
)
breast_metadata_path = os.environ.get("HRDTIMER_METADATA_CSV") or (
    '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'
)

In [3]:
# Build the mapping of timing inputs for the PCAWG cohort. The result is used as a
# dict below (``.keys()``, ``in``, ``[...]``) whose keys are matched against the
# metadata ``sample`` column.
breast_timing_samples = HRDTimerUtils.prepare_samples_for_timing(PCAWG_timing_vcfs)
timing_sample_ids = list(breast_timing_samples.keys())  # referenced as @timing_sample_ids in the query

breast_metadata_df = pd.read_csv(breast_metadata_path)

# Metadata rows flagged isWGD whose sample has timing inputs.
wgd_sample_ids = (
    breast_metadata_df
    .query("sample in @timing_sample_ids and isWGD")["sample"]
    .tolist()
)

# Breast rows flagged both HRD (HRDetect) and WGD. This query is not limited to
# timed samples; the membership test in the dict below applies that restriction.
hrd_wgd_sample_ids = (
    breast_metadata_df
    .query("`HRDetect.isHRD` and organ == 'Breast' and isWGD")["sample"]
    .tolist()
)

# Keep only IDs that actually have an entry in breast_timing_samples.
wgd_timing_samples = {
    sample_id: breast_timing_samples[sample_id]
    for sample_id in wgd_sample_ids
    if sample_id in breast_timing_samples
}
hrd_wgd_timing_samples = {
    sample_id: breast_timing_samples[sample_id]
    for sample_id in hrd_wgd_sample_ids
    if sample_id in breast_timing_samples
}

Processing Early samples:


Processing Files: 100%|██████████| 202/202 [00:22<00:00,  9.08it/s]


Processing Late samples:


Processing Files: 100%|██████████| 174/174 [00:19<00:00,  8.92it/s]


Processing NA samples:


Processing Files: 100%|██████████| 205/205 [00:18<00:00, 10.84it/s]


In [None]:
# Bootstrap settings for the PCAWG HRD+WGD samples. The same directory is passed as
# ``base_dir`` to the timing-analysis cell, so the two paths must stay in sync.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/PCAWG_Breast_boot_only_p_change'
n_bootstraps = 200

# NOTE(review): no random seed is set anywhere in this notebook; whether
# generate_bootstraps seeds itself is not visible here -- confirm reproducibility.
# The recorded run of this cell took about 22 minutes.
bootstrap_kwargs = dict(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir,
)
HRDTimerUtils.generate_bootstraps(**bootstrap_kwargs)

Bootstrapping:   0%|          | 0/200 [00:00<?, ?it/s]

Bootstrapping: 100%|██████████| 200/200 [21:54<00:00,  6.57s/it]


In [None]:
# Timing analysis for the PCAWG HRD+WGD samples.
# The bootstrap directory is spelled out (not taken from ``output_dir``) so this
# cell can be run without re-executing the bootstrap cell; it must match that
# cell's ``output_dir``.
pcawg_bootstrap_dir = "/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/PCAWG_Breast_boot_only_p_change"
pcawg_timing_csv = "/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/PCAWG_Breast_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot.csv"

HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir=pcawg_bootstrap_dir,
    output_csv_path=pcawg_timing_csv,
)

# SCANB run

In [2]:
# DEFINE PATHS ----- inputs for the SCANB breast run.
# The defaults are the original author's local locations. To run elsewhere, set the
# environment variables named below instead of editing this cell; an unset or empty
# variable falls back to the default.
SCANB_timing_vcfs = os.environ.get("HRDTIMER_SCANB_TIMING_VCFS") or (
    "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_SCANB_unfiltered_Apr25/Breast/timing"
)
breast_metadata_path = os.environ.get("HRDTIMER_METADATA_CSV") or (
    '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'
)

In [3]:
# Build the mapping of timing inputs for the SCANB cohort. The result is used as a
# dict below (``.keys()``, ``in``, ``[...]``) whose keys are matched against the
# metadata ``sample`` column.
breast_timing_samples = HRDTimerUtils.prepare_samples_for_timing(SCANB_timing_vcfs)
timing_sample_ids = list(breast_timing_samples.keys())  # referenced as @timing_sample_ids in the query

breast_metadata_df = pd.read_csv(breast_metadata_path)

# Metadata rows flagged isWGD whose sample has timing inputs.
wgd_sample_ids = (
    breast_metadata_df
    .query("sample in @timing_sample_ids and isWGD")["sample"]
    .tolist()
)

# Breast rows flagged both HRD (HRDetect) and WGD. This query is not limited to
# timed samples; the membership test in the dict below applies that restriction.
hrd_wgd_sample_ids = (
    breast_metadata_df
    .query("`HRDetect.isHRD` and organ == 'Breast' and isWGD")["sample"]
    .tolist()
)

# Keep only IDs that actually have an entry in breast_timing_samples.
wgd_timing_samples = {
    sample_id: breast_timing_samples[sample_id]
    for sample_id in wgd_sample_ids
    if sample_id in breast_timing_samples
}
hrd_wgd_timing_samples = {
    sample_id: breast_timing_samples[sample_id]
    for sample_id in hrd_wgd_sample_ids
    if sample_id in breast_timing_samples
}

Processing Early samples:


Processing Files: 100%|██████████| 49/49 [00:08<00:00,  5.61it/s]


Processing Late samples:


Processing Files: 100%|██████████| 49/49 [00:07<00:00,  6.62it/s]


Processing NA samples:


Processing Files: 100%|██████████| 49/49 [00:06<00:00,  7.83it/s]


In [None]:
# Bootstrap settings for the SCANB HRD+WGD samples. The same directory is passed as
# ``base_dir`` to the timing-analysis cell, so the two paths must stay in sync.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/SCANB_Breast_boot_only_p_change'
n_bootstraps = 200

# NOTE(review): no random seed is set anywhere in this notebook; whether
# generate_bootstraps seeds itself is not visible here -- confirm reproducibility.
# The recorded run of this cell took about 25 minutes.
bootstrap_kwargs = dict(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir,
)
HRDTimerUtils.generate_bootstraps(**bootstrap_kwargs)

Bootstrapping: 100%|██████████| 200/200 [25:06<00:00,  7.53s/it]


In [4]:
# Timing analysis for the SCANB HRD+WGD samples.
# The bootstrap directory is spelled out (not taken from ``output_dir``) so this
# cell can be run without re-executing the bootstrap cell; it must match that
# cell's ``output_dir``.
# NOTE(review): the recorded run of this cell aborted partway through the
# "Processing HRD Samples" stage with
# ``ParserError: ... Calling read(nbytes) on source failed``. That looks like a
# file-read failure rather than a malformed file (possibly a cloud-synced file that
# was not available locally) -- confirm and re-run before trusting the output CSV.
scanb_bootstrap_dir = "/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/SCANB_Breast_boot_only_p_change"
scanb_timing_csv = "/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/SCANB_Breast_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot.csv"

HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir=scanb_bootstrap_dir,
    output_csv_path=scanb_timing_csv,
)

Processing WGD Samples: 100%|██████████| 49/49 [09:30<00:00, 11.64s/it]
Processing HRD Samples:  33%|███▎      | 16/49 [16:27<33:56, 61.72s/it]


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

# INFORM run

In [2]:
# DEFINE PATHS ----- inputs for the INFORM breast run.
# The defaults are the original author's local locations. To run elsewhere, set the
# environment variables named below instead of editing this cell; an unset or empty
# variable falls back to the default.
INFORM_timing_vcfs = os.environ.get("HRDTIMER_INFORM_TIMING_VCFS") or (
    "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_INFORM_germline_filtered_v2/Breast/timing"
)
breast_metadata_path = os.environ.get("HRDTIMER_METADATA_CSV") or (
    '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'
)

In [3]:
# Build the mapping of timing inputs for the INFORM cohort. The result is used as a
# dict below (``.keys()``, ``in``, ``[...]``) whose keys are matched against the
# metadata ``sample`` column.
breast_timing_samples = HRDTimerUtils.prepare_samples_for_timing(INFORM_timing_vcfs)
timing_sample_ids = list(breast_timing_samples.keys())  # referenced as @timing_sample_ids in the query

breast_metadata_df = pd.read_csv(breast_metadata_path)

# Metadata rows flagged isWGD whose sample has timing inputs.
wgd_sample_ids = (
    breast_metadata_df
    .query("sample in @timing_sample_ids and isWGD")["sample"]
    .tolist()
)

# NOTE(review): unlike the PCAWG and SCANB cells, this query does NOT filter on
# `HRDetect.isHRD`, so "hrd_wgd" here means every WGD breast sample. Presumably the
# INFORM cohort is treated as HRD by design -- confirm this is intentional.
# This query is not limited to timed samples; the membership test in the dict
# below applies that restriction.
hrd_wgd_sample_ids = (
    breast_metadata_df
    .query("organ == 'Breast' and isWGD")["sample"]
    .tolist()
)

# Keep only IDs that actually have an entry in breast_timing_samples.
wgd_timing_samples = {
    sample_id: breast_timing_samples[sample_id]
    for sample_id in wgd_sample_ids
    if sample_id in breast_timing_samples
}
hrd_wgd_timing_samples = {
    sample_id: breast_timing_samples[sample_id]
    for sample_id in hrd_wgd_sample_ids
    if sample_id in breast_timing_samples
}

Processing Early samples:


Processing Files: 100%|██████████| 19/19 [00:01<00:00, 10.82it/s]


Processing Late samples:


Processing Files: 100%|██████████| 19/19 [00:02<00:00,  8.26it/s]


Processing NA samples:


Processing Files: 100%|██████████| 19/19 [00:02<00:00,  8.17it/s]


In [10]:
# Bootstrap settings for the INFORM samples. The same directory is passed as
# ``base_dir`` to the timing-analysis cell, so the two paths must stay in sync.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/INFORM_Breast_boot_only_p_change'
n_bootstraps = 200

# NOTE(review): no random seed is set anywhere in this notebook; whether
# generate_bootstraps seeds itself is not visible here -- confirm reproducibility.
# The recorded run of this cell took about 5 minutes.
bootstrap_kwargs = dict(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir,
)
HRDTimerUtils.generate_bootstraps(**bootstrap_kwargs)

Bootstrapping: 100%|██████████| 200/200 [05:01<00:00,  1.51s/it]


In [None]:
# Timing analysis for the INFORM samples.
# The bootstrap directory is spelled out (not taken from ``output_dir``) so this
# cell can be run without re-executing the bootstrap cell; it must match that
# cell's ``output_dir``.
inform_bootstrap_dir = "/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/INFORM_Breast_boot_only_p_change"
inform_timing_csv = "/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/INFORM_Breast_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot.csv"

HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir=inform_bootstrap_dir,
    output_csv_path=inform_timing_csv,
)