In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from glob import glob
import musical
import sys
import csv
from tabulate import tabulate

# Add the parent of the current working directory to sys.path; presumably this
# is what lets the local `hrdtimer` package imported just below resolve when
# the notebook is launched from its own folder.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from hrdtimer import utils as HRDTimerUtils

In [13]:
import pandas as pd
import ast
from pathlib import Path
from tqdm import tqdm

# Root directory containing the bootstrap output folders to post-process.
root_dir = Path("/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/INFORM_Breast_boot_only_p_change")

# Every .csv file anywhere below root_dir (recursive search).
csv_files = list(root_dir.rglob("*.csv"))

# An empty result usually means root_dir is wrong or not mounted; say so
# instead of finishing silently with nothing done.
if not csv_files:
    tqdm.write(f"No .csv files found under {root_dir}")

# (path, reason) for every file that could not be processed, so that skipped
# files are visible after the run instead of being dropped silently.
failed_files = []

# For each CSV that has an INFO column, parse every INFO value as a Python
# literal (expected to be a dict), pull out its 'pSub' entry (None when the
# key is absent) into a new 'pSub' column, and overwrite the file in place.
for file_path in tqdm(csv_files, desc="Extracting pSub"):
    try:
        df = pd.read_csv(file_path)

        if 'INFO' in df.columns:
            df['pSub'] = df['INFO'].map(lambda x: ast.literal_eval(x).get('pSub', None))
            df.to_csv(file_path, index=False)
    except Exception as exc:
        # Still best-effort: a bad file is left untouched and the loop moves
        # on, but the failure is recorded for the summary below.
        failed_files.append((file_path, f"{type(exc).__name__}: {exc}"))
        continue

# Report anything that was skipped so partial results are not mistaken for
# a complete run.
if failed_files:
    tqdm.write(f"Skipped {len(failed_files)} of {len(csv_files)} file(s):")
    for skipped_path, reason in failed_files:
        tqdm.write(f"  {skipped_path}: {reason}")

Extracting pSub: 100%|██████████| 2200/2200 [12:38<00:00,  2.90it/s]


# PCAWG Breast cancer run

In [2]:
# Input locations for the PCAWG breast run. Both are absolute paths on the
# author's machine and need to be changed to wherever the data ends up being
# saved/provided.
# PCAWG_timing_vcfs: directory handed to HRDTimerUtils.prepare_samples_for_timing.
# breast_metadata_path: CSV read with pd.read_csv to select samples.
PCAWG_timing_vcfs = "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_PCAWG_Apr25_v2/Breast/timing"
breast_metadata_path = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'

In [3]:
# Load the per-sample timing inputs; the result is used below as a mapping
# keyed by sample ID.
breast_timing_samples = HRDTimerUtils.prepare_samples_for_timing(PCAWG_timing_vcfs)

# Cohort metadata, plus the IDs of all samples that have timing inputs.
breast_metadata_df = pd.read_csv(breast_metadata_path)
timing_sample_ids = [*breast_timing_samples.keys()]

# WGD samples among those with timing inputs.
wgd_sample_ids = breast_metadata_df.query(
    "sample in @timing_sample_ids and isWGD"
)["sample"].tolist()
# HRDetect-positive WGD breast samples (not yet restricted to timing inputs;
# that restriction happens when the dict is built below).
hrd_wgd_sample_ids = breast_metadata_df.query(
    "`HRDetect.isHRD` and organ == 'Breast' and isWGD"
)["sample"].tolist()

# Keep only the IDs that actually have timing inputs.
wgd_timing_samples = dict(
    (sample_id, breast_timing_samples[sample_id])
    for sample_id in wgd_sample_ids
    if sample_id in breast_timing_samples
)
hrd_wgd_timing_samples = dict(
    (sample_id, breast_timing_samples[sample_id])
    for sample_id in hrd_wgd_sample_ids
    if sample_id in breast_timing_samples
)

Processing Early samples:


Processing Files: 100%|██████████| 202/202 [00:23<00:00,  8.76it/s]


Processing Late samples:


Processing Files: 100%|██████████| 174/174 [00:19<00:00,  8.71it/s]


Processing NA samples:


Processing Files: 100%|██████████| 205/205 [00:18<00:00, 11.16it/s]


In [None]:
# Generate n_bootstraps bootstraps for the selected HRD+WGD samples into
# output_dir via the project helper. The recorded run of this cell took about
# 22 minutes.
# NOTE(review): no random seed is set anywhere in this notebook; confirm
# whether generate_bootstraps seeds itself, otherwise reruns will not be
# reproducible.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/PCAWG_Breast_boot_only_p_change'
n_bootstraps = 200

HRDTimerUtils.generate_bootstraps(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir
)

Bootstrapping:   0%|          | 0/200 [00:00<?, ?it/s]

Bootstrapping: 100%|██████████| 200/200 [21:54<00:00,  6.57s/it]


In [4]:
# Run the HRD/WGD timing analysis helper on the selected samples.
# base_dir repeats, as a literal, the directory the bootstrap cell uses as
# output_dir; output_csv_path is presumably where the results table is written.
# The recorded run took roughly 48 minutes in total.
HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/PCAWG_Breast_boot_only_p_change",
    output_csv_path="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/PCAWG_Breast_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot_pSub.csv"
)

Processing WGD Samples: 100%|██████████| 34/34 [09:25<00:00, 16.64s/it]
Processing HRD Samples: 100%|██████████| 34/34 [38:29<00:00, 67.92s/it] 

+--------------------------------------+-----------+-----------------+-----------------+---------------------+---------------------+-----------+-----------------+-----------------+---------------------+---------------------+---------------+----------------------------------------------------------------+--------------------------------------------------------------------+-------------------------------------------------------------------+--------------------------------------------------------------------+----------------------------------------------------------------+--------------------------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------+-------------------------------------------------+-----------+--------------------------------------------------------------+-------------------------------------------------------------+-----------------+----------------




# SCANB run

In [2]:
# Input locations for the SCANB run. Both are absolute paths on the author's
# machine and need to be changed to wherever the data ends up being
# saved/provided.
# SCANB_timing_vcfs: directory handed to HRDTimerUtils.prepare_samples_for_timing.
# breast_metadata_path: CSV read with pd.read_csv to select samples.
SCANB_timing_vcfs = "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_SCANB_unfiltered_Apr25/Breast/timing"
breast_metadata_path = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'

In [3]:
# Load the per-sample timing inputs; the result is used below as a mapping
# keyed by sample ID.
breast_timing_samples = HRDTimerUtils.prepare_samples_for_timing(SCANB_timing_vcfs)

# Cohort metadata, plus the IDs of all samples that have timing inputs.
breast_metadata_df = pd.read_csv(breast_metadata_path)
timing_sample_ids = [*breast_timing_samples.keys()]

# WGD samples among those with timing inputs.
wgd_sample_ids = breast_metadata_df.query(
    "sample in @timing_sample_ids and isWGD"
)["sample"].tolist()
# HRDetect-positive WGD breast samples (not yet restricted to timing inputs;
# that restriction happens when the dict is built below).
hrd_wgd_sample_ids = breast_metadata_df.query(
    "`HRDetect.isHRD` and organ == 'Breast' and isWGD"
)["sample"].tolist()

# Keep only the IDs that actually have timing inputs.
wgd_timing_samples = dict(
    (sample_id, breast_timing_samples[sample_id])
    for sample_id in wgd_sample_ids
    if sample_id in breast_timing_samples
)
hrd_wgd_timing_samples = dict(
    (sample_id, breast_timing_samples[sample_id])
    for sample_id in hrd_wgd_sample_ids
    if sample_id in breast_timing_samples
)

Processing Early samples:


Processing Files: 100%|██████████| 49/49 [00:08<00:00,  5.74it/s]


Processing Late samples:


Processing Files: 100%|██████████| 49/49 [00:07<00:00,  6.72it/s]


Processing NA samples:


Processing Files: 100%|██████████| 49/49 [00:06<00:00,  7.99it/s]


In [None]:
# Generate n_bootstraps bootstraps for the selected HRD+WGD samples into
# output_dir via the project helper. The recorded run of this cell took about
# 25 minutes.
# NOTE(review): no random seed is set anywhere in this notebook; confirm
# whether generate_bootstraps seeds itself, otherwise reruns will not be
# reproducible.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/SCANB_Breast_boot_only_p_change'
n_bootstraps = 200

HRDTimerUtils.generate_bootstraps(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir
)

Bootstrapping: 100%|██████████| 200/200 [25:06<00:00,  7.53s/it]


In [4]:
# Run the HRD/WGD timing analysis helper on the selected samples.
# base_dir repeats, as a literal, the directory the bootstrap cell uses as
# output_dir; output_csv_path is presumably where the results table is written.
# The recorded run took roughly 49 minutes in total.
# NOTE(review): unlike the PCAWG Breast, INFORM and PCAWG Ovary runs, this
# output filename does not end in `_pSub` - confirm whether that is intended.
HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/SCANB_Breast_boot_only_p_change",
    output_csv_path="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/SCANB_Breast_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot.csv"
)

Processing WGD Samples: 100%|██████████| 49/49 [09:24<00:00, 11.52s/it]
Processing HRD Samples: 100%|██████████| 49/49 [39:53<00:00, 48.84s/it]

+----------+------------+-----------------+-----------------+---------------------+---------------------+-----------+-----------------+-----------------+---------------------+---------------------+---------------+-----------------------------------------------------------------+--------------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------------+-----------------------------------------------------------------+--------------------------------------------------------------------+----------------------------------------------------------------+--------------------------------------------------------------------+-------------------------------------------------+-------------+--------------------------------------------------------------+--------------------------------------------------------------+-----------------+--------------------+
| ID       |    H




# INFORM run

In [4]:
# Input locations for the INFORM run. Both are absolute paths on the author's
# machine and need to be changed to wherever the data ends up being
# saved/provided.
# INFORM_timing_vcfs: directory handed to HRDTimerUtils.prepare_samples_for_timing.
# breast_metadata_path: CSV read with pd.read_csv to select samples.
INFORM_timing_vcfs = "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_INFORM_germline_filtered_v2/Breast/timing"
breast_metadata_path = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'

In [5]:
# Load the per-sample timing inputs; the result is used below as a mapping
# keyed by sample ID.
breast_timing_samples = HRDTimerUtils.prepare_samples_for_timing(INFORM_timing_vcfs)

# Cohort metadata, plus the IDs of all samples that have timing inputs.
breast_metadata_df = pd.read_csv(breast_metadata_path)
timing_sample_ids = [*breast_timing_samples.keys()]

# WGD samples among those with timing inputs.
wgd_sample_ids = breast_metadata_df.query(
    "sample in @timing_sample_ids and isWGD"
)["sample"].tolist()
# WGD breast samples (restricted to timing inputs when the dict is built below).
# NOTE(review): unlike the PCAWG and SCANB cells, this query has no
# `HRDetect.isHRD` term even though the result is named hrd_wgd_* - confirm
# that every INFORM sample is meant to be treated as HRD.
hrd_wgd_sample_ids = breast_metadata_df.query(
    "organ == 'Breast' and isWGD"
)["sample"].tolist()

# Keep only the IDs that actually have timing inputs.
wgd_timing_samples = dict(
    (sample_id, breast_timing_samples[sample_id])
    for sample_id in wgd_sample_ids
    if sample_id in breast_timing_samples
)
hrd_wgd_timing_samples = dict(
    (sample_id, breast_timing_samples[sample_id])
    for sample_id in hrd_wgd_sample_ids
    if sample_id in breast_timing_samples
)

Processing Early samples:


Processing Files: 100%|██████████| 19/19 [00:01<00:00, 11.59it/s]


Processing Late samples:


Processing Files: 100%|██████████| 19/19 [00:02<00:00,  8.93it/s]


Processing NA samples:


Processing Files: 100%|██████████| 19/19 [00:02<00:00,  8.73it/s]


In [10]:
# Generate n_bootstraps bootstraps for the selected samples into output_dir
# via the project helper. The recorded run of this cell took about 5 minutes.
# NOTE(review): no random seed is set anywhere in this notebook; confirm
# whether generate_bootstraps seeds itself, otherwise reruns will not be
# reproducible.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/INFORM_Breast_boot_only_p_change'
n_bootstraps = 200

HRDTimerUtils.generate_bootstraps(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir
)

Bootstrapping: 100%|██████████| 200/200 [05:01<00:00,  1.51s/it]


In [6]:
# Run the HRD/WGD timing analysis helper on the selected samples.
# base_dir repeats, as a literal, the directory the bootstrap cell uses as
# output_dir; output_csv_path is presumably where the results table is written.
# The recorded run took roughly 49 minutes in total.
HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/INFORM_Breast_boot_only_p_change",
    output_csv_path="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/INFORM_Breast_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot_pSub.csv"
)

Processing WGD Samples: 100%|██████████| 8/8 [02:50<00:00, 21.33s/it]
Processing HRD Samples: 100%|██████████| 8/8 [45:44<00:00, 343.06s/it]

+------------+------------+-----------------+-----------------+---------------------+---------------------+-----------+-----------------+-----------------+---------------------+---------------------+---------------+-----------------------------------------------------------------+------------------------------------------------------------------+-----------------------------------------------------------------+------------------------------------------------------------------+----------------------------------------------------------------+--------------------------------------------------------------------+---------------------------------------------------------------+-------------------------------------------------------------------+-------------------------------------------------+-----------+-------------------------------------------------------------+-------------------------------------------------------------+----------------+--------------------+
| ID         |    HRDTime | 




# PCAWG Ovary run

In [2]:
# Input locations for the PCAWG ovary run. Both are absolute paths on the
# author's machine and need to be changed to wherever the data ends up being
# saved/provided.
# PCAWG_timing_vcfs: directory handed to HRDTimerUtils.prepare_samples_for_timing
# (this reuses the variable name from the PCAWG breast section).
# metadata_path: CSV read with pd.read_csv to select samples.
PCAWG_timing_vcfs = "/Volumes/extSSD/park_lab/HRDTimer_Analysis/AA_NEW_TEST_RUN_PCAWG_Apr25_v2/Ovary/timing"
metadata_path = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/metadata/pan_metadata_v3.csv'

In [3]:
# Load the per-sample timing inputs; the result is used below as a mapping
# keyed by sample ID.
ovary_timing_samples = HRDTimerUtils.prepare_samples_for_timing(PCAWG_timing_vcfs)

# Cohort metadata, plus the IDs of all samples that have timing inputs.
metadata_df = pd.read_csv(metadata_path)
timing_sample_ids = [*ovary_timing_samples.keys()]

# WGD samples among those with timing inputs.
wgd_sample_ids = metadata_df.query(
    "sample in @timing_sample_ids and isWGD"
)["sample"].tolist()
# HRDetect-positive WGD ovary samples (not yet restricted to timing inputs;
# that restriction happens when the dict is built below).
hrd_wgd_sample_ids = metadata_df.query(
    "`HRDetect.isHRD` and organ == 'Ovary' and isWGD"
)["sample"].tolist()

# Keep only the IDs that actually have timing inputs.
wgd_timing_samples = dict(
    (sample_id, ovary_timing_samples[sample_id])
    for sample_id in wgd_sample_ids
    if sample_id in ovary_timing_samples
)
hrd_wgd_timing_samples = dict(
    (sample_id, ovary_timing_samples[sample_id])
    for sample_id in hrd_wgd_sample_ids
    if sample_id in ovary_timing_samples
)

Processing Early samples:


Processing Files: 100%|██████████| 109/109 [00:12<00:00,  8.83it/s]


Processing Late samples:


Processing Files: 100%|██████████| 107/107 [00:16<00:00,  6.35it/s]


Processing NA samples:


Processing Files: 100%|██████████| 109/109 [00:15<00:00,  7.02it/s]


In [5]:
# Generate n_bootstraps bootstraps for the selected HRD+WGD samples into
# output_dir via the project helper. The recorded run of this cell took about
# 19 minutes.
# NOTE(review): no random seed is set anywhere in this notebook; confirm
# whether generate_bootstraps seeds itself, otherwise reruns will not be
# reproducible.
output_dir = '/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/PCAWG_Ovary_boot_only_p_change'
n_bootstraps = 200

HRDTimerUtils.generate_bootstraps(
    samples_dict=hrd_wgd_timing_samples,
    n_bootstraps=n_bootstraps,
    output_dir=output_dir
)

Bootstrapping: 100%|██████████| 200/200 [19:14<00:00,  5.77s/it]


In [6]:
# Run the HRD/WGD timing analysis helper on the selected samples.
# base_dir repeats, as a literal, the directory the bootstrap cell uses as
# output_dir; output_csv_path is presumably where the results table is written.
# The recorded run took roughly 40 minutes in total.
HRDTimerUtils.run_HRD_WGD_timing_analysis(
    hrd_wgd_timing_samples=hrd_wgd_timing_samples,
    base_dir="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/temp/PCAWG_Ovary_boot_only_p_change",
    output_csv_path="/Users/michail/HMS Dropbox/Michail Andreopoulos/HRDTimer/data/output/May27_TimingRun/PCAWG_Ovary_WGD_HRD_TimingResults_timing_nboot200_0001_only_prob_change_boot_pSub.csv"
)

Processing WGD Samples: 100%|██████████| 27/27 [07:59<00:00, 17.77s/it]
Processing HRD Samples: 100%|██████████| 27/27 [32:29<00:00, 72.22s/it] 

+--------------------------------------+-----------+-----------------+-----------------+---------------------+---------------------+-----------+-----------------+-----------------+---------------------+---------------------+---------------+------------------------------------------------------------------+--------------------------------------------------------------------+--------------------------------------------------------------------+------------------------------------------------------------------------+-----------------------------------------------------------------+--------------------------------------------------------------------+----------------------------------------------------------------+-----------------------------------------------------------------------+-------------------------------------------------+-----------+-------------------------------------------------------------+--------------------------------------------------------------+-----------------+-----


