In [1]:
# ===============================
# Standard Library
# ===============================
import os
import sys
import re
import ast
import csv
import shutil
import logging
from contextlib import redirect_stdout

# ===============================
# Third-Party Libraries
# ===============================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from tqdm import tqdm
from tabulate import tabulate
from scipy import stats, odr
from scipy.stats import linregress
from contextlib import contextmanager
import vcfpy  # via pip only

# ===============================
# Domain-Specific / External Libraries
# ===============================
from SigProfilerAssignment import Analyzer as Analyze  # via pip
import musical  # Not installable via pip/conda – likely local or private


# Lift pandas' cap on the number of columns shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Temporarily extend sys.path with the parent directory so that the `hrdtimer`
# package can be imported from there, then put the saved copy of sys.path back.
# NOTE(review): if the import raises, the extra entry stays on sys.path because
# the restore line below is never reached.
original_sys_path = sys.path.copy()
sys.path.append(os.path.abspath(".."))
from hrdtimer import utils as HRDTimerUtils
# Rebinds sys.path to the saved copy (a new list object, not an in-place edit).
sys.path = original_sys_path

%matplotlib inline

## Helpers

In [3]:
class DummyFile(object):
    """Minimal write-only stream that silently discards everything written to it."""

    def write(self, x):
        """Accept and drop ``x``; nothing is stored or forwarded."""
        pass

    def flush(self):
        """No-op, present so callers that flush the stream do not fail."""
        pass


# (stdout, stderr) pair that was active before suppress_prints() replaced it,
# or None while nothing is suppressed.
_saved_streams = None


def suppress_prints():
    """Silence ``sys.stdout`` and ``sys.stderr`` until ``restore_prints()`` is called.

    The streams that are active at the time of the first call are remembered so
    they can be put back exactly. Repeated calls without an intervening
    ``restore_prints()`` keep the originally remembered streams rather than
    overwriting them with an already-silenced ``DummyFile``.
    """
    global _saved_streams
    if _saved_streams is None:
        _saved_streams = (sys.stdout, sys.stderr)
    sys.stdout = DummyFile()
    sys.stderr = DummyFile()


def restore_prints():
    """Undo ``suppress_prints()`` by reinstating the remembered streams.

    Restoring to ``sys.__stdout__`` / ``sys.__stderr__`` is wrong whenever the
    streams had been replaced before suppression (a notebook kernel or any
    redirect does this), so the streams saved by ``suppress_prints()`` are used.
    If nothing was saved, a stream is only reset to the interpreter default
    when it is currently a ``DummyFile``; otherwise it is left untouched.
    """
    global _saved_streams
    if _saved_streams is None:
        # Nothing remembered: only rescue streams that are visibly silenced.
        if isinstance(sys.stdout, DummyFile):
            sys.stdout = sys.__stdout__
        if isinstance(sys.stderr, DummyFile):
            sys.stderr = sys.__stderr__
        return
    sys.stdout, sys.stderr = _saved_streams
    _saved_streams = None

In [3]:
# Paths for the PCAWG cohort. They are passed, in this order, as the first three
# arguments of HRDTimerUtils.process_vcfs_early_late in the following cell.
# Input directory (presumably MutationTimeR output VCFs, per its name — verify).
INPUT_PCAWG = "../raw_input_data/PCAWG_MutationTimeR_output"
# Output directory; created by the following cell when it does not exist yet.
OUTPUT_PCAWG = "../HRDTimer_PreProcess/PCAWG"
# Metadata CSV handed to the preprocessing call.
METADATA_PCAWG = "../data/metadata/pan_metadata_v5.csv"

In [4]:
# Create the PCAWG output directory if needed. exist_ok=True avoids the
# check-then-create race and raises if the path exists but is not a directory.
os.makedirs(OUTPUT_PCAWG, exist_ok=True)

# Processing all mutations (time_analysis=False)
HRDTimerUtils.process_vcfs_early_late(INPUT_PCAWG, OUTPUT_PCAWG, METADATA_PCAWG, time_analysis=False)
# Processing timing mutations (time_analysis=True)
HRDTimerUtils.process_vcfs_early_late(INPUT_PCAWG, OUTPUT_PCAWG, METADATA_PCAWG, time_analysis=True)

Processing VCFs: 100%|██████████| 317/317 [01:48<00:00,  2.91it/s]
Processing VCFs: 100%|██████████| 317/317 [01:26<00:00,  3.65it/s]


In [2]:
# Paths for the SCANB cohort. They are passed, in this order, as the first three
# arguments of HRDTimerUtils.process_vcfs_early_late in the following cell.
# Input directory (presumably unfiltered MutationTimeR output, per its name — verify).
INPUT_SCANB = "../raw_input_data/SCANB_unfiltered_MutationTimeR_output"
# Output directory; created by the following cell when it does not exist yet.
OUTPUT_SCANB = "../HRDTimer_PreProcess/SCANB"
# Metadata CSV handed to the preprocessing call (same file path as METADATA_PCAWG).
METADATA_SCANB = "../data/metadata/pan_metadata_v5.csv"

In [3]:
# Create the SCANB output directory if needed. exist_ok=True avoids the
# check-then-create race and raises if the path exists but is not a directory.
os.makedirs(OUTPUT_SCANB, exist_ok=True)

# Processing all mutations (time_analysis=False)
HRDTimerUtils.process_vcfs_early_late(INPUT_SCANB, OUTPUT_SCANB, METADATA_SCANB, time_analysis=False)
# Processing timing mutations (time_analysis=True)
HRDTimerUtils.process_vcfs_early_late(INPUT_SCANB, OUTPUT_SCANB, METADATA_SCANB, time_analysis=True)

Processing VCFs: 100%|██████████| 240/240 [00:18<00:00, 12.67it/s]
Processing VCFs: 100%|██████████| 240/240 [00:14<00:00, 16.41it/s]


In [4]:
# Paths for the INFORM cohort. They are passed, in this order, as the first three
# arguments of HRDTimerUtils.process_vcfs_early_late in the following cell.
# Input directory (presumably germline-filtered MutationTimeR output, per its name — verify).
INPUT_INFORM = "../raw_input_data/INFORM_0.001popAF_germline_filtered_MutationTimeR_output"
# Output directory; created by the following cell when it does not exist yet.
OUTPUT_INFORM = "../HRDTimer_PreProcess/INFORM"
# Metadata CSV handed to the preprocessing call (same file path as METADATA_PCAWG).
METADATA_INFORM = "../data/metadata/pan_metadata_v5.csv"

In [5]:
# Create the INFORM output directory if needed. exist_ok=True avoids the
# check-then-create race and raises if the path exists but is not a directory.
os.makedirs(OUTPUT_INFORM, exist_ok=True)

# Processing all mutations (time_analysis=False)
HRDTimerUtils.process_vcfs_early_late(INPUT_INFORM, OUTPUT_INFORM, METADATA_INFORM, time_analysis=False)
# Processing timing mutations (time_analysis=True)
HRDTimerUtils.process_vcfs_early_late(INPUT_INFORM, OUTPUT_INFORM, METADATA_INFORM, time_analysis=True)

Processing VCFs: 100%|██████████| 19/19 [00:12<00:00,  1.56it/s]
Processing VCFs: 100%|██████████| 19/19 [00:10<00:00,  1.77it/s]


In [4]:
# Root directory that is searched below for `all_mut` and `timing` sub-directories.
base_dir = "../HRDTimer_PreProcess"

def find_all_mut_and_timing_dirs(base_dir):
    """Collect every directory named ``all_mut`` or ``timing`` under ``base_dir``.

    Args:
        base_dir: Root directory to walk recursively.

    Returns:
        list[str]: Matching directory paths, each built by joining the walked
        parent directory with the matching name, in ``os.walk`` order. Empty
        when nothing matches.
    """
    wanted_names = ['all_mut', 'timing']
    return [
        os.path.join(parent, subdir)
        for parent, subdirs, _files in os.walk(base_dir)
        for subdir in subdirs
        if subdir in wanted_names
    ]

# Every `all_mut` / `timing` directory found under base_dir.
dirs_to_process = find_all_mut_and_timing_dirs(base_dir)

for d in dirs_to_process:
    # Paths containing "INFORM" are run with GRCh38; every other path with GRCh37.
    genome_build = "GRCh38" if "INFORM" in d else "GRCh37"
    print(f"Processing {d} with genome build {genome_build} ...")
    suppress_prints()
    try:
        HRDTimerUtils.run_Signature_Analysis(d, genome_build=genome_build)
    finally:
        # Always re-enable output, even when the analysis raises; otherwise
        # every later print in the notebook would stay silenced.
        restore_prints()


Processing ../HRDTimer_PreProcess/INFORM/Breast/all_mut with genome build GRCh38 ...
|████████████████████████████████████████| 19/19 [100%] in 6.9s (2.76/s) 
|████████████████████████████████████████| 19/19 [100%] in 6.5s (2.94/s) 
|████████████████████████████████████████| 19/19 [100%] in 9.2s (2.07/s) 
Processing ../HRDTimer_PreProcess/INFORM/Breast/timing with genome build GRCh38 ...
|████████████████████████████████████████| 19/19 [100%] in 6.2s (3.07/s) 
|████████████████████████████████████████| 19/19 [100%] in 5.6s (3.37/s) 
|████████████████████████████████████████| 19/19 [100%] in 6.9s (2.74/s) 
Processing ../HRDTimer_PreProcess/PCAWG/Breast/all_mut with genome build GRCh37 ...
|████████████████████████████████████████| 207/207 [100%] in 1:13.0 (2.84/s) 
|████████████████████████████████████████| 184/184 [100%] in 1:06.1 (2.78/s) 
|████████████████████████████████████████| 208/208 [100%] in 1:32.4 (2.25/s) 
Processing ../HRDTimer_PreProcess/PCAWG/Breast/timing with genome bui