# Combines results across chromosomes

Post-run notebook that merges the per-chromosome tables (summary and expanded table) into combined tables to enable genome-wide analysis.

Instructions:
1. Run each cell from the beginning in order
2. Modify the variables where suggested

In [1]:
import pandas as pd
import os
import glob
import re

In [1]:
def _chrom_order(chrom):
    """
    Map a chromosome name to a numeric sort key

    Numbered chromosomes map to their own number, X to 23, Y to 24 and the
    mitochondrial chromosome to 25. The "chr" prefix is optional for every
    name, so "chr7" and "7" receive the same key.

    Parameters
    ----------
    chrom : str
        Chromosome name, e.g. "chr1", "1", "chrX", "MT"

    Returns
    -------
    int or float
        The sort key; float('inf') for any unrecognised name so that it is
        placed after every recognised chromosome
    """
    # The prefix is optional here because the named chromosomes below are
    # already accepted without it
    m = re.match(r'^(?:chr)?(\d+)$', chrom)
    if m:
        return int(m.group(1))
    if chrom in ('chrX', 'X'):
        return 23
    if chrom in ('chrY', 'Y'):
        return 24
    if chrom in ('chrM', 'M', 'chrMT', 'MT'):
        return 25
    # put everything else at the end
    return float('inf')

In [None]:
def extract_comments(csv_path: str) -> list[str]:
    """
    Collect the block of "#" lines found at the very top of a csv file

    Reading ends at the first line that does not begin with "#", so "#" lines
    appearing later in the file are not returned. In every returned line the
    trailing newline is removed and each "chr<number>" is rewritten as "chr*".

    Parameters
    ----------
    csv_path : str
        Path to the csv file to read

    Returns
    -------
    list[str]
        The leading comment lines, in file order (empty if there are none)
    """
    header = []
    with open(csv_path, 'r') as handle:
        for raw in handle:
            if not raw.startswith('#'):
                # first non-comment line marks the end of the header
                break
            header.append(re.sub(r'chr\d+', 'chr*', raw.rstrip('\n')))
    return header


def combine_results(results_dir, folder_pattern, result_type):
    """
    Search for folders under `results_dir` matching `folder_pattern`, then for
    each folder:
      - load <folder>/<folder>_results_<result_type>/<folder>_summary_table.csv
      - load <folder>/<folder>_results_<result_type>/<folder>_expanded_table.csv
    and concatenate all summaries and all expanded tables into two DataFrames.

    The header comments of the first matched folder's summary table (in
    chromosome-number order) are printed and returned.

    Parameters
    ----------
    results_dir : str
        Path to the directory containing all result-folders
    folder_pattern : str
        A pattern with exactly one asterisk (e.g. "splenic-B-cell_*_50Kb") used
        to match subfolders. The asterisk only matches digits, so folders of
        lettered chromosomes are not matched
    result_type : str
        The suffix after "_results_" in each folder (e.g. "all", "p-0.01", etc)

    Returns
    -------
    (combined_summary, combined_expanded, comments) : tuple
        combined_summary, combined_expanded : pd.DataFrame
            The concatenated summary and expanded tables, ordered by chromosome
            and, when the table has a "start" column, by start position
        comments : list of str
            Leading "#" lines of the first summary csv, as returned by
            `extract_comments`

    Raises
    ------
    ValueError
        If `folder_pattern` does not contain exactly one asterisk
    FileNotFoundError
        If no folder matches, or a matched folder lacks one of the two csv files
    """
    # full pattern, only used to report what was searched for
    search_pattern = os.path.join(results_dir, folder_pattern)

    if folder_pattern.count('*') != 1:
        raise ValueError(
            f"folder_pattern must contain exactly one '*', got: {folder_pattern!r}"
        )

    # Only match with digits for chromosomes (not letters)
    prefix, suffix = folder_pattern.split('*')
    pattern = re.compile(rf"^{re.escape(prefix)}(\d+){re.escape(suffix)}$")

    # os.listdir returns entries in arbitrary order; order the folders by
    # chromosome number so the folder that supplies the comments is always
    # the same one
    numbered = []
    for entry in os.listdir(results_dir):
        hit = pattern.match(entry)
        if hit:
            numbered.append((int(hit.group(1)), entry))
    numbered.sort()
    matched_folders = [os.path.join(results_dir, entry) for _, entry in numbered]

    if not matched_folders:
        raise FileNotFoundError(f"No folders found matching {search_pattern}")

    summary_frames = []
    expanded_frames = []

    for i, folder in enumerate(matched_folders):
        # basename is the actual folder name without the path
        base = os.path.basename(folder)

        # path to the results-type subdirectory
        res_subdir = os.path.join(folder, f"{base}_results_{result_type}")

        # CSV paths
        summary_csv = os.path.join(res_subdir, f"{base}_summary_table.csv")
        expanded_csv = os.path.join(res_subdir, f"{base}_expanded_table.csv")

        # Fail early, naming the missing file
        if not os.path.isfile(summary_csv):
            raise FileNotFoundError(f"Expected file not found: {summary_csv}")
        if not os.path.isfile(expanded_csv):
            raise FileNotFoundError(f"Expected file not found: {expanded_csv}")

        # extract the comments from a single summary csv only
        if i == 0:
            comments = extract_comments(summary_csv)
            for comment in comments:
                print(comment)

        summary_frames.append(pd.read_csv(summary_csv, comment='#'))
        expanded_frames.append(pd.read_csv(expanded_csv, comment='#'))

    # Concatenate all, then order by chromosome and position
    combined_summary = _sort_by_genomic_position(
        pd.concat(summary_frames, ignore_index=True)
    )
    combined_expanded = _sort_by_genomic_position(
        pd.concat(expanded_frames, ignore_index=True)
    )

    return combined_summary, combined_expanded, comments


def _sort_by_genomic_position(df):
    """
    Return `df` ordered by chromosome and then by "start" if that column exists

    A table without a "start" column is ordered by chromosome only; the sort is
    stable, so rows keep their existing relative order within a chromosome.
    The index of the returned frame is reset.
    """
    sort_keys = ["chrom_order"]
    if "start" in df.columns:
        sort_keys.append("start")

    # temporary numeric column so chromosomes sort numerically, not as text
    ordered = df.assign(chrom_order=df["chrom"].apply(_chrom_order))
    ordered = ordered.sort_values(by=sort_keys, kind="mergesort")
    return ordered.drop(columns=["chrom_order"]).reset_index(drop=True)

# Save the combined tables
def save_csv(df, save_dir, parameter_str_comment):
    """
    Save a dataframe to a csv file, preceded by optional comment lines

    Parameters
    ----------
    df : pd.DataFrame
        The input dataframe to save (written without its index)
    save_dir : str
        Path of the csv file to write (a file path, despite the name)
    parameter_str_comment : list
        A list containing parameters to be included in the beginning of the csv file as comments
        Each element in the list is a string that must start with a special character (e.g '#')
        If the list is empty (or None) no comment lines are written

    Returns
    -------
    None, but saves the dataframe to a csv file
    """
    # pandas asks for text handles passed to to_csv to be opened with
    # newline="" so that line endings are not translated a second time
    with open(save_dir, "w", newline="") as f:
        # Skip the header entirely when there are no comments, otherwise the
        # file would begin with an empty line
        if parameter_str_comment:
            f.write(os.linesep.join(parameter_str_comment) + os.linesep)
        df.to_csv(f, index=False)

**MODIFY the following cell as needed**

In [35]:
# Location of the results folders, i.e. the directory that contains the
# per-chromosome folders matched by `folder_pattern` below
results_dir = "/nfs/turbo/umms-minjilab/sionkim/miajet_output/"

# Experiment to combine, written as a folder pattern in which an asterisk *
# takes the place of the chromosome number so that one pattern matches the
# folders of multiple chromosomes
# Example: "splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr*_50Kb" will match all
# numbered chromosomes of that specific experiment run
# Other patterns, kept for convenience:
# folder_pattern = "splenic-B-cell_WT_insitu-hic_Kieffer-Kwon-2018_GSE82144_mm9_chr*_50Kb"
# folder_pattern = "DP-thymocytes_WT_hic_Guo-2022_GSE199059_mm10-remapped_chr*_50Kb"
# folder_pattern = "GSE199059_CD69negDPWTR1R2R3R4_merged_chr*_50Kb"
folder_pattern = "Repli-HiC_K562_WT_totalS_chr*_50Kb"

# Suffix that follows "_results_" in each folder
# Use "all" to combine all results, or the name of a specifically thresholded output
# Examples: "all", "p-0.01", "saliency-90", "saliency-90-p-0.1"
# Either a single string or a list of strings (one combined table pair per entry)
result_type = [
    "saliency-90-p-0.1",
    "saliency-90-p-0.05",
    "saliency-90-p-0.01",
    "saliency-90",
    "p-0.1",
    "p-0.05",
    "p-0.01",
    "all",
]

In [36]:
# One combined summary table and one combined expanded table per result type,
# stored in the same order as the result types
df_summary = []
df_expanded = []

# A single result type given as a string is handled like a one-element list
for rt in ([result_type] if isinstance(result_type, str) else result_type):
    # Will print out the parameters of the experiment
    # So that you can confirm this is indeed the correct experiment you are combining
    # Note: parameter_str is reassigned on every pass, so afterwards it holds
    # the comments returned for the last result type
    df_s, df_e, parameter_str = combine_results(results_dir, folder_pattern, rt)
    df_summary.append(df_s)
    df_expanded.append(df_e)

# Repli-HiC_K562_WT_totalS_chr*_50Kb
# Inputs
# * Hi-C file (.hic): /nfs/turbo/umms-minjilab/downloaded_data/Repli-HiC_K562_WT_totalS.hic
# 
# Required Parameters
# * Experiment type: replihic
# * Chromosome: chr*
# * Resolution: 50000
# * Save directory root: /nfs/turbo/umms-minjilab/sionkim/miajet_output
# 
# Extended Parameters
# * Significance threshold(s): [0.1, 0.05, 0.01]
# * Window size: 6000000
# * Normalization: VC_SQRT
# * Data type: observed
# * Jet widths (if specified): None
# * Angle range: [80, 100]
# * Saliency threshold on zero removed: 90
# * Hysteresis thresholding parameters: [0.01, 0.05]
# * Root within: None
# * Folder name: Repli-HiC_K562_WT_totalS_chr*_50Kb
# * Save directory: /nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr*_50Kb
# * Number of cores: 4
# * Verbose: True
# 
# Optional Parameters
# * Scale range: [ 1.5         1.66735513  1.85338208  2.06016407  2.29001675  2.54551412
#   2.82951734  3.14520683  3.49611782  3.88617999 

In [37]:
# Can visually inspect the summary table
# Index 0 is the combined table of the first result type; change the index to
# look at the table of another result type
df_summary[0]

Unnamed: 0,unique_id,chrom,start,end,length,input_mean,angle_mean,width_mean,jet_saliency,ks,p-val_raw,p-val_corr
0,chr12_2_3,chr12,8.046108e+07,8.089827e+07,700000,0.472,91.414,3.004,0.700,0.929,0.0,0.000
1,chr12_2357_7,chr12,8.042322e+07,8.079556e+07,600000,0.553,90.636,3.868,0.521,1.000,0.0,0.000
2,chr12_2_10,chr12,5.588850e+07,5.776503e+07,2600000,0.203,60.376,4.260,0.457,0.519,0.0,0.000
3,chr12_2_16,chr12,1.158524e+08,1.179368e+08,2950000,0.256,64.249,6.872,0.444,0.559,0.0,0.000
4,chr12_24_15,chr12,6.150615e+07,6.284005e+07,1850000,0.234,86.831,13.069,0.435,0.865,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
1656,chr16_221_6,chr16,8.579854e+07,8.673415e+07,1500000,0.109,78.613,3.483,0.143,0.733,0.0,0.000
1657,chr16_7268_0,chr16,7.812956e+07,7.850148e+07,550000,0.459,88.197,2.249,0.140,0.818,0.0,0.002
1658,chr16_4659_2,chr16,9.221034e+06,9.454015e+06,400000,0.416,90.853,4.156,0.136,1.000,0.0,0.001
1659,chr16_105_10,chr16,8.583431e+07,8.735676e+07,2050000,0.081,61.642,5.425,0.136,0.732,0.0,0.000


In [38]:
# Can visually inspect the expanded table
# Index 0 is the combined table of the first result type; change the index to
# look at the table of another result type
df_expanded[0]

Unnamed: 0,unique_id,chrom,x (bp),y (bp),x (pixels),y (pixels),width,angle_imagej,ridge_strength
0,chr12_2_3,chr12,8.046108e+07,8.015064e+07,2271.807,79.755,3.372,89.289,0.057
1,chr12_2_3,chr12,8.049363e+07,8.011162e+07,2271.716,78.743,3.178,92.001,0.064
2,chr12_2_3,chr12,8.052626e+07,8.007317e+07,2271.633,77.738,2.923,94.858,0.069
3,chr12_2_3,chr12,8.055620e+07,8.003335e+07,2271.493,76.752,2.918,96.964,0.071
4,chr12_2_3,chr12,8.058475e+07,7.999328e+07,2271.331,75.781,2.981,97.139,0.070
...,...,...,...,...,...,...,...,...,...
33497,chr16_5577_1,chr16,6.328554e+07,6.086218e+07,1756.128,49.874,1.321,117.317,0.051
33498,chr16_5577_1,chr16,6.329878e+07,6.081951e+07,1755.712,49.084,1.666,108.057,0.057
33499,chr16_5577_1,chr16,6.332165e+07,6.077359e+07,1755.386,48.111,1.865,95.834,0.060
33500,chr16_5577_1,chr16,6.335884e+07,6.073685e+07,1755.392,47.065,1.921,87.752,0.059


**(OPTIONAL) modify as needed**

In [39]:
# This is the save name for the combined results
# Essentially, the asterisk * in the folder pattern is replaced with "_combined"
# This may be overwritten if you want to save with a different name
save_name = folder_pattern.replace('*', '_combined')

# Folder to save the combined results (summary and expanded tables)
# You may overwrite the `results_dir` if you wish to save to a different location
save_dir = os.path.join(results_dir, save_name)

# `result_type` may be a single string or a list of strings
# Wrap a single string in a list, otherwise the loop below would iterate over
# its characters
result_type_list = [result_type] if isinstance(result_type, str) else list(result_type)

print("Saving combined tables to:")
print(f"* Folder location: {save_dir}")

# Full paths of the files to write, one entry per result type and in the same
# order as the result types
save_dir_summary = []
save_dir_expanded = []
for rt in result_type_list:
    print(f"* Result type: {rt}")
    # The result type is added to the file name so that every result type is
    # saved to its own pair of files within the same folder
    save_name_file = folder_pattern.replace('*', f'_combined_{rt}')

    # Below is the code to print out the save locations to help you modify the variables above
    # Once satisfied, move on to the next cell
    save_dir_s = os.path.join(save_dir, f"{save_name_file}_summary_table.csv")
    save_dir_e = os.path.join(save_dir, f"{save_name_file}_expanded_table.csv")

    save_dir_summary.append(save_dir_s)
    save_dir_expanded.append(save_dir_e)

    print(f"\t* Summary table: {save_name_file}_summary_table.csv")
    print(f"\t* Expanded table: {save_name_file}_expanded_table.csv")

Saving combined tables to:
* Folder location: /nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb
* Result type: saliency-90-p-0.1
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_expanded_table.csv
* Result type: saliency-90-p-0.05
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_expanded_table.csv
* Result type: saliency-90-p-0.01
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.01_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.01_50Kb_expanded_table.csv
* Result type: saliency-90
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combi

In [40]:
# Make directory if it doesn't exist already and save the files
os.makedirs(save_dir, exist_ok=True)

# The tables and the save paths are parallel lists; a length mismatch means the
# cells above were run out of order or with different settings
n_tables = len(df_summary)
if not (len(df_expanded) == len(save_dir_summary) == len(save_dir_expanded) == n_tables):
    raise ValueError(
        "Number of combined tables and number of save paths differ; "
        "re-run the cells above in order"
    )

# NOTE: the same `parameter_str` header comments are written to every file
for summary_df, summary_path, expanded_df, expanded_path in zip(
    df_summary, save_dir_summary, df_expanded, save_dir_expanded
):
    save_csv(summary_df, summary_path, parameter_str)
    save_csv(expanded_df, expanded_path, parameter_str)
    print(f"Saved\n *{summary_path}\n *{expanded_path}")

Saved
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_summary_table.csv
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_expanded_table.csv
Saved
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_summary_table.csv
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_expanded_table.csv
Saved
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.01_50Kb_summary_table.csv
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_c