# Combines results across chromosomes

Post-run notebook that merges the per-chromosome result tables (the summary table and the expanded table) into genome-wide tables, so that downstream analysis can be done across all chromosomes at once.

Instructions:
1. Run each cell from the beginning in order
2. Modify the variables where suggested

In [1]:
import pandas as pd
import os
import glob
import re

In [2]:
def _chrom_order(chrom):
    """
    Map chr names to an integer order
    """
    m = re.match(r'^chr(\d+)$', chrom)
    if m:
        return int(m.group(1))
    if chrom in ('chrX', 'X'):
        return 23
    if chrom in ('chrY', 'Y'):
        return 24
    if chrom in ('chrM', 'chrMT', 'MT'):
        return 25
    # put everything else at the end
    return float('inf')

In [21]:
def extract_comments(csv_path: str) -> list[str]:
    """
    Read the leading "#" comment lines of a csv file and return them.

    Reading stops at the first line that does not start with "#", so
    comment-looking lines further down the file are ignored. Every
    chromosome token in the returned text (chr<number>, chrX, chrY, chrM,
    chrMT) is replaced by "chr*", because the header of one per-chromosome
    file is reused as the header of the combined, genome-wide file.

    Parameters
    ----------
    csv_path : str
        Path to the csv file to read

    Returns
    -------
    list of str
        The comment lines without their trailing newline; empty if the file
        does not start with a comment.
    """
    # Digits are replaced unconditionally. The letter names are only replaced
    # when they are not followed by another letter or digit, so that ordinary
    # words starting with "chrM..." / "chrX..." are left alone while
    # "chrX_50Kb" is still recognised.
    chrom_token = re.compile(r'chr(?:\d+|(?:X|Y|MT|M)(?![A-Za-z0-9]))')

    comments = []
    with open(csv_path, 'r') as fh:
        for line in fh:
            if not line.startswith('#'):
                break
            comments.append(chrom_token.sub('chr*', line.rstrip('\n')))
    return comments


def _sort_by_chromosome(table):
    """Return `table` stably sorted by chromosome order with a fresh 0..n-1 index."""
    return (
        table.assign(chrom_order=table["chrom"].apply(_chrom_order))
        # mergesort is stable: rows of the same chromosome keep their order
        .sort_values(by="chrom_order", kind="mergesort")
        .drop(columns="chrom_order")
        .reset_index(drop=True)
    )


def combine_results(results_dir, folder_pattern, result_type):
    """
    Search for folders under `results_dir` matching `folder_pattern`, then for
    each folder:
      - load <folder>/<folder>_results_<result_type>/<folder>_summary_table.csv
      - load <folder>/<folder>_results_<result_type>/<folder>_expanded_table.csv
    and concatenate all summaries and all expanded tables into two DataFrames.

    The single asterisk in `folder_pattern` stands for the chromosome name and
    only matches 1-99, X, Y, M or MT, so an already-written "_combined" folder
    is never picked up. Folders are processed in chromosome order, which makes
    both the printed header and the row order reproducible.

    Parameters
    ----------
    results_dir : str
        Path to the directory containing all result folders
    folder_pattern : str
        Folder name with exactly one asterisk at the chromosome position
        (e.g. "splenic-B-cell_chr*_50Kb")
    result_type : str
        The suffix after "_results_" in each folder (e.g. "all", "p-0.01", etc)

    Returns
    -------
    (combined_summary, combined_expanded, comments) : tuple
        The concatenated summary table (pd.DataFrame), the concatenated
        expanded table (pd.DataFrame), and the header comment lines
        (list of str) taken from the first summary csv. The comments are
        also printed.

    Raises
    ------
    ValueError
        If `folder_pattern` does not contain exactly one asterisk.
    FileNotFoundError
        If no folder matches, or if a matched folder lacks either csv file.
    """
    if folder_pattern.count('*') != 1:
        raise ValueError(
            "folder_pattern must contain exactly one '*' at the chromosome "
            f"position, got: {folder_pattern!r}"
        )

    # only used to report what was searched for
    search_pattern = os.path.join(results_dir, folder_pattern)

    prefix, suffix = folder_pattern.split('*')

    # If this notebook was already run, the combined folder must not be
    # treated as an input
    exclude_name = f"{prefix}_combined{suffix}"

    # Only chromosome names are allowed where the asterisk is
    pattern = re.compile(
        rf"^{re.escape(prefix)}"      # literal text before the asterisk
        r"([1-9][0-9]?|X|Y|MT?)"      # 1-99, X, Y, M or MT
        rf"{re.escape(suffix)}$"      # literal text after the asterisk
    )

    # (sort key, folder name) pairs
    matched = []
    for name in os.listdir(results_dir):
        m = pattern.match(name)
        if m is None or name == exclude_name:
            continue
        matched.append((_chrom_order(f"chr{m.group(1)}"), name))

    if not matched:
        raise FileNotFoundError(f"No folders found matching {search_pattern}")

    # os.listdir order is arbitrary; process chromosomes in a fixed order
    matched.sort()

    summary_frames = []
    expanded_frames = []
    comments = []

    for i, (_, base) in enumerate(matched):
        # path to the results-type subdirectory
        res_subdir = os.path.join(results_dir, base, f"{base}_results_{result_type}")

        # CSV paths
        summary_csv = os.path.join(res_subdir, f"{base}_summary_table.csv")
        expanded_csv = os.path.join(res_subdir, f"{base}_expanded_table.csv")

        for csv_path in (summary_csv, expanded_csv):
            if not os.path.isfile(csv_path):
                raise FileNotFoundError(f"Expected file not found: {csv_path}")

        # extract the comments from a single summary csv only
        if i == 0:
            comments = extract_comments(summary_csv)
            for comment in comments:
                print(comment)

        summary_frames.append(pd.read_csv(summary_csv, comment='#'))
        expanded_frames.append(pd.read_csv(expanded_csv, comment='#'))

    # Concatenate, then order by chromosome; rows within one chromosome keep
    # the order they had in the per-chromosome file
    combined_summary = _sort_by_chromosome(pd.concat(summary_frames, ignore_index=True))
    combined_expanded = _sort_by_chromosome(pd.concat(expanded_frames, ignore_index=True))

    return combined_summary, combined_expanded, comments

# Save the combined tables
def save_csv(df, save_dir, parameter_str_comment):
    """
    Save a dataframe to a csv file, preceded by comment lines

    Parameters
    ----------
    df : pd.DataFrame
        The input dataframe to save (written without its index)
    save_dir : str
        Path of the csv FILE to write (despite the name, not a directory).
        An existing file is overwritten.
    parameter_str_comment : list of str
        Lines to write at the beginning of the csv file as comments.
        Each element must start with a special character (e.g '#') so that it
        can be skipped when reading the file back. A single string is treated
        as one line; None or an empty list writes no header at all.

    Returns
    -------
    None, but saves the dataframe to a csv file
    """
    if parameter_str_comment is None:
        header_lines = []
    elif isinstance(parameter_str_comment, str):
        # joining a bare string would put every character on its own line
        header_lines = [parameter_str_comment]
    else:
        header_lines = list(parameter_str_comment)

    # pandas asks for newline="" on text handles it writes to; the header is
    # then written with os.linesep so it uses the same line ending as the rows
    with open(save_dir, "w", newline="") as f:
        for line in header_lines:
            f.write(f"{line}{os.linesep}")
        df.to_csv(f, index=False)

**MODIFY the following cell as needed**

In [41]:
# --- Which experiment to combine ---------------------------------------------
# Give the folder name of one per-chromosome run, with an asterisk * in place
# of the chromosome so that the runs of all chromosomes are matched.
# Examples for other experiments:
# folder_pattern = "splenic-B-cell_WT_insitu-hic_Kieffer-Kwon-2018_GSE82144_mm9_chr*_50Kb"
# folder_pattern = "DP-thymocytes_WT_hic_Guo-2022_GSE199059_mm10-remapped_chr*_50Kb"
# folder_pattern = "GSE199059_CD69negDPWTR1R2R3R4_merged_chr*_50Kb"
folder_pattern = "Repli-HiC_K562_WT_totalS_chr*_50Kb"

# --- Where the per-chromosome result folders live ----------------------------
results_dir = "/nfs/turbo/umms-minjilab/sionkim/miajet_output/"

# --- Which result tables to combine ------------------------------------------
# Each entry is the suffix after "_results_" in a result folder: "all" for the
# unfiltered output, or the name of a thresholded output.
# A single string (e.g. "all") is accepted by the next cell as well.
result_type = [
    "saliency-90-p-0.1",
    "saliency-90-p-0.05",
    "saliency-90-p-0.01",
    "saliency-90",
    "p-0.1",
    "p-0.05",
    "p-0.01",
    "all",
]

In [42]:
# Combine the tables of every requested result type.
# `result_type` may be one string or a list of strings; either way the results
# are collected into two parallel lists with one entry per result type.
#
# Each call prints the parameters of the experiment, so that you can confirm
# this is indeed the experiment you meant to combine.
df_summary, df_expanded = [], []
for rt in ([result_type] if isinstance(result_type, str) else result_type):
    df_s, df_e, parameter_str = combine_results(results_dir, folder_pattern, rt)
    df_summary.append(df_s)
    df_expanded.append(df_e)

# Repli-HiC_K562_WT_totalS_chr*_50Kb
# Inputs
# * Hi-C file (.hic): /nfs/turbo/umms-minjilab/downloaded_data/Repli-HiC_K562_WT_totalS.hic
# 
# Required Parameters
# * Experiment type: replihic
# * Chromosome: chr*
# * Resolution: 50000
# * Save directory root: /nfs/turbo/umms-minjilab/sionkim/miajet_output
# 
# Extended Parameters
# * Significance threshold(s): [0.1, 0.05, 0.01]
# * Window size: 6000000
# * Normalization: VC_SQRT
# * Data type: observed
# * Jet widths (if specified): None
# * Angle range: [80, 100]
# * Saliency threshold on zero removed: 90
# * Hysteresis thresholding parameters: [0.01, 0.05]
# * Root within: None
# * Folder name: Repli-HiC_K562_WT_totalS_chr*_50Kb
# * Save directory: /nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr*_50Kb
# * Number of cores: 4
# * Verbose: True
# 
# Optional Parameters
# * Scale range: [ 1.5         1.66735513  1.85338208  2.06016407  2.29001675  2.54551412
#   2.82951734  3.14520683  3.49611782  3.88617999 

In [43]:
# Can visually inspect the summary table
df_summary[0]

Unnamed: 0,unique_id,chrom,start,end,length,input_mean,angle_mean,width_mean,jet_saliency,ks,p-val_raw,p-val_corr
0,chr1_25_12,chr1,2.223542e+08,2.240925e+08,2350000,0.215,76.983,6.368,0.263,0.617,0.000,0.000
1,chr1_1071_12,chr1,2.246427e+08,2.251337e+08,800000,0.281,92.141,3.698,0.261,0.688,0.000,0.002
2,chr1_22_18,chr1,1.596501e+08,1.605675e+08,1300000,0.186,87.539,11.781,0.260,0.769,0.000,0.000
3,chr1_70_20,chr1,6.989730e+07,7.081756e+07,1400000,0.243,89.442,24.892,0.230,0.821,0.000,0.000
4,chr1_14_20,chr1,2.463660e+08,2.468952e+08,850000,0.306,90.313,15.746,0.228,1.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...
1752,chrX_13295_1,chrX,1.359620e+08,1.363016e+08,450000,0.411,82.786,2.158,0.147,0.778,0.003,0.011
1753,chrX_1779_9,chrX,9.522445e+06,9.961117e+06,700000,0.387,95.874,4.096,0.145,0.714,0.001,0.004
1754,chrX_1046_0,chrX,9.968278e+07,9.999746e+07,550000,0.170,98.563,1.408,0.139,0.909,0.000,0.001
1755,chrX_284_6,chrX,3.618103e+07,3.671831e+07,850000,0.168,93.617,3.675,0.138,0.647,0.001,0.005


In [44]:
# Can visually inspect the expanded table
df_expanded[0]

Unnamed: 0,unique_id,chrom,x (bp),y (bp),x (pixels),y (pixels),width,angle_imagej,ridge_strength
0,chr1_3459_8,chr1,2.108860e+08,2.087218e+08,5934.565,53.541,3.919,44.243,0.011
1,chr1_3459_8,chr1,2.109325e+08,2.086999e+08,5934.912,52.573,4.003,57.446,0.011
2,chr1_3459_8,chr1,2.109884e+08,2.086819e+08,5935.447,51.527,1.436,70.582,0.011
3,chr1_3459_8,chr1,2.110090e+08,2.086074e+08,5934.686,50.183,4.785,78.052,0.010
4,chr1_3459_8,chr1,2.110364e+08,2.085588e+08,5934.385,49.107,6.435,81.877,0.009
...,...,...,...,...,...,...,...,...,...
34879,chrX_146_1,chrX,1.407019e+08,1.398306e+08,3967.743,71.824,1.435,81.464,0.036
34880,chrX_9205_3,chrX,9.580091e+07,9.367039e+07,2679.943,54.016,1.538,90.758,0.020
34881,chrX_9205_3,chrX,9.576210e+07,9.370265e+07,2679.851,55.021,1.089,92.334,0.018
34882,chrX_109_9,chrX,4.358364e+07,4.173152e+07,1206.953,57.953,4.610,87.622,0.014


**(OPTIONAL) modify as needed**

In [45]:
# Name of the folder that will hold the combined results
save_name = folder_pattern.replace('*', '_combined')

# Folder to save the combined results (summary and expanded tables)
# You may replace `results_dir` here if you wish to save to a different location
save_dir = os.path.join(results_dir, save_name)

# `result_type` may be a single string; iterating over it directly would loop
# over its characters, so wrap it in a list first
result_types = [result_type] if isinstance(result_type, str) else list(result_type)

print("Saving combined tables to:")
print(f"* Folder location: {save_dir}")

# One output path per result type, in the same order as the combined tables
save_dir_summary = []
save_dir_expanded = []
for rt in result_types:
    print(f"* Result type: {rt}")
    save_name_file = folder_pattern.replace('*', f'_combined_{rt}')

    # The save locations are printed to help you adjust the variables above
    # Once satisfied, move on to the next cell
    save_dir_s = os.path.join(save_dir, f"{save_name_file}_summary_table.csv")
    save_dir_e = os.path.join(save_dir, f"{save_name_file}_expanded_table.csv")

    save_dir_summary.append(save_dir_s)
    save_dir_expanded.append(save_dir_e)

    print(f"\t* Summary table: {save_name_file}_summary_table.csv")
    print(f"\t* Expanded table: {save_name_file}_expanded_table.csv")

Saving combined tables to:
* Folder location: /nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb
* Result type: saliency-90-p-0.1
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_expanded_table.csv
* Result type: saliency-90-p-0.05
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_expanded_table.csv
* Result type: saliency-90-p-0.01
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.01_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.01_50Kb_expanded_table.csv
* Result type: saliency-90
	* Summary table: Repli-HiC_K562_WT_totalS_chr_combined_saliency-90_50Kb_summary_table.csv
	* Expanded table: Repli-HiC_K562_WT_totalS_chr_combi

In [46]:
# Make directory if it doesn't exist already and save the files
os.makedirs(save_dir, exist_ok=True)

for i in range(len(result_type)):
    save_csv(df_summary[i], save_dir_summary[i], parameter_str)
    save_csv(df_expanded[i], save_dir_expanded[i], parameter_str)
    print(f"Saved\n *{save_dir_summary[i]}\n *{save_dir_expanded[i]}")

Saved
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_summary_table.csv
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.1_50Kb_expanded_table.csv
Saved
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_summary_table.csv
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.05_50Kb_expanded_table.csv
Saved
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_combined_saliency-90-p-0.01_50Kb_summary_table.csv
 */nfs/turbo/umms-minjilab/sionkim/miajet_output/Repli-HiC_K562_WT_totalS_chr_combined_50Kb/Repli-HiC_K562_WT_totalS_chr_c