# Combines results across chromosomes

Post-run program to merge tables (summary and expanded table) across chromosomes to enable genome-wide analysis

Instructions:
1. Run each cell from the beginning in order
2. Modify the variables where suggested

In [47]:
import pandas as pd
import os
import glob
import re

In [48]:
def extract_comments(csv_path: str) -> list[str]:
    """
    Read the leading comments “#” from a csv and return them 
    Stops as soon as it hits a non-# line
    """
    comments = []
    with open(csv_path, 'r') as fh:
        for line in fh:
            if line.startswith('#'):
                # strip newline, then replace chr<number> with chr*
                txt = line.rstrip('\n')
                txt = re.sub(r'chr\d+', 'chr*', txt)
                comments.append(txt)
            else:
                break
    return comments


def combine_results(results_dir, folder_pattern, result_type):
    """
    Search for folders under `results_dir` matching `folder_pattern` (which may
    contain a glob-style asterisk), then for each folder:
      - load <folder>/<folder>_results_<result_type>/<folder>_summary_table.csv
      - load <folder>/<folder>_results_<result_type>/<folder>_expanded_table.csv
    and concatenate all summaries and all expanded tables into two DataFrames.

    Parameters
    ----------
    results_dir : str
        Path to the directory containing all result‐folders
    folder_pattern : str
        A glob‐style pattern (e.g. "splenic-B-cell_*_50Kb") used to match subfolders
    result_type : str
        The suffix after "_results" in each folder (e.g. "all", "p-0.01", etc)

    Returns
    -------
    (combined_summary, combined_expanded) : tuple of pd.DataFrame
        The concatenated summary and expanded tables.
    """
    # build the full glob pattern
    search_pattern = os.path.join(results_dir, folder_pattern)

    # Only match with digits for chromosomes (not letters)
    prefix, suffix = folder_pattern.split('*')
    pattern = re.compile(rf"^{re.escape(prefix)}\d+{re.escape(suffix)}$")

    matched_folders = [
        os.path.join(results_dir, d)
        for d in os.listdir(results_dir)
        if pattern.match(d)
    ]

    if not matched_folders:
        raise FileNotFoundError(f"No folders found matching {search_pattern}")

    summary_frames = []
    expanded_frames = []

    for i, folder in enumerate(matched_folders):
        # basename is the actual folder name without the path
        base = os.path.basename(folder)

        # path to the results‐type subdirectory
        res_subdir = os.path.join(folder, f"{base}_results_{result_type}")

        # CSV paths
        summary_csv  = os.path.join(res_subdir, f"{base}_summary_table.csv")
        expanded_csv = os.path.join(res_subdir, f"{base}_expanded_table.csv")

        # Load and collect
        if not os.path.isfile(summary_csv):
            raise FileNotFoundError(f"Expected file not found: {summary_csv}")
        if not os.path.isfile(expanded_csv):
            raise FileNotFoundError(f"Expected file not found: {expanded_csv}")
        
        # extract the comments from a single summary csv only
        if i == 0:
            comments = extract_comments(summary_csv)
            for comment in comments:
                print(comment)

        summary_frames.append(pd.read_csv(summary_csv, comment='#'))
        expanded_frames.append(pd.read_csv(expanded_csv, comment='#'))

    # Concatenate all and reset the index
    combined_summary  = pd.concat(summary_frames,  ignore_index=True)
    combined_expanded = pd.concat(expanded_frames, ignore_index=True)

    return combined_summary, combined_expanded, comments

# Save the combined tables
def save_csv(df, save_dir, parameter_str_comment):
    """
    Save a dataframe to a csv file 

    Parameters
    ----------
    df : pd.DataFrame
        The input dataframe to save
    save_dir : str
        The name of the file to save the dataframe to
    parameter_str : list
        A list containing parameters to be included in the beginning of the csv file as comments
        Each element in the list is a string that must start with a special character (e.g '#')
    
    Returns
    -------
    None, but saves the dataframe to a csv file
    """
    with open(save_dir, "w") as f:
        f.write("\n".join(parameter_str_comment) + "\n")
        df.to_csv(f, index=False)

**MODIFY the following cell as needed**

In [None]:
# Let the notebook know what experiment type you would like to combine
# Specify it as a folder pattern with an asterisk * to match multiple directories
# Example: "splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr*_50Kb" will match all chromosomes
# of that specific experiment run
folder_pattern = "splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr*_50Kb"

# Location of the results folders
# Where is the folder above located?
results_dir = "/nfs/turbo/umms-minjilab/sionkim/miajet_output/"

# This is the suffix after the "_results" in each folder 
# You can specify "all" to combine all results or a specifically thresholded output
# Examples: "all", "p-0.01", "saliency-90", "saliency-90-p-0.1"
result_type = "all"

In [None]:
df_summary, df_expanded, parameter_str = combine_results(results_dir, folder_pattern, result_type)
# Will print out the parametesr of the experiment
# So that you can confirm this is indeed the correct experiment you are combining

# splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr*_50Kb
# Inputs
# * Hi-C file (.hic): /nfs/turbo/umms-minjilab/jetfinder/data/splenic-B-cell_WT_insitu-hic_GSE82144_mm9.hic
# 
# Required Parameters
# * Experiment type: hic
# * Chromosome: chr*
# * Resolution: 50000
# * Save directory root: /nfs/turbo/umms-minjilab/sionkim/miajet_output
# 
# Extended Parameters
# * Significance threshold(s): [0.1, 0.05, 0.01]
# * Window size: 6000000
# * Normalization: KR
# * Data type: oe
# * Jet widths (if specified): None
# * Angle range: [80, 100]
# * Saliency threshold on zero removed: 90
# * Hysteresis thresholding parameters: [0.01, 0.05]
# * Root within: 10
# * Folder name: splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr*_50Kb
# * Save directory: /nfs/turbo/umms-minjilab/sionkim/miajet_output/splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr*_50Kb
# * Number of cores: 4
# * Verbose: True
# 
# Optional Parameters
# * Scale range: [ 1.5         1.66735513  1.85338208  2.06016407  2.29001675  2.54551412
# 

In [55]:
# Can visually inspect the summary table
df_summary

Unnamed: 0,unique_id,chrom,start,end,length,input_mean,angle_mean,width_mean,jet_saliency,ks,p-val_raw,p-val_corr
0,chr4_12_12,chr4,4.479348e+07,4.487310e+07,636396.103,0.173,82.717,6.680,0.127,1.000,0.000,0.000
1,chr4_3632_3,chr4,9.508966e+07,9.509972e+07,176776.695,0.178,86.516,4.430,0.074,0.800,0.040,0.100
2,chr4_59_7,chr4,1.080276e+08,1.080868e+08,388908.730,0.137,95.848,3.481,0.067,0.909,0.000,0.000
3,chr4_152_2,chr4,1.427393e+08,1.427645e+08,212132.034,0.102,86.149,2.344,0.065,0.333,0.536,0.635
4,chr4_81_12,chr4,1.407019e+08,1.408084e+08,707106.781,0.083,97.975,7.202,0.064,0.550,0.002,0.012
...,...,...,...,...,...,...,...,...,...,...,...,...
4688,chr3_1134_5,chr3,1.452871e+08,1.453288e+08,141421.356,0.065,139.936,2.830,0.000,0.250,0.800,0.888
4689,chr3_796_0,chr3,7.950438e+07,7.952916e+07,70710.678,0.119,135.415,1.233,0.000,1.000,0.167,0.250
4690,chr3_649_7,chr3,1.325399e+08,1.325849e+08,176776.695,0.094,39.657,0.741,0.000,0.800,0.040,0.119
4691,chr3_839_7,chr3,1.444199e+08,1.444199e+08,70710.678,0.115,129.134,0.000,0.000,1.000,0.167,0.250


In [56]:
# Can visually inspect the expanded table
df_expanded

Unnamed: 0,unique_id,chrom,x (bp),y (bp),x (pixels),y (pixels),width,angle_imagej,ridge_strength
0,chr4_12_12,chr4,4.487047e+07,4.469983e+07,1267.129,81.733,4.082,90.891,0.013
1,chr4_12_12,chr4,4.490542e+07,4.466490e+07,1267.130,80.744,4.142,91.274,0.014
2,chr4_12_12,chr4,4.494033e+07,4.462991e+07,1267.129,79.756,4.232,91.584,0.015
3,chr4_12_12,chr4,4.497521e+07,4.459485e+07,1267.126,78.767,4.376,91.787,0.016
4,chr4_12_12,chr4,4.501008e+07,4.455974e+07,1267.123,77.777,4.547,91.826,0.017
...,...,...,...,...,...,...,...,...,...
27353,chr3_2373_1,chr3,1.470098e+08,1.462504e+08,4147.740,73.407,1.067,41.124,0.004
27354,chr3_2373_1,chr3,1.470430e+08,1.462550e+08,4148.274,73.003,1.151,43.985,0.004
27355,chr3_2373_1,chr3,1.470778e+08,1.462941e+08,4149.319,73.062,3.943,42.358,0.003
27356,chr3_2373_1,chr3,1.471104e+08,1.462975e+08,4149.829,72.649,3.933,41.534,0.003


**(OPTIONAL) modify as needed**

In [None]:
save_name = folder_pattern.replace('*', '_combined') 
# This is the save name for the combined results
# Essentially, the asterisk * in the folder pattern is replaced with "_combined"
# This may be overwritten if you want to save with a different name

save_dir = os.path.join(results_dir, save_name)
# Folder to save the combined results (summary and expanded tables)
# You may overwrite the `results_dir` if you wish to save to a different location

# Below is the code to print out the save locations to help you modify the variables abovef
# Once satisfied, move on to the next cell
save_dir_summary = os.path.join(save_dir, f"{save_name}_summary_table.csv")
save_dir_expanded = os.path.join(save_dir, f"{save_name}_expanded_table.csv")

print("Saving combined tables to:")
print(f"* Folder location: {save_dir}")
print(f"* Summary table location: {save_dir_summary}")
print(f"* Expanded table location: {save_dir_expanded}")

Saving combined tables to:
* Folder location: /nfs/turbo/umms-minjilab/sionkim/miajet_output/splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr_combined_50Kb
* Summary table location: /nfs/turbo/umms-minjilab/sionkim/miajet_output/splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr_combined_50Kb/splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr_combined_50Kb_summary_table.csv
* Expanded table location: /nfs/turbo/umms-minjilab/sionkim/miajet_output/splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr_combined_50Kb/splenic-B-cell_WT_insitu-hic_GSE82144_mm9_chr_combined_50Kb_expanded_table.csv


In [None]:
# Make directory if it doesn't exist already and save the files
os.makedirs(save_dir, exist_ok=True)

save_csv(df_summary, save_dir_summary, parameter_str)
save_csv(df_summary, save_dir_expanded, parameter_str)