## Comparing SVs from cenHap4 derived from centrolign induced pairwise and Fedor's HorHap analysis

In [1]:
# Import statements
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
### This function reads in the SV bed files into a pandas DF for a given clade. 

def read_sv_bed_files(clade, bed_folder):
    """
    Read all .bed files in a folder and add a 'clade' column to the data frame
    Each bed file has columns:
        sample1, start, end, sample2, start, end, type, diff
    """
    all_beds = []
    bed_files = [f for f in os.listdir(bed_folder) if f.endswith(".bed")]
    print(f"Found {len(bed_files)} SV bed files for clade '{clade}'")

    # Ensure folder exists
    if not os.path.isdir(bed_folder):
        raise FileNotFoundError("Folder not found for clade '{}': {}".format(clade, bed_folder))

    for filename in os.listdir(bed_folder):
        if filename.endswith(".bed"):
            bed_path = os.path.join(bed_folder, filename)
            try:
                df = pd.read_csv(
                    bed_path,
                    sep="\t",
                    header=None,
                    names=["sample1", "start1", "end1", "sample2", "start2", "end2", "type", "diff"]
                )
                df["clade"] = clade
                df["source_file"] = filename  # optional, helps track origin
                df["length"] = np.where(
                    df["type"] == "I",
                    df["end2"] - df["start2"],  # insertion → use sample2 coords
                    df["end1"] - df["start1"]  # otherwise (deletion) → sample1 coords
                )
                all_beds.append(df)
            except Exception as e:
                print("Warning: Could not read {}: {}".format(bed_path, e))

    if all_beds:
        return pd.concat(all_beds, ignore_index=True)
    else:
        print("Warning: No .bed files found in {} for clade '{}'".format(bed_folder, clade))
        return pd.DataFrame(columns=["sample1", "start1", "end1", "sample2", "start2", "end2", "type", "diff", "clade", "source_file"])


In [3]:
horhap_beds="/private/groups/patenlab/mira/centrolign/analysis/SVs_pairwise/chr12/cenHap4_benchmarking_HorHaps/fedor_horHap_SV_beds"
cen_beds="/private/groups/patenlab/mira/centrolign/analysis/SVs_pairwise/chr12/cenHap4_benchmarking_HorHaps/SV_beds_asm_coords"
horhap_svs = read_sv_bed_files("cenhap4", horhap_beds)
centrolign_svs=read_sv_bed_files("cenhap4", cen_beds)

Found 1711 SV bed files for clade 'cenhap4'
Found 1485 SV bed files for clade 'cenhap4'


In [None]:
### Sanity check: missing samples 

# Convert each row to a frozenset (unordered pair)
pairs1 = set(centrolign_svs.apply(lambda row: frozenset([row["sample1"], row["sample2"]]), axis=1))
pairs2 = set(horhap_svs.apply(lambda row: frozenset([row["sample1"], row["sample2"]]), axis=1))

# Pairs missing in df2
missing_in_df2 = pairs1 - pairs2
print("Pairs in df1 but missing in df2:", missing_in_df2)

# Pairs missing in df1
missing_in_df1 = pairs2 - pairs1
print("Pairs in df2 but missing in df1:", missing_in_df1)

### Reason we are missing samples is that Fedor is using the full HPRC list, so some samples from his list
### were excluded in our QC 

In [8]:
### Sanity check - are the same samples used as ref-query in both SV sets?
# Create a combined pair key for clarity
# Create (sample1, sample2) pair tuples
pairs1 = list(zip(centrolign_svs["sample1"], centrolign_svs["sample2"]))
pairs2 = list(zip(horhap_svs["sample1"], horhap_svs["sample2"]))

# Compute sets for easy comparison
set1 = set(pairs1)
set2 = set(pairs2)
set2_rev = set((b, a) for a, b in set2)

# Determine relationships
same_order = set1 & set2
reversed_order = set1 & set2_rev
unique_to_df1 = set1 - set2 - set2_rev
unique_to_df2 = set2 - set1 - set((b, a) for a, b in set1)

# Print counts
print(f"Total pairs in df1: {len(set1)}")
print(f"Total pairs in df2: {len(set2)}")
print(f"Pairs matching in same order: {len(same_order)}")
print(f"Pairs matching in reversed order: {len(reversed_order)}")
print(f"Pairs unique to df1: {len(unique_to_df1)}")
print(f"Pairs unique to df2: {len(unique_to_df2)}")

# Print first few reversed pairs
print("First few reversed pairs:")
for i, pair in enumerate(reversed_order):
    if i >= 5:  # limit to first 5
        break
    print(pair)

Total pairs in df1: 1485
Total pairs in df2: 1711
Pairs matching in same order: 853
Pairs matching in reversed order: 632
Pairs unique to df1: 0
Pairs unique to df2: 226
First few reversed pairs:
('NA20905.1', 'HG02559.2')
('NA21102.2', 'HG01993.2')
('HG01258.2', 'HG00133.1')
('HG01252.1', 'HG01192.1')
('HG01123.2', 'HG01106.1')


In [4]:
def export_sv_bed(df, output_path):
    """
    Create a BED file from an SV dataframe.
    - For insertions ("I"), use sample2, start2, end2.
    - For deletions ("D"), use sample1, start1, end1.
    - Sample name is 'sample1_sample2' (sorted alphabetically).
    Output columns: [sample_pair, start, end, type, diff]
    """

    df = df.copy()

    # Create a consistent sample pair name
    df["sample_pair"] = df.apply(
        lambda row: "_".join(sorted([row["sample1"], row["sample2"]])), axis=1
    )

    # Separate insertions and deletions
    ins = df[df["type"] == "I"].copy()
    dels = df[df["type"] == "D"].copy()

    # For insertions: take sample2 coordinates
    ins_bed = ins.rename(columns={
        "start2": "start", "end2": "end"
    })[["sample_pair", "start", "end", "type", "diff"]]

    # For deletions: take sample1 coordinates
    del_bed = dels.rename(columns={
        "start1": "start", "end1": "end"
    })[["sample_pair", "start", "end", "type", "diff"]]

    del_bed_df = del_bed.sort_values(["sample_pair", "start", "end"])
    ins_bed_df = ins_bed.sort_values(["sample_pair", "start", "end"])

    # Save as tab-separated BED
    del_bed_df.to_csv(output_path+"_del.bed", sep="\t", header=False, index=False)
    ins_bed_df.to_csv(output_path+"_ins.bed", sep="\t", header=False, index=False)

    print(f"BED file written to: {del_bed_df}")
    print(del_bed_df.head())  # preview a few rows
    print(f"BED file written to: {ins_bed_df}")
    print(ins_bed_df.head())  # preview a few rows

    return 

export_sv_bed(centrolign_svs,"/private/groups/patenlab/mira/centrolign/analysis/SVs_pairwise/chr12/cenHap4_benchmarking_HorHaps/centrolign_SVs")
export_sv_bed(horhap_svs,"/private/groups/patenlab/mira/centrolign/analysis/SVs_pairwise/chr12/cenHap4_benchmarking_HorHaps/horhap_SVs")

BED file written to:                sample_pair     start       end type      diff
8163   HG00099.1_HG00126.2  34651801  34652930    D  0.546222
8165   HG00099.1_HG00126.2  34683917  34688668    D  0.176603
8167   HG00099.1_HG00126.2  34696343  34698379    D  0.667485
8169   HG00099.1_HG00126.2  34699194  34702408    D  0.845675
8171   HG00099.1_HG00126.2  34702992  34703066    D  0.991006
...                    ...       ...       ...  ...       ...
48529  NA21106.1_NA21106.2  37001679  37011303    D  0.458749
48531  NA21106.1_NA21106.2  37012464  37013824    D -1.000000
48532  NA21106.1_NA21106.2  37016477  37020551    D -1.000000
48533  NA21106.1_NA21106.2  37022724  37036985    D  0.416800
48537  NA21106.1_NA21106.2  37050342  37051698    D -1.000000

[237516 rows x 5 columns]
              sample_pair     start       end type      diff
8163  HG00099.1_HG00126.2  34651801  34652930    D  0.546222
8165  HG00099.1_HG00126.2  34683917  34688668    D  0.176603
8167  HG00099.1_HG00126.2