# CMRG HG002: overlap of small variants by ClinVar CLNSIG category with homopolymer regions

This notebook generates Table 1 and 2 of the manuscript, with the ClinVar SNVs and Indels that overlap with homopolymer regions.
The input file `clinvar_overlap_by_CLNSIG_and_VAR_TYPE.csv` is generated with the R script `analyses/clinvar_overlap/find_overlap.R`.

In [None]:
import os
import pandas as pd

# adjust the path to the input file accordingly
clinvar_res = "clinvar_overlap_by_CLNSIG_and_VAR_TYPE.csv"

df = pd.read_csv(clinvar_res, sep=',')

In [26]:
df.head()

Unnamed: 0,BED_file,VAR_TYPE,CLNSIG,Overlap_variants,Non_overlap_variants,Total_variants,Overlap_proportion
0,GRCh38_SimpleRepeat_homopolymer_4to6_slop5,SNP,Affects,21,85,106,0.1981
1,GRCh38_SimpleRepeat_homopolymer_4to6_slop5,SNP,association,74,222,296,0.25
2,GRCh38_SimpleRepeat_homopolymer_4to6_slop5,SNP,association_not_found,0,4,4,0.0
3,GRCh38_SimpleRepeat_homopolymer_4to6_slop5,SNP,association|drug_response|risk_factor,0,1,1,0.0
4,GRCh38_SimpleRepeat_homopolymer_4to6_slop5,SNP,association|risk_factor,0,1,1,0.0


In [27]:
subset_dict = {
    'GRCh38_SimpleRepeat_homopolymer_4to6_slop5': 'hpol_4to6',
    'GRCh38_SimpleRepeat_homopolymer_7to11_slop5': 'hpol_7to11',
    'GRCh38_SimpleRepeat_homopolymer_ge12_slop5': 'hpol_ge12',
    'GRCh38_SimpleRepeat_homopolymer_ge21_slop5': 'hpol_ge21'
}

df['BED_file'] = df['BED_file'].replace(subset_dict)
df.head()

Unnamed: 0,BED_file,VAR_TYPE,CLNSIG,Overlap_variants,Non_overlap_variants,Total_variants,Overlap_proportion
0,hpol_4to6,SNP,Affects,21,85,106,0.1981
1,hpol_4to6,SNP,association,74,222,296,0.25
2,hpol_4to6,SNP,association_not_found,0,4,4,0.0
3,hpol_4to6,SNP,association|drug_response|risk_factor,0,1,1,0.0
4,hpol_4to6,SNP,association|risk_factor,0,1,1,0.0


In [28]:
df['CLNSIG'].unique()

array(['Affects', 'association', 'association_not_found',
       'association|drug_response|risk_factor', 'association|risk_factor',
       'Benign', 'Benign/Likely_benign',
       'Benign/Likely_benign|association',
       'Benign/Likely_benign|drug_response',
       'Benign/Likely_benign|drug_response|other',
       'Benign/Likely_benign|other',
       'Benign/Likely_benign|other|risk_factor',
       'Benign/Likely_benign|risk_factor', 'Benign|Affects',
       'Benign|Affects|association|other', 'Benign|association',
       'Benign|confers_sensitivity', 'Benign|drug_response',
       'Benign|other', 'Benign|protective', 'Benign|risk_factor',
       'confers_sensitivity', 'confers_sensitivity|other',
       'Conflicting_classifications_of_pathogenicity',
       'Conflicting_classifications_of_pathogenicity|Affects',
       'Conflicting_classifications_of_pathogenicity|association',
       'Conflicting_classifications_of_pathogenicity|association|risk_factor',
       'Conflicting_class

In [29]:
clnsig_subset = [
    'Benign',
    'Likely_benign',
    'Uncertain_significance',
    'Likely_pathogenic',
    'Pathogenic'
]

df = df[df['CLNSIG'].isin(clnsig_subset)]
df['CLNSIG'] = pd.Categorical(df['CLNSIG'], categories=clnsig_subset, ordered=True)


In [30]:
df[df['Non_overlap_variants']<0]

Unnamed: 0,BED_file,VAR_TYPE,CLNSIG,Overlap_variants,Non_overlap_variants,Total_variants,Overlap_proportion


In [31]:
df = df[df['VAR_TYPE'].isin(['SNP','INDEL'])]
df

Unnamed: 0,BED_file,VAR_TYPE,CLNSIG,Overlap_variants,Non_overlap_variants,Total_variants,Overlap_proportion
5,hpol_4to6,SNP,Benign,48677,131569,180246,0.2701
35,hpol_4to6,SNP,Likely_benign,261084,710021,971105,0.2689
42,hpol_4to6,SNP,Likely_pathogenic,15228,55628,70856,0.2149
59,hpol_4to6,SNP,Pathogenic,16696,65333,82029,0.2035
83,hpol_4to6,SNP,Uncertain_significance,458592,1600725,2059317,0.2227
92,hpol_4to6,INDEL,Benign,10957,17498,28455,0.3851
107,hpol_4to6,INDEL,Likely_benign,14298,22876,37174,0.3846
110,hpol_4to6,INDEL,Likely_pathogenic,10959,24325,35284,0.3106
123,hpol_4to6,INDEL,Pathogenic,28537,63149,91686,0.3112
138,hpol_4to6,INDEL,Uncertain_significance,16926,41668,58594,0.2889


In [32]:
df_snp = df[df['VAR_TYPE']=='SNP'][['BED_file','CLNSIG','Overlap_variants','Total_variants','Overlap_proportion']]

In [33]:
def prepare_df(df, vartype="SNP"):
    
    subset_dict = {
        'hpol_4to6': '4-6',
        'hpol_7to11': '7-11',
        'hpol_ge12': '>11',
        'hpol_ge21': '>20'
    }

    df_res = df[df['VAR_TYPE']==vartype][['BED_file','CLNSIG','Overlap_variants','Total_variants','Overlap_proportion']].copy()

    df_res['BED_file'] = df_res['BED_file'].replace(subset_dict)

    df_res = df_res.rename(columns={"BED_file": "HPol (bp)", "Overlap_variants": "Overlap",
                                    "Total_variants": "Total", "Overlap_proportion": "%"})
    
    return df_res

In [None]:
import pandas as pd
from io import StringIO
import re

def latex_escape(s):
    """Escape LaTeX special characters (_ % & $ # ^ { } ~ \\)"""
    if pd.isna(s):
        return ""
    s = str(s)
    # Wrap strings containing '>' in math mode
    if '>' in s:
        # Escape underscores inside math mode
        s = re.sub(r'_', r'\_', s)
        return f"${s}$"
    # Otherwise escape normally
    return re.sub(r'([_%&#$^{}~\\])', r'\\\1', s)

def df_to_latex_multirow(
    df,
    group_cols,
    column_format=None,
    caption=None,
    label=None,
    center=True,
):
    ncols = len(df.columns)
    if column_format is None:
        column_format = "".join(["c"] * ncols)

    buf = StringIO()
    if center:
        buf.write("\\begin{table}[ht]\n\\centering\n")
    else:
        buf.write("\\begin{table}[ht]\n")

    buf.write("\\begin{tabular}{" + column_format + "}\n")
    buf.write("\\hline\n")
    
    headers = [f"\\textbf{{{latex_escape(col)}}}" for col in df.columns]
    buf.write(" & ".join(headers) + " \\\\\n")
    buf.write("\\hline\n")

    grouped = df.groupby(group_cols, sort=False, dropna=False)
    first_block = True

    for group_keys, subdf in grouped:
        if not isinstance(group_keys, tuple):
            group_keys = (group_keys,)

        if not first_block:
            buf.write("\\hline\n")
        first_block = False

        row_idx = 0
        for _, row in subdf.iterrows():
            row_cells = []
            for col in df.columns:
                if col in group_cols:

                    if row_idx == 0 or subdf.iloc[row_idx - 1][col] != row[col]:
                        
                        same_count = (subdf[col] == row[col]).sum()
                        if same_count > 1:
                            row_cells.append(
                                f"\\multirow{{{same_count}}}{{*}}{{{latex_escape(row[col])}}}"
                            )
                        else:
                            row_cells.append(latex_escape(row[col]))
                    else:
                        row_cells.append("")
                else:
                    row_cells.append(latex_escape(row[col]))
            buf.write(" & ".join(row_cells) + " \\\\\n")
            row_idx += 1

    buf.write("\\hline\n\\end{tabular}\n")
    if caption:
        buf.write(f"\\caption{{{latex_escape(caption)}}}\n")
    if label:
        buf.write(f"\\label{{{latex_escape(label)}}}\n")
    buf.write("\\end{table}\n")
    return buf.getvalue()


In [35]:
df_snp = prepare_df(df, vartype="SNP")

latex_code = df_to_latex_multirow(
    df_snp,
    group_cols=["HPol (bp)"],
    caption="Overlap of ClinVar SNVs per CLNSIG type with homopolymer regions.",
    label="tab1"
)

print(latex_code)

\begin{table}[ht]
\centering
\begin{tabular}{ccccc}
\hline
\textbf{HPol (bp)} & \textbf{CLNSIG} & \textbf{Overlap} & \textbf{Total} & \textbf{\%} \\
\hline
\multirow{5}{*}{4-6} & Benign & 48677 & 180246 & 0.2701 \\
 & Likely\_benign & 261084 & 971105 & 0.2689 \\
 & Likely\_pathogenic & 15228 & 70856 & 0.2149 \\
 & Pathogenic & 16696 & 82029 & 0.2035 \\
 & Uncertain\_significance & 458592 & 2059317 & 0.2227 \\
\hline
\multirow{5}{*}{7-11} & Benign & 3116 & 180246 & 0.0173 \\
 & Likely\_benign & 12259 & 971105 & 0.0126 \\
 & Likely\_pathogenic & 435 & 70856 & 0.0061 \\
 & Pathogenic & 377 & 82029 & 0.0046 \\
 & Uncertain\_significance & 10981 & 2059317 & 0.0053 \\
\hline
\multirow{5}{*}{$>11$} & Benign & 1245 & 180246 & 0.0069 \\
 & Likely\_benign & 2201 & 971105 & 0.0023 \\
 & Likely\_pathogenic & 86 & 70856 & 0.0012 \\
 & Pathogenic & 69 & 82029 & 0.0008 \\
 & Uncertain\_significance & 758 & 2059317 & 0.0004 \\
\hline
\multirow{5}{*}{$>20$} & Benign & 291 & 180246 & 0.0016 \\
 & Likely

In [36]:
df_snp = prepare_df(df, vartype="INDEL")

latex_code = df_to_latex_multirow(
    df_snp,
    group_cols=["HPol (bp)"],
    caption="Overlap of ClinVar Indels per CLNSIG type with homopolymer regions.",
    label="tab2"
)

print(latex_code)

\begin{table}[ht]
\centering
\begin{tabular}{ccccc}
\hline
\textbf{HPol (bp)} & \textbf{CLNSIG} & \textbf{Overlap} & \textbf{Total} & \textbf{\%} \\
\hline
\multirow{5}{*}{4-6} & Benign & 10957 & 28455 & 0.3851 \\
 & Likely\_benign & 14298 & 37174 & 0.3846 \\
 & Likely\_pathogenic & 10959 & 35284 & 0.3106 \\
 & Pathogenic & 28537 & 91686 & 0.3112 \\
 & Uncertain\_significance & 16926 & 58594 & 0.2889 \\
\hline
\multirow{5}{*}{7-11} & Benign & 5410 & 28455 & 0.1901 \\
 & Likely\_benign & 3404 & 37174 & 0.0916 \\
 & Likely\_pathogenic & 391 & 35284 & 0.0111 \\
 & Pathogenic & 1443 & 91686 & 0.0157 \\
 & Uncertain\_significance & 1481 & 58594 & 0.0253 \\
\hline
\multirow{5}{*}{$>11$} & Benign & 6049 & 28455 & 0.2126 \\
 & Likely\_benign & 3469 & 37174 & 0.0933 \\
 & Likely\_pathogenic & 37 & 35284 & 0.001 \\
 & Pathogenic & 35 & 91686 & 0.0004 \\
 & Uncertain\_significance & 1184 & 58594 & 0.0202 \\
\hline
\multirow{5}{*}{$>20$} & Benign & 1262 & 28455 & 0.0444 \\
 & Likely\_benign & 911 