In [1]:
import os 
import pandas as pd
import numpy as np
import sys 
from biomart import BiomartServer
from io import StringIO
import requests

def fetch_ensembl_data(gene_list, batch_size=100, timeout=30):
    """Look up gene annotations through the Ensembl REST ``lookup/id`` endpoint.

    The IDs are sent in POST requests of at most ``batch_size`` IDs each.
    A batch that fails (network error or non-200 status) is reported with
    ``print`` and skipped, so the result may cover only part of ``gene_list``.
    IDs for which the service returns no data are omitted as well.

    Parameters
    ----------
    gene_list : list
        Identifiers to look up (sliced into batches, so it must support
        ``len`` and slicing).
    batch_size : int, default 100
        Number of IDs sent per request.
    timeout : float, default 30
        Seconds to wait for each HTTP request before giving up on the batch.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene``, ``symbol``, ``chromosome``, ``start``, ``end`` and
        ``location`` (``"<chromosome>:<start>-<end>"``). The columns are
        present even when no gene could be resolved, so callers can always
        select ``["gene", "symbol"]``.
    """
    url = "https://rest.ensembl.org/lookup/id"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}
    columns = ["gene", "symbol", "chromosome", "start", "end"]

    all_results = []

    # Split the list into batches
    for i in range(0, len(gene_list), batch_size):
        batch = gene_list[i:i + batch_size]

        try:
            response = requests.post(
                url, headers=headers, json={"ids": batch}, timeout=timeout
            )
        except requests.RequestException as exc:
            # Same best-effort policy as for a bad status code: report and move on.
            print(f"Error: request failed - {exc}")
            continue

        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            continue  # Skip this batch

        gene_data = response.json()

        for gene_id, details in gene_data.items():
            if details:  # Ensure data exists
                all_results.append({
                    "gene": gene_id,
                    "symbol": details.get("display_name", ""),
                    "chromosome": details.get("seq_region_name", ""),
                    "start": details.get("start", ""),
                    "end": details.get("end", ""),
                })

    # Passing `columns` keeps the schema stable when `all_results` is empty;
    # without it the "location" line below raises KeyError on an empty frame.
    df_ensembl = pd.DataFrame(all_results, columns=columns)
    df_ensembl["location"] = (
        df_ensembl["chromosome"].astype(str)
        + ":" + df_ensembl["start"].astype(str)
        + "-" + df_ensembl["end"].astype(str)
    )
    return df_ensembl


In [60]:
import pandas as pd
import glob
import os

# Folder path
folder = "C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent"

# Annotated copies are written to this sub-folder; create it up front so
# `to_csv` cannot fail on a missing directory.
annotated_dir = f"{folder}/annotated"
os.makedirs(annotated_dir, exist_ok=True)

# Pattern to match your files
pattern = os.path.join(folder, "df_top10*.csv")

# glob returns matches in arbitrary order; sort for a deterministic run
files = sorted(glob.glob(pattern))

# Raw headers carry a variant-type suffix (e.g. "p-value_lof"), so the
# statistic columns are matched by prefix; symbol columns are matched exactly.
header_prefixes = (
    ("p-value", "P value"),
    ("q-value", "Q value"),
    ("expected", "Expected"),
    ("both", "Observed"),
)
header_exact = {"symbol_1": "Symbol 1", "symbol_2": "Symbol 2"}

# Column order of the saved table
col_order = [
    "Symbol 1", "Symbol 2",
    "Obs gene 1", "Obs gene 2",
    "Observed", "Expected",
    "P value", "Q value",
]

# Loop over the files
for file in files:
    df = pd.read_csv(file)

    # Look up gene symbols for both sides of each pair
    gene_list = df.gene_1.unique().tolist()
    ensembl_df_1 = fetch_ensembl_data(gene_list)

    gene_list = df.gene_2.unique().tolist()
    ensembl_df_2 = fetch_ensembl_data(gene_list)

    # Merge with annotations. The symbol column is named explicitly per side
    # instead of relying on a suffix collision between the two merges.
    df = df.merge(
        ensembl_df_1[["gene", "symbol"]].rename(columns={"symbol": "symbol_1"}),
        left_on="gene_1", right_on="gene", how="left",
    ).drop("gene", axis=1)

    df = df.merge(
        ensembl_df_2[["gene", "symbol"]].rename(columns={"symbol": "symbol_2"}),
        left_on="gene_2", right_on="gene", how="left",
    ).drop("gene", axis=1)

    # Rename headers to their display labels; unmatched columns keep their name
    df = df.rename(columns={
        col: next(
            (label for prefix, label in header_prefixes if col.startswith(prefix)),
            header_exact.get(col, col),
        )
        for col in df.columns
    })

    print(df.columns)

    # sum all *_1 columns except 'gene_1'
    # NOTE(review): this also adds the *_singleton_1 columns; if singletons are
    # already included in the per-type totals this double-counts — confirm.
    cols_1 = [c for c in df.columns if c.endswith("_1") and c != "gene_1"]
    df["Obs gene 1"] = df[cols_1].sum(axis=1)

    # sum all *_2 columns except 'gene_2'
    cols_2 = [c for c in df.columns if c.endswith("_2") and c != "gene_2"]
    df["Obs gene 2"] = df[cols_2].sum(axis=1)

    # Keep only the display columns, in the target order
    df = df[[c for c in col_order if c in df.columns]]
    print(df.columns)

    # Create new filename
    base = os.path.basename(file)                       # e.g. "df_top10_p_lof_1.csv"
    name, ext = os.path.splitext(base)                  # ("df_top10_p_lof_1", ".csv")
    new_name = f"{name}_annotated{ext}"
    new_path = f"{annotated_dir}/{new_name}"

    # Save new file
    print(new_path)
    df.to_csv(new_path, index=False)

    print(f"Processed {file} → saved as {new_path}")


Index(['gene_1', 'gene_2', 'lof_1', 'lof_singleton_1', 'lof_2',
       'lof_singleton_2', 'Expected', 'Observed', 'P value', 'Q value',
       'Symbol 1', 'Symbol 2'],
      dtype='object')
Index(['Symbol 1', 'Symbol 2', 'Obs gene 1', 'Obs gene 2', 'Observed',
       'Expected', 'P value', 'Q value'],
      dtype='object')
C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent/annotated/df_top10_p_lof_annotated.csv
Processed C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent\df_top10_p_lof.csv → saved as C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent/annotated/df_top10_p_lof_annotated.csv
Index(['gene_1', 'gene_2', 'missense_1', 'missense_singleton_1', 'lof_1',
       'lof_singleton_1', 'missense_2', 'missense_singleton_2', 'lof_2',
       'lof_singleton_2', 'Expected', 'Observed', 'P value', 'Q value',
       'Symbol 1', 'Symbol 2'],
      dtype='object')
Index(['Symbol 1', 'Symbol 2', 'Obs gene 1', 'Obs gene 2', 'Observed',
   

In [84]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import table
import numpy as np
# Annotated top-10 tables, in the order they are plotted
csv_paths = [
    "C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent/annotated/df_top10_p_lof_annotated.csv",
    "C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent/annotated/df_top10_p_missense_annotated.csv",
    "C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent/annotated/df_top10_p_lof_missense_annotated.csv",
]
# One figure title per entry of csv_paths (same order)
titles = [
    "Both genes are loss-of-function (LoF)",
    "Both genes are Missense",
    "One gene is loss-of-function (LoF) and the other is Missense",
]

dfs = []
for path in csv_paths:
    df = pd.read_csv(path)

    # Round 'Expected' to 2 decimals
    if 'Expected' in df.columns:
        df['Expected'] = df['Expected'].round(2)

    # Round P value / Q value to 5 decimals (before cleaning the headers);
    # note that values below 5e-6 are displayed as 0.0 after this step.
    for col in df.columns:
        if 'P value' in col or 'Q value' in col:
            df[col] = df[col].round(5)

    # Strip embedded line breaks from the column headers
    df.columns = df.columns.str.replace('\n', '', regex=False)

    dfs.append(df)
    


In [82]:
import matplotlib.pyplot as plt
from pandas.plotting import table

def plot_tables_pretty_separate(dfs_subset, titles_subset, save_dir):
    """Render each DataFrame as a styled table image and save it as a PNG.

    Frames and titles are paired positionally with ``zip``, so surplus
    entries of the longer sequence are ignored. Figure ``idx`` (1-based) is
    written to ``{save_dir}/table_{idx}.png``. Only the first figure carries
    the overall headline; every figure carries its own title.

    Parameters
    ----------
    dfs_subset : sequence of pandas.DataFrame
        Tables to render; all values are converted to strings for display.
    titles_subset : sequence of str
        Title for each table, in the same order as ``dfs_subset``.
    save_dir : str
        Output directory. It is created if it does not exist.

    Returns
    -------
    None
    """
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    for idx, (df, title) in enumerate(zip(dfs_subset, titles_subset), start=1):

        # Just slightly bigger than before
        fig, ax = plt.subplots(1, 1, figsize=(20, 6))
        ax.axis('off')

        df_str = df.astype(str)
        n_cols = len(df_str.columns)
        col_widths = [1 / n_cols] * n_cols  # equal-width columns

        tbl = table(
            ax,
            df_str,
            loc='center',
            cellLoc='center',
            colWidths=col_widths,
            bbox=[0, 0.1, 1, 0.8]   # your original
        )

        tbl.auto_set_font_size(False)
        tbl.set_fontsize(11)       # just one step bigger
        tbl.scale(1.05, 1.1)       # small size increase

        for (i, j), cell in tbl.get_celld().items():
            cell.set_linewidth(0.5)
            cell.PAD = 0.22        # tiny increase in padding

            if i == 0:
                # Row 0 is the header row: grey background, bold text
                cell.set_facecolor('#d3d3d3')
                cell.set_text_props(weight='bold', fontsize=12)
            else:
                cell.set_facecolor('white')

        if idx == 1:
            fig.suptitle(
                "Gene pairs with the strongest depletion signal for mutations",
                fontweight='bold',
                fontsize=30,
                y=1.03
            )

        ax.set_title(title, fontsize=25, pad=10)

        fig.tight_layout(rect=[0, 0, 1, 0.95])
        # Use the explicit figure handle rather than pyplot's "current figure"
        fig.savefig(f"{save_dir}/table_{idx}.png", dpi=300, bbox_inches='tight')
        plt.close(fig)


In [85]:
# Suppose your list of DataFrames is called dfs.
# Show the observed counts as whole numbers. The column is guarded (like
# 'Expected' in the loading cell) so a table without it does not raise KeyError.
for df in dfs:
    if 'Observed' in df.columns:
        df['Observed'] = df['Observed'].astype(int)

# Render one PNG per table into the results folder
plot_tables_pretty_separate(
    dfs,
    titles,
    save_dir="C:/Users/maliz/thesa/UKbiobank/data/result_singleton"
)


In [57]:
# Inspect the first loaded table (first entry of csv_paths).
# NOTE(review): the saved output has no 'Observed' / 'Obs gene 2' columns although
# the annotation cell writes them — presumably output from an earlier run; re-run to confirm.
dfs[0]

Unnamed: 0,Symbol 1,Symbol 2,Obs gene 1,Expected,P value,Q value
0,DNAH14,AKAP9,35578,13.92,0.00192,0.27909
1,DNAH14,BLTP1,35578,10.08,0.0026,0.27909
2,SYNE1,CEL,1401,11.64,0.00303,0.27909
3,SPTBN5,TTN,6300,16.09,0.00377,0.27909
4,DNAH14,PKHD1L1,35578,11.27,0.00401,0.27909
5,TTN,TTC39B,16620,10.72,0.00605,0.35068
6,DNAH14,SYNE1,35578,18.66,0.01071,0.50476
7,ANKRD30A,TTN,11959,19.87,0.0116,0.50476
8,TTN,MUC20,16620,13.93,0.01484,0.52563
9,TTN,TG,16620,11.0,0.0151,0.52563


In [53]:
# Load one of the raw (not yet annotated) top-10 tables to inspect its original columns.
# Note: this rebinds the notebook-wide name `df`.
df = pd.read_csv("C:/Users/maliz/thesa/UKbiobank/data/result_singleton/filter_1_percent/df_top10_p_lof_missense.csv")
df

Unnamed: 0,gene_1,gene_2,missense_1,missense_singleton_1,lof_1,lof_singleton_1,missense_2,missense_singleton_2,lof_2,lof_singleton_2,expected_both_lof_missense_combined,both_lof_missense_combined,p-value_lof_missense_combined,q-value_lof_missense_combined
0,ENSG00000118997,ENSG00000198502,24645,323,12844,133,8081,2,27426,1,23.752894,10.0,0.00126,0.849434
1,ENSG00000184363,ENSG00000181143,12109,80,431,34,11205,110,5866,300,10.778481,2.0,0.001456,0.849434
2,ENSG00000155657,ENSG00000164989,6662,115,16034,586,9380,40,972,39,15.487513,5.0,0.001987,0.849434
3,ENSG00000228198,ENSG00000115760,306,7,14813,8,8371,429,121,23,15.29716,5.0,0.002271,0.849434
4,ENSG00000181333,ENSG00000155657,6021,76,863,26,6662,115,16034,586,11.941862,3.0,0.002397,0.849434
5,ENSG00000123384,ENSG00000145113,9164,318,24,15,0,0,11860,48,10.019508,2.0,0.002725,0.849434
6,ENSG00000137834,ENSG00000118997,1041,124,618,118,24645,323,12844,133,11.512389,3.0,0.003333,0.890275
7,ENSG00000118997,ENSG00000197410,24645,323,12844,133,7118,88,8156,78,15.782591,6.0,0.004617,0.999893
8,ENSG00000162909,ENSG00000155657,6730,70,211,31,6662,115,16034,586,12.602319,4.0,0.004971,0.999893
9,ENSG00000138286,ENSG00000178209,164,1,15782,24,15027,254,383,89,10.430873,3.0,0.007524,0.999893


In [36]:
# Look up gene symbols for both sides of each pair
gene_list = df.gene_1.unique().tolist()
ensembl_df_1 = fetch_ensembl_data(gene_list)

gene_list = df.gene_2.unique().tolist()
ensembl_df_2 = fetch_ensembl_data(gene_list)

# Remove symbol columns left by a previous execution of this cell, so running
# it again does not produce clashing column labels.
df = df.drop(columns=["symbol_1", "symbol_2"], errors="ignore")

# Merge with annotations. Each symbol column is named explicitly for its side
# instead of relying on a suffix collision between the two merges.
df = df.merge(
    ensembl_df_1[["gene", "symbol"]].rename(columns={"symbol": "symbol_1"}),
    left_on="gene_1", right_on="gene", how="left",
).drop("gene", axis=1)

df = df.merge(
    ensembl_df_2[["gene", "symbol"]].rename(columns={"symbol": "symbol_2"}),
    left_on="gene_2", right_on="gene", how="left",
).drop("gene", axis=1)



In [33]:
# Translate raw result headers into display labels; the result goes to `new`
# and `df` itself is left unchanged.
# Statistic columns carry a variant-type suffix, so they are matched by prefix.
_prefix_labels = (
    ("p-value", "P value"),
    ("q-value", "Q value"),
    ("expected", "Expected"),
    ("both", "Observed"),
)

# All remaining known headers are matched by their exact name.
# NOTE(review): a frame holding both lof_* and missense_* count columns (as the
# lof_missense table does) ends up with several columns sharing one label,
# e.g. two "Obs gene 1" — confirm this is intended before selecting by label.
_exact_labels = {
    "symbol_1": "Symbol 1",
    "symbol_2": "Symbol 2",
    "lof_1": "Obs gene 1",
    "missense_1": "Obs gene 1",
    "lof_missense_1": "Obs gene 1",
    "missense_lof_1": "Obs gene 1",
    "lof_2": "Obs gene 2",
    "missense_2": "Obs gene 2",
    "lof_missense_2": "Obs gene 2",
    "missense_lof_2": "Obs gene 2",
    "lof_singleton_1": "Obs singleton gene 1",
    "missense_singleton_1": "Obs singleton gene 1",
    "lof_missense_singleton_1": "Obs singleton gene 1",
    "missense_lof_singleton_1": "Obs singleton gene 1",
    "lof_singleton_2": "Obs singleton gene 2",
    "missense_singleton_2": "Obs singleton gene 2",
    "lof_missense_singleton_2": "Obs singleton gene 2",
    "missense_lof_singleton_2": "Obs singleton gene 2",
}


def _display_label(col):
    """Return the display label for one raw header (unknown headers pass through)."""
    for prefix, label in _prefix_labels:
        if col.startswith(prefix):
            return label
    return _exact_labels.get(col, col)


new = df.rename(columns={col: _display_label(col) for col in df.columns})


In [34]:
# Display the working frame `df`.
# NOTE(review): the saved output shows renamed headers ("Obs gene 1", "P value", ...),
# but the rename cell stores its result in `new`, not `df` — the output is presumably
# from an earlier version of the notebook; re-run to confirm.
df

Unnamed: 0,gene_1,gene_2,Obs gene 1,Obs singleton gene 1,Obs gene 1.1,Obs singleton gene 1.1,Obs gene 2,Obs singleton gene 2,Obs gene 2.1,Obs singleton gene 2.1,Expected,Observed,P value,Q value,Symbol 1,Symbol 2
0,ENSG00000118997,ENSG00000198502,24645,323,12844,133,8081,2,27426,1,23.752894,10.0,0.00126,0.849434,DNAH7,HLA-DRB5
1,ENSG00000184363,ENSG00000181143,12109,80,431,34,11205,110,5866,300,10.778481,2.0,0.001456,0.849434,PKP3,MUC16
2,ENSG00000155657,ENSG00000164989,6662,115,16034,586,9380,40,972,39,15.487513,5.0,0.001987,0.849434,TTN,CCDC171
3,ENSG00000228198,ENSG00000115760,306,7,14813,8,8371,429,121,23,15.29716,5.0,0.002271,0.849434,OR2M3,BIRC6
4,ENSG00000181333,ENSG00000155657,6021,76,863,26,6662,115,16034,586,11.941862,3.0,0.002397,0.849434,HEPHL1,TTN
5,ENSG00000123384,ENSG00000145113,9164,318,24,15,0,0,11860,48,10.019508,2.0,0.002725,0.849434,LRP1,MUC4
6,ENSG00000137834,ENSG00000118997,1041,124,618,118,24645,323,12844,133,11.512389,3.0,0.003333,0.890275,SMAD6,DNAH7
7,ENSG00000118997,ENSG00000197410,24645,323,12844,133,7118,88,8156,78,15.782591,6.0,0.004617,0.999893,DNAH7,DCHS2
8,ENSG00000162909,ENSG00000155657,6730,70,211,31,6662,115,16034,586,12.602319,4.0,0.004971,0.999893,CAPN2,TTN
9,ENSG00000138286,ENSG00000178209,164,1,15782,24,15027,254,383,89,10.430873,3.0,0.007524,0.999893,FAM149B1,PLEC
