In [44]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import combine_pvalues, spearmanr
from statsmodels.stats.multitest import multipletests

from genomic_utils.variant import Variant

In [40]:
INPUT_PATH = "all_sequences_variant_effect_data.csv"
OUTPUT_PATH = f"GM12878_effect_sizes.csv"

In [50]:
input_df = pd.read_csv(INPUT_PATH)


def create_variant_from_id(id_):
    chrom, pos, ref, alt, *_ = id_.split(":")
    return Variant(chrom, int(pos), ref, alt)


input_df["hg19_variant"] = input_df["ref_ID"].apply(create_variant_from_id)
assert input_df["hg19_variant"].equals(input_df["alt_ID"].apply(create_variant_from_id))
input_df = input_df[
    [
        "hg19_variant",
        "GM12878_Skew_logP",
        "GM12878_Skew_logFDR",
        "GM12878_normalized_variant_effect",
    ]
].dropna()
input_df["GM12878_Skew_P"] = 10 ** -input_df["GM12878_Skew_logP"]

# Count number of duplicated variants
n_duplicated = input_df["hg19_variant"].duplicated().sum()
print(f"{n_duplicated}/{input_df.shape[0]} variants are duplicated")

# For duplicated variants, average effect size and use Fisher's method to combine p-values
unique_df = input_df[~input_df["hg19_variant"].duplicated(keep=False)]
duplicated_df = input_df[input_df["hg19_variant"].duplicated(keep=False)]
combined_pvalues = duplicated_df.groupby("hg19_variant")["GM12878_Skew_P"].apply(
    lambda x: combine_pvalues(x, method="fisher")[1]
)
duplicated_averaged_df = duplicated_df.groupby("hg19_variant").mean()
duplicated_averaged_df["GM12878_Skew_P"] = combined_pvalues
combined_df = pd.concat([unique_df, duplicated_averaged_df.reset_index()])
assert combined_df["hg19_variant"].duplicated().sum() == 0
print(f"After deduplication, we have {combined_df.shape[0]} unique variants")

# Use Benjamini-Hochberg to control FDR
_, combined_df["GM12878_Skew_FDR"], _, _ = multipletests(
    combined_df["GM12878_Skew_P"], method="fdr_bh"
)

95/155283 variants are duplicated
After deduplication, we have 155188 unique variants


In [52]:
combined_df

Unnamed: 0,hg19_variant,GM12878_Skew_logP,GM12878_Skew_logFDR,GM12878_normalized_variant_effect,GM12878_Skew_P,GM12878_Skew_FDR
0,chr6:105479558 A/G,0.159264,0.023302,-0.130583,0.693004,0.911388
3,chr13:67311727 T/C,0.228532,0.033767,0.333244,0.590837,0.878953
9,chr14:105588928 A/G,0.067623,0.012273,-0.094555,0.855810,0.960128
11,chr17:1371473 C/G,0.267034,0.055534,1.029765,0.540712,0.861164
12,chr10:80953393 G/C,2.434001,0.810205,0.875200,0.003681,0.138524
...,...,...,...,...,...,...
43,chr7:99635369 C/A,2.562993,1.578398,-0.515813,0.000004,0.000797
44,chr8:66913397 CACTTT/C,0.905712,0.272148,-0.314708,0.051458,0.460852
45,chr8:71570989 T/C,1.667079,0.789633,0.298857,0.000786,0.054502
46,chr9:123634040 G/A,0.579107,0.172430,-0.163881,0.238055,0.721095


In [53]:
# Compare old and new log FDR
x = combined_df["GM12878_Skew_logFDR"].values
y = -np.log10(combined_df["GM12878_Skew_FDR"].values)
rho, _ = spearmanr(x, y)
print(f"Spearman correlation between old and new log FDR: {rho:.3f}")

Spearman correlation between old and new log FDR: 0.997


  y = -np.log10(combined_df["GM12878_Skew_FDR"].values)


In [54]:
combined_df = combined_df.drop(columns=["GM12878_Skew_logFDR"])
combined_df.to_csv(OUTPUT_PATH, index=False)