In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import gdreg
import time

%load_ext autoreload
%autoreload 2

### Main annot

In [2]:
DATA_PATH = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot"
PGEN_FILE = "/n/scratch3/users/j/jz286/imp_geno/ukb_imp_chr@_v3"
DIC_ANNOT_FILE = {
    "bed" : DATA_PATH + "/bed/ukb_imp_chr@_v3.bed.annot.gz",
    "diversity" : DATA_PATH + "/nucleotide_diversity/diversity.10kb.@.annot",
    "recomb_rate" : DATA_PATH + "/recomb_rate/ukb_imp_chr@_v3.oxford.10kb.recrate.annot",
    'LLD_AFR' : DATA_PATH + "/LLD_AFR/LLD_AFR.@.annot.gz",
    "CADD" : DATA_PATH + "/CADD/ukb_imp_chr@_v3.CADD",
    "snfeff" : DATA_PATH + "/snfeff/ukb_imp_chr@_v3.annot.gz",
}

DIC_NAME_UPDATE = {
    "AN:Vahedi_Tcell_SE_500bp" : "AN:Vahedi_Tcell_SE.extend.500",
    "AN:Vahedi_Tcell_TE_500bp" : "AN:Vahedi_Tcell_TE.extend.500",
}

In [3]:
# Write main annot
for CHR in range(1,23):
# for CHR in [22]:
    print('CHR%d' % CHR)
    df_annot_chr = gdreg.util.read_pgen(PGEN_FILE.replace("@", "%d" % CHR))["pvar"]
    df_annot_chr = df_annot_chr[["CHR", "SNP", "BP"]].copy()
    df_annot_chr['MAF'] = gdreg.util.read_pgen(
        PGEN_FILE.replace("@", "%d" % CHR))["afreq"]["MAF"].astype(np.float32)
    df_annot_chr.index = df_annot_chr["SNP"]   
    print("PGEN n_snp=%d" % df_annot_chr.shape[0])
    
    # .bed 
    temp_df = gdreg.util.read_annot(DIC_ANNOT_FILE["bed"].replace("@", "%d" % CHR))
    temp_df.columns = [DIC_NAME_UPDATE[x] if x in DIC_NAME_UPDATE else x for x in temp_df]
    temp_df.index = temp_df["SNP"]
    # AN.flanking.500 = AN.extend.500 - AN
    for AN in temp_df:
        if "%s.extend.500" % AN in temp_df:
            temp_df["%s.flanking.500" % AN] = temp_df["%s.extend.500" % AN].values & (~temp_df[AN].values) 
            temp_df.drop("%s.extend.500" % AN, axis=1, inplace=True)
    AN_list = [x for x in temp_df if x.startswith("AN:")]
    df_annot_chr = df_annot_chr.join(temp_df[AN_list])
    print("bed n_snp=%d" % df_annot_chr.shape[0])
    
    # nucleotide_diversity
    temp_df = pd.read_csv(
        DIC_ANNOT_FILE["diversity"].replace("@", "%d" % CHR), header=None
    )
    df_annot_chr["AN:nucleotide_div"] = temp_df[0].values
    print("nucleotide_diversity n_snp=%d" % df_annot_chr.shape[0])
    
    # recomb_rate
    temp_df =  pd.read_csv(
        DIC_ANNOT_FILE["recomb_rate"].replace("@", "%d" % CHR), header=None
    )
    df_annot_chr["AN:recomb_rate"] = temp_df[0].values
    print("recomb_rate n_snp=%d" % df_annot_chr.shape[0])    
    
    # LLD_AFR
    temp_df =  pd.read_csv(
        DIC_ANNOT_FILE["LLD_AFR"].replace("@", "%d" % CHR), sep='\t',
    )
    temp_dic = {x:y for x,y in zip(temp_df['SNP'], temp_df['AN:MAF_Adj_LLD_AFR'])}
    df_annot_chr["AN:LLD_AFR"] = [temp_dic[x] if x in temp_dic else 1 for x in df_annot_chr['SNP']]
    print("LLD_AFR n_snp=%d" % df_annot_chr.shape[0])   
    
    # CADD  
    temp_df = pd.read_csv(
        DIC_ANNOT_FILE["CADD"].replace("@", "%d" % CHR), sep='\t',
    )
    temp_df.drop_duplicates(subset="SNP", inplace=True)
    temp_df.index = temp_df["SNP"]
    temp_df = temp_df[["CADD:CpG", "CADD:GerpRS", "CADD:GerpN", "CADD:PHRED"]]
    temp_df["CADD:GerpRS"] = temp_df["CADD:GerpRS"]>=4
    temp_df["CADD:PHRED"] = temp_df["CADD:PHRED"]>=20
    temp_df.columns = ["AN:CpG", "AN:GerpRS_g4", "AN:GerpN", "AN:CADD_g20"]
    df_annot_chr = df_annot_chr.join(temp_df)
    print("CADD n_snp=%d" % df_annot_chr.shape[0])    
    
    # snpeff
    temp_df = gdreg.util.read_annot(DIC_ANNOT_FILE["snfeff"].replace("@", "%d" % CHR))
    temp_df.index = temp_df["SNP"]
    drop_list = ["AN:snpeff_3_prime_UTR_variant", "AN:snpeff_5_prime_UTR_variant", 
                 "AN:snpeff_intergenic_region", "AN:snpeff_intron_variant"]
    temp_df.drop(drop_list, axis=1, inplace=True)
    AN_list = [x for x in temp_df if x.startswith("AN:")]
    df_annot_chr = df_annot_chr.join(temp_df[AN_list])
    print("snpeff n_snp=%d" % df_annot_chr.shape[0])    
    
    # Summary 
    print('df_annot_chr', df_annot_chr.shape)
    print('n_AN = %d' % len([x for x in df_annot_chr if x.startswith('AN:')]))
    
    print('writing')
    df_annot_chr.fillna(0, inplace=True)
    AN_list = [x for x in df_annot_chr if x.startswith("AN:")]
    for AN in AN_list:
        if len(set(df_annot_chr[AN].values[:10000])) <= 2:
            df_annot_chr[AN] = df_annot_chr[AN].astype(bool)
        else:
            df_annot_chr[AN] = df_annot_chr[AN].astype(np.float32)
    gdreg.util.write_annot(df_annot_chr, DATA_PATH+"/main_annot/ukb_imp_chr%d_v3.annot.gz" % CHR)

CHR1
PGEN n_snp=1161341
bed n_snp=1161341
nucleotide_diversity n_snp=1161341
recomb_rate n_snp=1161341
LLD_AFR n_snp=1161341


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1161341
snpeff n_snp=1161341
df_annot_chr (1161341, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR2
PGEN n_snp=1259312
bed n_snp=1259312
nucleotide_diversity n_snp=1259312
recomb_rate n_snp=1259312
LLD_AFR n_snp=1259312


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1259312
snpeff n_snp=1259312
df_annot_chr (1259312, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR3
PGEN n_snp=1059605
bed n_snp=1059605
nucleotide_diversity n_snp=1059605
recomb_rate n_snp=1059605
LLD_AFR n_snp=1059605


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1059605
snpeff n_snp=1059605
df_annot_chr (1059605, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR4
PGEN n_snp=1074447
bed n_snp=1074447
nucleotide_diversity n_snp=1074447
recomb_rate n_snp=1074447
LLD_AFR n_snp=1074447


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1074447
snpeff n_snp=1074447
df_annot_chr (1074447, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR5
PGEN n_snp=964806
bed n_snp=964806
nucleotide_diversity n_snp=964806
recomb_rate n_snp=964806
LLD_AFR n_snp=964806


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=964806
snpeff n_snp=964806
df_annot_chr (964806, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR6
PGEN n_snp=976090
bed n_snp=976090
nucleotide_diversity n_snp=976090
recomb_rate n_snp=976090
LLD_AFR n_snp=976090


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=976090
snpeff n_snp=976090
df_annot_chr (976090, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR7
PGEN n_snp=868301
bed n_snp=868301
nucleotide_diversity n_snp=868301
recomb_rate n_snp=868301
LLD_AFR n_snp=868301


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=868301
snpeff n_snp=868301
df_annot_chr (868301, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR8
PGEN n_snp=826076
bed n_snp=826076
nucleotide_diversity n_snp=826076
recomb_rate n_snp=826076
LLD_AFR n_snp=826076


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=826076
snpeff n_snp=826076
df_annot_chr (826076, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR9
PGEN n_snp=641608
bed n_snp=641608
nucleotide_diversity n_snp=641608
recomb_rate n_snp=641608
LLD_AFR n_snp=641608


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=641608
snpeff n_snp=641608
df_annot_chr (641608, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR10
PGEN n_snp=747773
bed n_snp=747773
nucleotide_diversity n_snp=747773
recomb_rate n_snp=747773
LLD_AFR n_snp=747773


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=747773
snpeff n_snp=747773
df_annot_chr (747773, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR11
PGEN n_snp=730385
bed n_snp=730385
nucleotide_diversity n_snp=730385
recomb_rate n_snp=730385
LLD_AFR n_snp=730385


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=730385
snpeff n_snp=730385
df_annot_chr (730385, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR12
PGEN n_snp=705493
bed n_snp=705493
nucleotide_diversity n_snp=705493
recomb_rate n_snp=705493
LLD_AFR n_snp=705493


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=705493
snpeff n_snp=705493
df_annot_chr (705493, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR13
PGEN n_snp=537961
bed n_snp=537961
nucleotide_diversity n_snp=537961
recomb_rate n_snp=537961
LLD_AFR n_snp=537961


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=537961
snpeff n_snp=537961
df_annot_chr (537961, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR14
PGEN n_snp=482519
bed n_snp=482519
nucleotide_diversity n_snp=482519
recomb_rate n_snp=482519
LLD_AFR n_snp=482519


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=482519
snpeff n_snp=482519
df_annot_chr (482519, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR15
PGEN n_snp=423991
bed n_snp=423991
nucleotide_diversity n_snp=423991
recomb_rate n_snp=423991
LLD_AFR n_snp=423991


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=423991
snpeff n_snp=423991
df_annot_chr (423991, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR16
PGEN n_snp=465661
bed n_snp=465661
nucleotide_diversity n_snp=465661
recomb_rate n_snp=465661
LLD_AFR n_snp=465661


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=465661
snpeff n_snp=465661
df_annot_chr (465661, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR17
PGEN n_snp=406065
bed n_snp=406065
nucleotide_diversity n_snp=406065
recomb_rate n_snp=406065
LLD_AFR n_snp=406065


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=406065
snpeff n_snp=406065
df_annot_chr (406065, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR18
PGEN n_snp=422346
bed n_snp=422346
nucleotide_diversity n_snp=422346
recomb_rate n_snp=422346
LLD_AFR n_snp=422346


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=422346
snpeff n_snp=422346
df_annot_chr (422346, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR19
PGEN n_snp=336680
bed n_snp=336680
nucleotide_diversity n_snp=336680
recomb_rate n_snp=336680
LLD_AFR n_snp=336680


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=336680
snpeff n_snp=336680
df_annot_chr (336680, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR20
PGEN n_snp=329864
bed n_snp=329864
nucleotide_diversity n_snp=329864
recomb_rate n_snp=329864
LLD_AFR n_snp=329864


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=329864
snpeff n_snp=329864
df_annot_chr (329864, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR21
PGEN n_snp=200644
bed n_snp=200644
nucleotide_diversity n_snp=200644
recomb_rate n_snp=200644
LLD_AFR n_snp=200644


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=200644
snpeff n_snp=200644
df_annot_chr (200644, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR22
PGEN n_snp=199680
bed n_snp=199680
nucleotide_diversity n_snp=199680
recomb_rate n_snp=199680
LLD_AFR n_snp=199680


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=199680
snpeff n_snp=199680
df_annot_chr (199680, 83)
n_AN = 79
writing
'CM' missing from df_annot.columns, add 'CM' column with 0


In [4]:
# Comparison to baselinev2.2
CHR = 22
ANNOT_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot/main_annot/ukb_imp_chr@_v3.annot.gz"
df_annot_chr = gdreg.util.read_annot(ANNOT_FILE.replace("@", "%d" % CHR))
df_annot_chr.index = df_annot_chr["SNP"]

ANNOT_FILE_REF = "/n/groups/price/ldsc/reference_files/1000G_EUR_Phase3/baselineLD_v2.2/baselineLD.@.annot.gz"
df_annot_chr_ref = pd.read_csv(ANNOT_FILE_REF.replace("@", "%d" % CHR), sep='\t')
df_annot_chr_ref.index = df_annot_chr_ref["SNP"]

dic_map = {
    "Transcr_Hoffman" : "AN:Transcribed_Hoffman",
    "Transcr_Hoffman.flanking.500" : "AN:Transcribed_Hoffman.flanking.500",
    "GERP.NS" : "AN:GerpN",
    "GERP.RSsup4" : "AN:GerpRS_g4",
    "MAF_Adj_Predicted_Allele_Age" : "AN:alleleage",
    "Nucleotide_Diversity_10kb" : "AN:nucleotide_div",
    "Recomb_Rate_10kb" : "AN:recomb_rate",
    "CpG_Content_50kb" : "AN:CpG",
    "synonymous" : "AN:snpeff_synonymous_variant",
    "non_synonymous" : "AN:snpeff_nonsynonymous_variant",
    "MAF_Adj_LLD_AFR" : "AN:LLD_AFR"
}

snp_list = list(set(df_annot_chr["SNP"]) & set(df_annot_chr_ref["SNP"]))
AN2_list = []

for AN in df_annot_chr_ref:
    if AN in ["CHR", "BP", "SNP", "CM", "base"]:
        continue
    if AN.startswith("MAFb"):
        continue
    if "AN:%s" % AN in df_annot_chr:
        AN1,AN2 = AN,"AN:%s" % AN
    elif AN in dic_map:
        AN1,AN2 = AN,dic_map[AN]
    else:
        print(AN)
        continue
    
    AN2_list.append(AN2)
    v1 = df_annot_chr_ref.loc[snp_list, AN1].values
    v2 = df_annot_chr.loc[snp_list, AN2].values
    print("%-60s Corr=%0.3g, AN1_only=%d, AN2_only=%d, overlap=%d" % (
        "%s (%s) " % (AN, AN2.replace("AN:", "") if AN!=AN2.replace("AN:", "") else ""), 
        np.corrcoef(v1, v2)[0,1], 
        ((v1==1) & (v2==0)).sum(), 
        ((v1==0) & (v2==1)).sum(), 
        ((v1==1) & (v2==1)).sum(), 
    ))
    
for col in df_annot_chr:
    if col.startswith("AN:") and (col not in AN2_list):
        print("(%s)" % col.replace("AN:", ""))

Coding_UCSC ()                                               Corr=1, AN1_only=0, AN2_only=0, overlap=3922
Coding_UCSC.flanking.500 ()                                  Corr=1, AN1_only=0, AN2_only=0, overlap=11731
Conserved_LindbladToh ()                                     Corr=1, AN1_only=0, AN2_only=0, overlap=3422
Conserved_LindbladToh.flanking.500 ()                        Corr=1, AN1_only=0, AN2_only=0, overlap=42839
CTCF_Hoffman ()                                              Corr=1, AN1_only=0, AN2_only=0, overlap=4891
CTCF_Hoffman.flanking.500 ()                                 Corr=1, AN1_only=0, AN2_only=0, overlap=9950
DGF_ENCODE ()                                                Corr=1, AN1_only=0, AN2_only=0, overlap=24734
DGF_ENCODE.flanking.500 ()                                   Corr=1, AN1_only=0, AN2_only=0, overlap=62600
DHS_peaks_Trynka ()                                          Corr=1, AN1_only=0, AN2_only=0, overlap=18061
DHS_Trynka ()                            

### MAF-Stratified annots

In [2]:
# Load main_annot 
df_annot = None
PGEN_FILE = "/n/scratch3/users/j/jz286/imp_geno/ukb_imp_chr@_v3"
ANNOT_FILE = \
    "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot/main_annot/ukb_imp_chr@_v3.annot.gz"
OUT_PATH = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot"

temp_list = []
for CHR in range(1,23):
# for CHR in [22]:
    temp_df = gdreg.util.read_annot(ANNOT_FILE.replace("@", "%d" % CHR))
    temp_df.index = temp_df["SNP"]
    temp_df_maf = gdreg.util.read_pgen(PGEN_FILE.replace("@", "%d" % CHR))["afreq"]
    temp_df_maf.index = temp_df_maf["SNP"]
    temp_df = temp_df.join(temp_df_maf[["MAF"]])
    temp_list.append(temp_df)

df_annot = pd.concat(temp_list, axis=0)
AN_list = [x for x in df_annot if x.startswith("AN:")]

# Remove highly-correlated annotations
df_corr = df_annot[AN_list].corr()
for i in range(len(AN_list)):
    for j in range(i+1, len(AN_list)):
        AN1,AN2 = AN_list[i],AN_list[j]
        if np.absolute(df_corr.loc[AN1, AN2]) > 0.6:
            print('%-50s %-50s r=%0.4f' % (AN1, AN2, df_corr.loc[AN1, AN2]))

AN:Conserved_LindbladToh                           AN:Conserved_Mammal_phastCons46way                 r=0.7849
AN:Conserved_LindbladToh                           AN:Conserved_Vertebrate_phastCons46way             r=0.6782
AN:Conserved_LindbladToh.flanking.500              AN:Conserved_Mammal_phastCons46way.flanking.500    r=0.7599
AN:Conserved_LindbladToh.flanking.500              AN:Conserved_Vertebrate_phastCons46way.flanking.500 r=0.7067
AN:Conserved_Mammal_phastCons46way                 AN:Conserved_Vertebrate_phastCons46way             r=0.7996
AN:Conserved_Mammal_phastCons46way.flanking.500    AN:Conserved_Vertebrate_phastCons46way.flanking.500 r=0.8485
AN:DHS_Trynka                                      AN:DHS_peaks_Trynka                                r=0.7724
AN:DHS_Trynka                                      AN:FetalDHS_Trynka                                 r=0.6833
AN:DHS_peaks_Trynka                                AN:FetalDHS_Trynka                                 r=0.6519

In [3]:
from sklearn.preprocessing import quantile_transform

# MAF bins
df_annot_norm = df_annot[["CHR", "SNP", "BP", "MAF"]].copy()
dic_maf_ind = {
    "common" : df_annot["MAF"].values >= 0.05,
    "lf" : (df_annot["MAF"].values >= 0.005) & (df_annot["MAF"].values < 0.05),
}

# base
df_annot_norm['AN:all'] = True

# MAF_splits
for AN in AN_list:
    for mbin in dic_maf_ind:
        df_annot_norm['%s_%s' % (AN, mbin)] = df_annot[AN].values * dic_maf_ind[mbin]
        
# MAF bins : 10 common bins & 5 lf bins
temp_v = pd.qcut(df_annot['MAF'].values[dic_maf_ind['common']], q=10, labels=False) 
for i in np.arange(10):
    df_annot_norm['AN:mbin%d_common' % i] = False
    df_annot_norm.loc[dic_maf_ind['common'], 'AN:mbin%d_common' % i] = temp_v==i
    
temp_v = pd.qcut(df_annot['MAF'].values[dic_maf_ind['lf']], q=5, labels=False) 
for i in np.arange(5):
    df_annot_norm['AN:mbin%d_lf' % i] = False
    df_annot_norm.loc[dic_maf_ind['lf'], 'AN:mbin%d_lf' % i] = temp_v==i

# MAF adjustment : AN:LLD_AFR_common
for i in np.arange(10):
    ind_select = df_annot_norm['AN:mbin%d_common' % i].values
    temp_v = df_annot_norm.loc[ind_select, 'AN:LLD_AFR_common'].values.copy()
    df_annot_norm.loc[ind_select, 'AN:LLD_AFR_common'] = quantile_transform(
        temp_v.reshape([-1,1]), n_quantiles=min(10000, temp_v.shape[0]), 
        output_distribution='normal', subsample=temp_v.shape[0],
    ).flatten()
        
for i in np.arange(5):
    ind_select = df_annot_norm['AN:mbin%d_lf' % i].values
    temp_v = df_annot_norm.loc[ind_select, 'AN:LLD_AFR_lf'].values.copy()
    df_annot_norm.loc[ind_select, 'AN:LLD_AFR_lf'] = quantile_transform(
        temp_v.reshape([-1,1]), n_quantiles=min(10000, temp_v.shape[0]), 
        output_distribution='normal', subsample=temp_v.shape[0],
    ).flatten()
    
# MAF-adjusted : alleleage_common
ind_nonmissing = (df_annot['AN:alleleage'] != 0).values
for i in np.arange(10):
    ind_select = df_annot_norm['AN:mbin%d_common' % i].values & ind_nonmissing
    temp_v = df_annot_norm.loc[ind_select, 'AN:alleleage_common'].values.copy()
    df_annot_norm.loc[ind_select, 'AN:alleleage_common'] = quantile_transform(
        temp_v.reshape([-1,1]), n_quantiles=min(10000, temp_v.shape[0]), 
        output_distribution='normal', subsample=temp_v.shape[0],
    ).flatten()
    
# drop
df_annot_norm.drop(["AN:alleleage_lf"], axis=1, inplace=True)

# dtype
AN_list_norm = [x for x in df_annot_norm if x.startswith("AN:")]
for AN in AN_list_norm:
    if len(set(df_annot_norm[AN].values[:10000])) <= 2:
        df_annot_norm[AN] = df_annot_norm[AN].astype(bool)
    else:
        df_annot_norm[AN] = df_annot_norm[AN].astype(np.float32)
        
        
# Summary
print('df_annot_norm', df_annot_norm.shape)
print('n_AN = %d' % len([x for x in df_annot_norm if x.startswith('AN:')]))

for CHR in range(1,23):
# for CHR in [22]:
    gdreg.util.write_annot(
        df_annot_norm.loc[df_annot_norm["CHR"]==CHR], OUT_PATH+"/baseline_chr%d.annot.gz" % CHR
    )

# Print information
for AN in AN_list_norm:
    ind_select = df_annot_norm[AN]!=0
    an_size = ind_select.mean()
    maf_min = df_annot_norm.loc[ind_select, "MAF"].min()
    maf_max = df_annot_norm.loc[ind_select, "MAF"].max()
    
    print("{:^55s} {:^15s} {:^15s}".format(
        AN, "size=%0.2g" % an_size, "MAF: %0.2g-%0.2g" % (maf_min, maf_max)
    ))

df_annot_norm (14820648, 177)
n_AN = 173
'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


                        AN:all                              size=1      MAF: 0.001-0.5 
           AN:Backgrd_Selection_Stat_common                size=0.44     MAF: 0.05-0.5 
             AN:Backgrd_Selection_Stat_lf                  size=0.27    MAF: 0.005-0.05
                   AN:BivFlnk_common                      size=0.0058    MAF: 0.05-0.5 
                     AN:BivFlnk_lf                        size=0.0041   MAF: 0.005-0.05
            AN:BivFlnk.flanking.500_common                size=0.0076    MAF: 0.05-0.5 
              AN:BivFlnk.flanking.500_lf                  size=0.005    MAF: 0.005-0.05
                  AN:CADD_g20_common                     size=0.00031    MAF: 0.05-0.5 
                    AN:CADD_g20_lf                       size=0.00045   MAF: 0.005-0.05
                AN:CTCF_Hoffman_common                     size=0.01     MAF: 0.05-0.5 
                  AN:CTCF_Hoffman_lf                      size=0.0064   MAF: 0.005-0.05
          AN:CTCF_Hoffman.flanki

    AN:PromoterFlanking_Hoffman.flanking.500_common       size=0.011     MAF: 0.05-0.5 
      AN:PromoterFlanking_Hoffman.flanking.500_lf         size=0.0072   MAF: 0.005-0.05
                AN:Promoter_UCSC_common                    size=0.02     MAF: 0.05-0.5 
                  AN:Promoter_UCSC_lf                     size=0.014    MAF: 0.005-0.05
         AN:Promoter_UCSC.flanking.500_common             size=0.0047    MAF: 0.05-0.5 
           AN:Promoter_UCSC.flanking.500_lf               size=0.0031   MAF: 0.005-0.05
              AN:Repressed_Hoffman_common                  size=0.2      MAF: 0.05-0.5 
                AN:Repressed_Hoffman_lf                    size=0.12    MAF: 0.005-0.05
       AN:Repressed_Hoffman.flanking.500_common            size=0.11     MAF: 0.05-0.5 
         AN:Repressed_Hoffman.flanking.500_lf             size=0.069    MAF: 0.005-0.05
             AN:SuperEnhancer_Hnisz_common                size=0.073     MAF: 0.05-0.5 
               AN:SuperEnhancer_

### Correctness

In [4]:
# Comparison the MAF-stratified version to baselinev2.2
CHR = 22
ANNOT_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot/baseline_chr@.annot.gz"
df_annot_chr = gdreg.util.read_annot(ANNOT_FILE.replace("@", "%d" % CHR))
df_annot_chr.index = df_annot_chr["SNP"]

ANNOT_FILE_REF = "/n/groups/price/ldsc/reference_files/1000G_EUR_Phase3/baselineLD_v2.2/baselineLD.@.annot.gz"
df_annot_chr_ref = pd.read_csv(ANNOT_FILE_REF.replace("@", "%d" % CHR), sep='\t')
df_annot_chr_ref.index = df_annot_chr_ref["SNP"]

dic_map = {
    "Transcr_Hoffman" : "AN:Transcribed_Hoffman",
    "Transcr_Hoffman.flanking.500" : "AN:Transcribed_Hoffman.flanking.500",
    "GERP.NS" : "AN:GerpN",
    "GERP.RSsup4" : "AN:GerpRS_g4",
    "MAF_Adj_Predicted_Allele_Age" : "AN:alleleage",
    "Nucleotide_Diversity_10kb" : "AN:nucleotide_div",
    "Recomb_Rate_10kb" : "AN:recomb_rate",
    "CpG_Content_50kb" : "AN:CpG",
    "synonymous" : "AN:snpeff_synonymous_variant",
    "non_synonymous" : "AN:snpeff_nonsynonymous_variant",
    "MAF_Adj_LLD_AFR" : "AN:LLD_AFR"
}

snp_list = list(set(df_annot_chr["SNP"]) & set(df_annot_chr_ref["SNP"]))

for AN in df_annot_chr_ref:
    if AN in ["CHR", "BP", "SNP", "CM", "base"]:
        continue
    if AN.startswith("MAFb"):
        continue
    if "AN:%s_common" % AN in df_annot_chr:
        AN1 = AN
        AN2_list = ["AN:%s_%s" % (AN, x) for x in ["common", "lf"]]
    elif AN in dic_map:
        AN1 = AN
        AN2_list = ["%s_%s" % (dic_map[AN], x) for x in ["common", "lf"]]
    else:
        continue
        
    AN2_list = [x for x in AN2_list if x in df_annot_chr]
    
    v1 = df_annot_chr_ref.loc[snp_list, AN1].values
    v2 = df_annot_chr.loc[snp_list, AN2_list].sum(axis=1).values
    print("%-40s Corr=%0.3g, AN1_only=%d, AN2_only=%d, overlap=%d" % (
        AN, 
        np.corrcoef(v1, v2)[0,1], 
        ((v1==1) & (v2==0)).sum(), 
        ((v1==0) & (v2==1)).sum(), 
        ((v1==1) & (v2==1)).sum(), 
    ))
    print(AN2_list, "\n")    

Coding_UCSC                              Corr=0.96, AN1_only=296, AN2_only=0, overlap=3626
['AN:Coding_UCSC_common', 'AN:Coding_UCSC_lf'] 

Coding_UCSC.flanking.500                 Corr=0.966, AN1_only=723, AN2_only=0, overlap=11008
['AN:Coding_UCSC.flanking.500_common', 'AN:Coding_UCSC.flanking.500_lf'] 

Conserved_LindbladToh                    Corr=0.956, AN1_only=288, AN2_only=0, overlap=3134
['AN:Conserved_LindbladToh_common', 'AN:Conserved_LindbladToh_lf'] 

Conserved_LindbladToh.flanking.500       Corr=0.952, AN1_only=2768, AN2_only=0, overlap=40071
['AN:Conserved_LindbladToh.flanking.500_common', 'AN:Conserved_LindbladToh.flanking.500_lf'] 

CTCF_Hoffman                             Corr=0.969, AN1_only=290, AN2_only=0, overlap=4601
['AN:CTCF_Hoffman_common', 'AN:CTCF_Hoffman_lf'] 

CTCF_Hoffman.flanking.500                Corr=0.968, AN1_only=584, AN2_only=0, overlap=9366
['AN:CTCF_Hoffman.flanking.500_common', 'AN:CTCF_Hoffman.flanking.500_lf'] 

DGF_ENCODE                    

WeakEnhancer_Hoffman.flanking.500        Corr=0.963, AN1_only=959, AN2_only=0, overlap=13902
['AN:WeakEnhancer_Hoffman.flanking.500_common', 'AN:WeakEnhancer_Hoffman.flanking.500_lf'] 

GERP.NS                                  Corr=0.796, AN1_only=20, AN2_only=0, overlap=116
['AN:GerpN_common', 'AN:GerpN_lf'] 

GERP.RSsup4                              Corr=0.248, AN1_only=211, AN2_only=3880, overlap=423
['AN:GerpRS_g4_common', 'AN:GerpRS_g4_lf'] 

MAF_Adj_Predicted_Allele_Age             Corr=0.975, AN1_only=0, AN2_only=0, overlap=0
['AN:alleleage_common'] 

MAF_Adj_LLD_AFR                          Corr=0.754, AN1_only=0, AN2_only=0, overlap=0
['AN:LLD_AFR_common', 'AN:LLD_AFR_lf'] 

Recomb_Rate_10kb                         Corr=0.953, AN1_only=0, AN2_only=0, overlap=1
['AN:recomb_rate_common', 'AN:recomb_rate_lf'] 

Nucleotide_Diversity_10kb                Corr=0.676, AN1_only=3, AN2_only=0, overlap=0
['AN:nucleotide_div_common', 'AN:nucleotide_div_lf'] 

Backgrd_Selection_Stat       