In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import gdreg
import time

%load_ext autoreload
%autoreload 2

### Main annot

In [2]:
# Root directory of the per-source baseline annotation files.
DATA_PATH = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot"
# Genotype file prefix handed to gdreg.util.read_pgen; "@" is replaced by the
# chromosome number before use.
PGEN_FILE = "/n/scratch3/users/j/jz286/imp_geno/ukb_imp_chr@_v3"

# Annotation source -> file template under DATA_PATH ("@" = chromosome number).
DIC_ANNOT_FILE = {
    source: DATA_PATH + rel_path
    for source, rel_path in [
        ("bed", "/bed/ukb_imp_chr@_v3.bed.annot.gz"),
        ("diversity", "/nucleotide_diversity/diversity.10kb.@.annot"),
        ("recomb_rate", "/recomb_rate/ukb_imp_chr@_v3.oxford.10kb.recrate.annot"),
        ("CADD", "/CADD/ukb_imp_chr@_v3.CADD"),
        ("snfeff", "/snfeff/ukb_imp_chr@_v3.annot.gz"),
    ]
}

# Column renames applied to the .bed annotations so that these two columns
# follow the "<name>.extend.500" naming used when deriving flanking annotations.
DIC_NAME_UPDATE = dict([
    ("AN:Vahedi_Tcell_SE_500bp", "AN:Vahedi_Tcell_SE.extend.500"),
    ("AN:Vahedi_Tcell_TE_500bp", "AN:Vahedi_Tcell_TE.extend.500"),
])

In [3]:
# Write main annot
# For each autosome, start from the SNP table returned by read_pgen, attach
# every annotation source, and write one merged file under DATA_PATH/main_annot.
# The "n_snp" prints after each step let the reader confirm that the left joins
# did not change the number of rows.
for CHR in range(1, 23):
    print('CHR%d' % CHR)
    # Read the genotype metadata once; it supplies both the SNP list and the MAF.
    dic_pgen = gdreg.util.read_pgen(PGEN_FILE.replace("@", "%d" % CHR))
    df_annot_chr = dic_pgen["pvar"][["CHR", "SNP", "BP"]].copy()
    df_annot_chr['MAF'] = dic_pgen["afreq"]["MAF"].astype(np.float32)
    df_annot_chr.index = df_annot_chr["SNP"]
    print("PGEN n_snp=%d" % df_annot_chr.shape[0])

    # .bed
    temp_df = gdreg.util.read_annot(DIC_ANNOT_FILE["bed"].replace("@", "%d" % CHR))
    temp_df = temp_df.rename(columns=DIC_NAME_UPDATE)
    temp_df.index = temp_df["SNP"]
    # AN.flanking.500 = AN.extend.500 - AN
    # Iterate over a snapshot of the columns because the loop adds and drops columns.
    for AN in list(temp_df.columns):
        AN_extend = "%s.extend.500" % AN
        if AN_extend in temp_df:
            temp_df["%s.flanking.500" % AN] = temp_df[AN_extend].values & (~temp_df[AN].values)
            temp_df = temp_df.drop(columns=AN_extend)
    AN_list = [x for x in temp_df if x.startswith("AN:")]
    df_annot_chr = df_annot_chr.join(temp_df[AN_list])
    print("bed n_snp=%d" % df_annot_chr.shape[0])

    # nucleotide_diversity
    # NOTE(review): values are assigned by position, so the file is assumed to
    # hold one row per SNP in the same order as df_annot_chr -- confirm.
    temp_df = pd.read_csv(
        DIC_ANNOT_FILE["diversity"].replace("@", "%d" % CHR), header=None
    )
    df_annot_chr["AN:nucleotide_div"] = temp_df[0].values
    print("nucleotide_diversity n_snp=%d" % df_annot_chr.shape[0])

    # recomb_rate (same positional assumption as nucleotide_diversity)
    temp_df = pd.read_csv(
        DIC_ANNOT_FILE["recomb_rate"].replace("@", "%d" % CHR), header=None
    )
    df_annot_chr["AN:recomb_rate"] = temp_df[0].values
    print("recomb_rate n_snp=%d" % df_annot_chr.shape[0])

    # CADD
    # Only the columns used below are parsed; the remaining columns of the
    # CADD table are never read.
    cadd_cols = ["CADD:CpG", "CADD:GerpRS", "CADD:GerpN", "CADD:PHRED"]
    temp_df = pd.read_csv(
        DIC_ANNOT_FILE["CADD"].replace("@", "%d" % CHR), sep='\t',
        usecols=["SNP"] + cadd_cols,
    )
    # Keep the first record per SNP so the join below cannot duplicate rows.
    temp_df = temp_df.drop_duplicates(subset="SNP").set_index("SNP")
    # Build a fresh frame rather than mutating a column slice; GerpRS and PHRED
    # are binarised at thresholds 4 and 20 (missing values compare as False).
    temp_df = pd.DataFrame({
        "AN:CpG": temp_df["CADD:CpG"],
        "AN:GerpRS_g4": temp_df["CADD:GerpRS"] >= 4,
        "AN:GerpN": temp_df["CADD:GerpN"],
        "AN:CADD_g20": temp_df["CADD:PHRED"] >= 20,
    })
    df_annot_chr = df_annot_chr.join(temp_df)
    print("CADD n_snp=%d" % df_annot_chr.shape[0])

    # snpeff
    temp_df = gdreg.util.read_annot(DIC_ANNOT_FILE["snfeff"].replace("@", "%d" % CHR))
    temp_df.index = temp_df["SNP"]
    drop_list = ["AN:snpeff_3_prime_UTR_variant", "AN:snpeff_5_prime_UTR_variant",
                 "AN:snpeff_intergenic_region", "AN:snpeff_intron_variant"]
    temp_df = temp_df.drop(columns=drop_list)
    AN_list = [x for x in temp_df if x.startswith("AN:")]
    df_annot_chr = df_annot_chr.join(temp_df[AN_list])
    print("snpeff n_snp=%d" % df_annot_chr.shape[0])

    print('writing')
    # SNPs absent from a joined source are treated as not annotated (0).
    df_annot_chr = df_annot_chr.fillna(0)
    AN_list = [x for x in df_annot_chr if x.startswith("AN:")]
    for AN in AN_list:
        # Decide bool vs float from the WHOLE column: an annotation is binary
        # only if every value is 0 or 1. Inspecting just the leading rows can
        # misclassify a continuous annotation and silently cast it to bool.
        col_values = df_annot_chr[AN].astype(np.float32)
        if col_values.isin([0, 1]).all():
            df_annot_chr[AN] = col_values.astype(bool)
        else:
            df_annot_chr[AN] = col_values
    gdreg.util.write_annot(df_annot_chr, DATA_PATH+"/main_annot/ukb_imp_chr%d_v3.annot.gz" % CHR)

CHR1
PGEN n_snp=1161341
bed n_snp=1161341
nucleotide_diversity n_snp=1161341
recomb_rate n_snp=1161341


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1161341
snpeff n_snp=1161341
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR2
PGEN n_snp=1259312
bed n_snp=1259312
nucleotide_diversity n_snp=1259312
recomb_rate n_snp=1259312


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1259312
snpeff n_snp=1259312
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR3
PGEN n_snp=1059605
bed n_snp=1059605
nucleotide_diversity n_snp=1059605
recomb_rate n_snp=1059605


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1059605
snpeff n_snp=1059605
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR4
PGEN n_snp=1074447
bed n_snp=1074447
nucleotide_diversity n_snp=1074447
recomb_rate n_snp=1074447


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=1074447
snpeff n_snp=1074447
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR5
PGEN n_snp=964806
bed n_snp=964806
nucleotide_diversity n_snp=964806
recomb_rate n_snp=964806


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=964806
snpeff n_snp=964806
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR6
PGEN n_snp=976090
bed n_snp=976090
nucleotide_diversity n_snp=976090
recomb_rate n_snp=976090


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=976090
snpeff n_snp=976090
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR7
PGEN n_snp=868301
bed n_snp=868301
nucleotide_diversity n_snp=868301
recomb_rate n_snp=868301


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=868301
snpeff n_snp=868301
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR8
PGEN n_snp=826076
bed n_snp=826076
nucleotide_diversity n_snp=826076
recomb_rate n_snp=826076


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=826076
snpeff n_snp=826076
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR9
PGEN n_snp=641608
bed n_snp=641608
nucleotide_diversity n_snp=641608
recomb_rate n_snp=641608


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=641608
snpeff n_snp=641608
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR10
PGEN n_snp=747773
bed n_snp=747773
nucleotide_diversity n_snp=747773
recomb_rate n_snp=747773


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=747773
snpeff n_snp=747773
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR11
PGEN n_snp=730385
bed n_snp=730385
nucleotide_diversity n_snp=730385
recomb_rate n_snp=730385


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=730385
snpeff n_snp=730385
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR12
PGEN n_snp=705493
bed n_snp=705493
nucleotide_diversity n_snp=705493
recomb_rate n_snp=705493


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=705493
snpeff n_snp=705493
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR13
PGEN n_snp=537961
bed n_snp=537961
nucleotide_diversity n_snp=537961
recomb_rate n_snp=537961


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=537961
snpeff n_snp=537961
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR14
PGEN n_snp=482519
bed n_snp=482519
nucleotide_diversity n_snp=482519
recomb_rate n_snp=482519


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=482519
snpeff n_snp=482519
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR15
PGEN n_snp=423991
bed n_snp=423991
nucleotide_diversity n_snp=423991
recomb_rate n_snp=423991


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=423991
snpeff n_snp=423991
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR16
PGEN n_snp=465661
bed n_snp=465661
nucleotide_diversity n_snp=465661
recomb_rate n_snp=465661


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=465661
snpeff n_snp=465661
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR17
PGEN n_snp=406065
bed n_snp=406065
nucleotide_diversity n_snp=406065
recomb_rate n_snp=406065


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=406065
snpeff n_snp=406065
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR18
PGEN n_snp=422346
bed n_snp=422346
nucleotide_diversity n_snp=422346
recomb_rate n_snp=422346


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=422346
snpeff n_snp=422346
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR19
PGEN n_snp=336680
bed n_snp=336680
nucleotide_diversity n_snp=336680
recomb_rate n_snp=336680


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=336680
snpeff n_snp=336680
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR20
PGEN n_snp=329864
bed n_snp=329864
nucleotide_diversity n_snp=329864
recomb_rate n_snp=329864


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=329864
snpeff n_snp=329864
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR21
PGEN n_snp=200644
bed n_snp=200644
nucleotide_diversity n_snp=200644
recomb_rate n_snp=200644


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=200644
snpeff n_snp=200644
writing
'CM' missing from df_annot.columns, add 'CM' column with 0
CHR22
PGEN n_snp=199680
bed n_snp=199680
nucleotide_diversity n_snp=199680
recomb_rate n_snp=199680


  interactivity=interactivity, compiler=compiler, result=result)


CADD n_snp=199680
snpeff n_snp=199680
writing
'CM' missing from df_annot.columns, add 'CM' column with 0


In [4]:
# Comparison to baselinev2.2
# For one chromosome, line up every baselineLD v2.2 column with its counterpart
# in the main annotation (same name with an "AN:" prefix, or via dic_map) and
# report the correlation plus 0/1 disagreement counts over the shared SNPs.
CHR = 21
ANNOT_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot/main_annot/ukb_imp_chr@_v3.annot.gz"
ANNOT_FILE_REF = "/n/groups/price/ldsc/reference_files/1000G_EUR_Phase3/baselineLD_v2.2/baselineLD.@.annot.gz"

df_annot_chr = gdreg.util.read_annot(ANNOT_FILE.replace("@", "%d" % CHR))
df_annot_chr.index = df_annot_chr["SNP"]

df_annot_chr_ref = pd.read_csv(ANNOT_FILE_REF.replace("@", "%d" % CHR), sep='\t')
df_annot_chr_ref.index = df_annot_chr_ref["SNP"]

# Reference column name -> main-annotation column name, for columns whose
# names differ by more than the "AN:" prefix.
dic_map = {
    "Transcr_Hoffman" : "AN:Transcribed_Hoffman",
    "Transcr_Hoffman.flanking.500" : "AN:Transcribed_Hoffman.flanking.500",
    "GERP.NS" : "AN:GerpN",
    "GERP.RSsup4" : "AN:GerpRS_g4",
    "MAF_Adj_Predicted_Allele_Age" : "AN:alleleage",
    "Nucleotide_Diversity_10kb" : "AN:nucleotide_div",
    "Recomb_Rate_10kb" : "AN:recomb_rate",
    "CpG_Content_50kb" : "AN:CpG",
    "synonymous" : "AN:snpeff_synonymous_variant",
    "non_synonymous" : "AN:snpeff_nonsynonymous_variant",
}

# SNPs present in both tables; both value vectors below are indexed by this list.
snp_list = list(set(df_annot_chr["SNP"]) & set(df_annot_chr_ref["SNP"]))
AN2_list = []

for AN in df_annot_chr_ref:
    # Metadata columns and the reference MAF-bin columns are not compared.
    if AN.startswith("MAFb") or AN in ["CHR", "BP", "SNP", "CM", "base"]:
        continue

    prefixed = "AN:%s" % AN
    if prefixed in df_annot_chr:
        AN2 = prefixed
    elif AN in dic_map:
        AN2 = dic_map[AN]
    else:
        # No counterpart in the main annotation: print the bare name and move on.
        print(AN)
        continue
    AN1 = AN

    AN2_list.append(AN2)
    v1 = df_annot_chr_ref.loc[snp_list, AN1].values
    v2 = df_annot_chr.loc[snp_list, AN2].values

    # Show the main-annotation name in parentheses only when it differs.
    alias = AN2.replace("AN:", "")
    label = "%s (%s) " % (AN, alias if AN != alias else "")
    corr = np.corrcoef(v1, v2)[0,1]
    n_ref_only = ((v1==1) & (v2==0)).sum()
    n_main_only = ((v1==0) & (v2==1)).sum()
    n_both = ((v1==1) & (v2==1)).sum()
    print("%-60s Corr=%0.3g, AN1_only=%d, AN2_only=%d, overlap=%d" % (
        label, corr, n_ref_only, n_main_only, n_both,
    ))

# Main-annotation columns that were not matched to any reference column.
for col in df_annot_chr:
    if (not col.startswith("AN:")) or (col in AN2_list):
        continue
    print("(%s)" % col.replace("AN:", ""))

Coding_UCSC ()                                               Corr=1, AN1_only=0, AN2_only=0, overlap=1964
Coding_UCSC.flanking.500 ()                                  Corr=1, AN1_only=0, AN2_only=0, overlap=6735
Conserved_LindbladToh ()                                     Corr=1, AN1_only=0, AN2_only=0, overlap=3508
Conserved_LindbladToh.flanking.500 ()                        Corr=1, AN1_only=0, AN2_only=0, overlap=44628
CTCF_Hoffman ()                                              Corr=1, AN1_only=0, AN2_only=0, overlap=2916
CTCF_Hoffman.flanking.500 ()                                 Corr=1, AN1_only=0, AN2_only=0, overlap=5837
DGF_ENCODE ()                                                Corr=1, AN1_only=0, AN2_only=0, overlap=19151
DGF_ENCODE.flanking.500 ()                                   Corr=1, AN1_only=0, AN2_only=0, overlap=51448
DHS_peaks_Trynka
DHS_Trynka ()                                                Corr=1, AN1_only=0, AN2_only=0, overlap=22693
DHS_Trynka.flanking.500 (

### MAF-Stratified annots

In [5]:
# Load main_annot
# Read the per-chromosome main annotation files, attach the MAF from the
# genotype metadata, and stack all chromosomes into a single table.
df_annot = None
PGEN_FILE = "/n/scratch3/users/j/jz286/imp_geno/ukb_imp_chr@_v3"
ANNOT_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot/main_annot/ukb_imp_chr@_v3.annot.gz"
OUT_PATH = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot"

temp_list = []
for CHR in range(1, 23):
    temp_df = gdreg.util.read_annot(ANNOT_FILE.replace("@", "%d" % CHR))
    temp_df = temp_df.set_index("SNP", drop=False)
    temp_df_maf = gdreg.util.read_pgen(PGEN_FILE.replace("@", "%d" % CHR))["afreq"]
    temp_df_maf = temp_df_maf.set_index("SNP", drop=False)
    # Left join on SNP id so each annotation row picks up its MAF.
    temp_df = temp_df.join(temp_df_maf[["MAF"]])
    temp_list.append(temp_df)

df_annot = pd.concat(temp_list, axis=0)
AN_list = [x for x in df_annot if x.startswith("AN:")]

# Report highly-correlated annotation pairs (|corr| > 0.5).
# Nothing is removed here; the pairs are only printed for inspection.
df_corr = df_annot[AN_list].corr()
for i, AN1 in enumerate(AN_list):
    # Each unordered pair is visited once (upper triangle of the matrix).
    for AN2 in AN_list[i+1:]:
        pair_corr = df_corr.loc[AN1, AN2]
        if np.absolute(pair_corr) > 0.5:
            print(AN1, AN2, pair_corr)

AN:Coding_UCSC AN:Intron_UCSC.flanking.500 0.5954851469487258
AN:Coding_UCSC AN:UTR_3_UCSC 0.5966605528484225
AN:Conserved_LindbladToh AN:Conserved_Mammal_phastCons46way 0.7848986778487758
AN:Conserved_LindbladToh AN:Conserved_Primate_phastCons46way 0.5867114564501255
AN:Conserved_LindbladToh AN:Conserved_Vertebrate_phastCons46way 0.6782357978001979
AN:Conserved_LindbladToh.flanking.500 AN:Conserved_Mammal_phastCons46way.flanking.500 0.7599474280806244
AN:Conserved_LindbladToh.flanking.500 AN:Conserved_Vertebrate_phastCons46way.flanking.500 0.7067474307348826
AN:Conserved_Mammal_phastCons46way AN:Conserved_Primate_phastCons46way 0.5955074928777135
AN:Conserved_Mammal_phastCons46way AN:Conserved_Vertebrate_phastCons46way 0.7996170663293057
AN:Conserved_Mammal_phastCons46way.flanking.500 AN:Conserved_Primate_phastCons46way.flanking.500 0.514123582589877
AN:Conserved_Mammal_phastCons46way.flanking.500 AN:Conserved_Vertebrate_phastCons46way.flanking.500 0.8485282559463706
AN:Conserved_Prim

In [7]:
# MAF bins
# Build the MAF-stratified annotation table from df_annot / AN_list, write one
# file per chromosome to OUT_PATH, then print a size / MAF-range summary.
df_annot_norm = df_annot[["CHR", "SNP", "BP", "MAF"]].copy()
maf_values = df_annot["MAF"].values
# Half-open, mutually exclusive bins: rare < 0.005 <= lf < 0.05 <= common.
# (Using "<= 0.005" for rare would put SNPs with MAF == 0.005 in both lf and rare.)
dic_maf_ind = {
    "common" : maf_values >= 0.05,
    "lf" : (maf_values >= 0.005) & (maf_values < 0.05),
    "rare" : maf_values < 0.005,
}

# MAF_splits: one copy of every annotation per MAF bin, zeroed outside the bin.
for AN in AN_list:
    for mbin in dic_maf_ind:
        df_annot_norm['%s_%s' % (AN, mbin)] = df_annot[AN].values * dic_maf_ind[mbin]

# MAF bins: five MAF quantile indicators within each of common / lf / rare.
for mbin in dic_maf_ind:
    temp_v = pd.qcut(df_annot['MAF'].values[dic_maf_ind[mbin]], q=5, labels=False)
    for i in range(5):
        df_annot_norm['AN:mbin%d_%s' % (i, mbin)] = False
        df_annot_norm.loc[dic_maf_ind[mbin], 'AN:mbin%d_%s' % (i, mbin)] = temp_v==i


# MAF adjustment: standardise allele age (zero mean, unit SD) separately within
# each common-SNP MAF quantile.
AN = "AN:alleleage_common"
for i in range(5):
    ind_select = df_annot_norm['AN:mbin%d_common' % i].values
    temp_v = df_annot_norm.loc[ind_select, AN].values.copy()
    df_annot_norm.loc[ind_select, AN] = (temp_v-temp_v.mean())/np.std(temp_v)

# drop: allele age is kept for common SNPs only
drop_list = ["AN:alleleage_lf", "AN:alleleage_rare"]
df_annot_norm = df_annot_norm.drop(columns=drop_list)

# dtype
AN_list_norm = [x for x in df_annot_norm if x.startswith("AN:")]
for AN in AN_list_norm:
    # Decide bool vs float from the WHOLE column: an annotation is binary only
    # if every value is 0 or 1. Inspecting just the leading rows can
    # misclassify a continuous annotation and silently cast it to bool.
    col_values = df_annot_norm[AN].astype(np.float32)
    if col_values.isin([0, 1]).all():
        df_annot_norm[AN] = col_values.astype(bool)
    else:
        df_annot_norm[AN] = col_values

for CHR in range(1, 23):
    # Pass an explicit copy: the captured output shows write_annot adding a
    # 'CM' column to its input, which raises SettingWithCopyWarning on a slice.
    gdreg.util.write_annot(
        df_annot_norm.loc[df_annot_norm["CHR"]==CHR].copy(), OUT_PATH+"/baseline_chr%d.annot.gz" % CHR
    )

# Print information
for AN in AN_list_norm:
    ind_select = df_annot_norm[AN]!=0
    an_size = ind_select.mean()
    # Annotations covering < 0.1% of SNPs are only recorded in drop_list here;
    # they have already been written above and are not removed by this cell.
    if an_size<0.001:
        drop_list.append(AN)
    maf_min = df_annot_norm.loc[ind_select, "MAF"].min()
    maf_max = df_annot_norm.loc[ind_select, "MAF"].max()

    print("{:^55s} {:^15s} {:^15s}".format(
        AN, "size=%0.2g" % an_size, "MAF: %0.2g-%0.2g" % (maf_min, maf_max)
    ))

'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


'CM' missing from df_annot.columns, add 'CM' column with 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_annot["CM"] = 0


           AN:Backgrd_Selection_Stat_common                size=0.44     MAF: 0.05-0.5 
             AN:Backgrd_Selection_Stat_lf                  size=0.27    MAF: 0.005-0.05
            AN:Backgrd_Selection_Stat_rare                 size=0.29    MAF: 0.001-0.005
                   AN:BivFlnk_common                      size=0.0058    MAF: 0.05-0.5 
                     AN:BivFlnk_lf                        size=0.0041   MAF: 0.005-0.05
                    AN:BivFlnk_rare                       size=0.0045   MAF: 0.001-0.005
            AN:BivFlnk.flanking.500_common                size=0.0076    MAF: 0.05-0.5 
              AN:BivFlnk.flanking.500_lf                  size=0.005    MAF: 0.005-0.05
             AN:BivFlnk.flanking.500_rare                 size=0.0053   MAF: 0.001-0.005
                  AN:CADD_g20_common                     size=0.00031    MAF: 0.05-0.5 
                    AN:CADD_g20_lf                       size=0.00045   MAF: 0.005-0.05
                   AN:CADD_g2

                AN:H3K27ac_PGC2_common                     size=0.12     MAF: 0.05-0.5 
                  AN:H3K27ac_PGC2_lf                      size=0.076    MAF: 0.005-0.05
                 AN:H3K27ac_PGC2_rare                     size=0.082    MAF: 0.001-0.005
          AN:H3K27ac_PGC2.flanking.500_common             size=0.029     MAF: 0.05-0.5 
            AN:H3K27ac_PGC2.flanking.500_lf               size=0.018    MAF: 0.005-0.05
           AN:H3K27ac_PGC2.flanking.500_rare               size=0.02    MAF: 0.001-0.005
               AN:H3K4me1_Trynka_common                    size=0.19     MAF: 0.05-0.5 
                 AN:H3K4me1_Trynka_lf                      size=0.12    MAF: 0.005-0.05
                AN:H3K4me1_Trynka_rare                     size=0.13    MAF: 0.001-0.005
         AN:H3K4me1_Trynka.flanking.500_common            size=0.081     MAF: 0.05-0.5 
           AN:H3K4me1_Trynka.flanking.500_lf               size=0.05    MAF: 0.005-0.05
          AN:H3K4me1_Trynka.f

          AN:Vahedi_Tcell_TE.flanking.500_lf              size=0.0012   MAF: 0.005-0.05
         AN:Vahedi_Tcell_TE.flanking.500_rare             size=0.0013   MAF: 0.001-0.005
            AN:WeakEnhancer_Hoffman_common                size=0.0089    MAF: 0.05-0.5 
              AN:WeakEnhancer_Hoffman_lf                  size=0.0059   MAF: 0.005-0.05
             AN:WeakEnhancer_Hoffman_rare                 size=0.0065   MAF: 0.001-0.005
      AN:WeakEnhancer_Hoffman.flanking.500_common          size=0.03     MAF: 0.05-0.5 
        AN:WeakEnhancer_Hoffman.flanking.500_lf           size=0.019    MAF: 0.005-0.05
       AN:WeakEnhancer_Hoffman.flanking.500_rare           size=0.02    MAF: 0.001-0.005
                  AN:alleleage_common                      size=0.44     MAF: 0.05-0.5 
               AN:nucleotide_div_common                    size=0.44     MAF: 0.05-0.5 
                 AN:nucleotide_div_lf                      size=0.27    MAF: 0.005-0.05
                AN:nucleotide

### Correctness

In [8]:
# Comparison of the MAF-stratified version to baselinev2.2
# For one chromosome, sum the common / lf / rare copies of each annotation and
# compare that sum with the matching baselineLD v2.2 column over the shared SNPs.
CHR = 21
ANNOT_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/baseline_annot/baseline_chr@.annot.gz"
ANNOT_FILE_REF = "/n/groups/price/ldsc/reference_files/1000G_EUR_Phase3/baselineLD_v2.2/baselineLD.@.annot.gz"

df_annot_chr = gdreg.util.read_annot(ANNOT_FILE.replace("@", "%d" % CHR))
df_annot_chr.index = df_annot_chr["SNP"]

df_annot_chr_ref = pd.read_csv(ANNOT_FILE_REF.replace("@", "%d" % CHR), sep='\t')
df_annot_chr_ref.index = df_annot_chr_ref["SNP"]

# Reference column name -> annotation name stem, for columns whose names
# differ by more than the "AN:" prefix.
dic_map = {
    "Transcr_Hoffman" : "AN:Transcribed_Hoffman",
    "Transcr_Hoffman.flanking.500" : "AN:Transcribed_Hoffman.flanking.500",
    "GERP.NS" : "AN:GerpN",
    "GERP.RSsup4" : "AN:GerpRS_g4",
    "MAF_Adj_Predicted_Allele_Age" : "AN:alleleage",
    "Nucleotide_Diversity_10kb" : "AN:nucleotide_div",
    "Recomb_Rate_10kb" : "AN:recomb_rate",
    "CpG_Content_50kb" : "AN:CpG",
    "synonymous" : "AN:snpeff_synonymous_variant",
    "non_synonymous" : "AN:snpeff_nonsynonymous_variant",
}

# SNPs present in both tables; both value vectors below are indexed by this list.
snp_list = list(set(df_annot_chr["SNP"]) & set(df_annot_chr_ref["SNP"]))

for AN in df_annot_chr_ref:
    # Metadata columns and the reference MAF-bin columns are not compared.
    if AN.startswith("MAFb") or AN in ["CHR", "BP", "SNP", "CM", "base"]:
        continue

    # Resolve the stem shared by the three MAF-stratified copies.
    if "AN:%s_common" % AN in df_annot_chr:
        stem = "AN:%s" % AN
    elif AN in dic_map:
        stem = dic_map[AN]
    else:
        continue
    AN1 = AN

    # Keep only the stratified columns that actually exist in the table.
    AN2_list = [
        x for x in ("%s_%s" % (stem, mbin) for mbin in ["common", "lf", "rare"])
        if x in df_annot_chr
    ]

    v1 = df_annot_chr_ref.loc[snp_list, AN1].values
    v2 = df_annot_chr.loc[snp_list, AN2_list].sum(axis=1).values
    corr = np.corrcoef(v1, v2)[0,1]
    n_ref_only = ((v1==1) & (v2==0)).sum()
    n_strat_only = ((v1==0) & (v2==1)).sum()
    n_both = ((v1==1) & (v2==1)).sum()
    print("%-40s Corr=%0.3g, AN1_only=%d, AN2_only=%d, overlap=%d" % (
        AN, corr, n_ref_only, n_strat_only, n_both,
    ))
    print(AN2_list, "\n")
    

Coding_UCSC                              Corr=1, AN1_only=0, AN2_only=0, overlap=1964
['AN:Coding_UCSC_common', 'AN:Coding_UCSC_lf', 'AN:Coding_UCSC_rare'] 

Coding_UCSC.flanking.500                 Corr=1, AN1_only=0, AN2_only=0, overlap=6735
['AN:Coding_UCSC.flanking.500_common', 'AN:Coding_UCSC.flanking.500_lf', 'AN:Coding_UCSC.flanking.500_rare'] 

Conserved_LindbladToh                    Corr=1, AN1_only=0, AN2_only=0, overlap=3508
['AN:Conserved_LindbladToh_common', 'AN:Conserved_LindbladToh_lf', 'AN:Conserved_LindbladToh_rare'] 

Conserved_LindbladToh.flanking.500       Corr=1, AN1_only=0, AN2_only=0, overlap=44628
['AN:Conserved_LindbladToh.flanking.500_common', 'AN:Conserved_LindbladToh.flanking.500_lf', 'AN:Conserved_LindbladToh.flanking.500_rare'] 

CTCF_Hoffman                             Corr=1, AN1_only=0, AN2_only=0, overlap=2916
['AN:CTCF_Hoffman_common', 'AN:CTCF_Hoffman_lf', 'AN:CTCF_Hoffman_rare'] 

CTCF_Hoffman.flanking.500                Corr=1, AN1_only=0, AN2_onl

UTR_5_UCSC                               Corr=1, AN1_only=0, AN2_only=0, overlap=863
['AN:UTR_5_UCSC_common', 'AN:UTR_5_UCSC_lf', 'AN:UTR_5_UCSC_rare'] 

UTR_5_UCSC.flanking.500                  Corr=1, AN1_only=0, AN2_only=0, overlap=3302
['AN:UTR_5_UCSC.flanking.500_common', 'AN:UTR_5_UCSC.flanking.500_lf', 'AN:UTR_5_UCSC.flanking.500_rare'] 

WeakEnhancer_Hoffman                     Corr=1, AN1_only=0, AN2_only=0, overlap=2669
['AN:WeakEnhancer_Hoffman_common', 'AN:WeakEnhancer_Hoffman_lf', 'AN:WeakEnhancer_Hoffman_rare'] 

WeakEnhancer_Hoffman.flanking.500        Corr=1, AN1_only=0, AN2_only=0, overlap=8175
['AN:WeakEnhancer_Hoffman.flanking.500_common', 'AN:WeakEnhancer_Hoffman.flanking.500_lf', 'AN:WeakEnhancer_Hoffman.flanking.500_rare'] 

GERP.NS                                  Corr=0.822, AN1_only=27, AN2_only=0, overlap=103
['AN:GerpN_common', 'AN:GerpN_lf', 'AN:GerpN_rare'] 

GERP.RSsup4                              Corr=0.248, AN1_only=170, AN2_only=4054, overlap=412
['AN: