In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import time
import os
import ldspec
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from statsmodels.stats.multitest import multipletests


# autoreload
%load_ext autoreload
%autoreload 2

### Trait info

In [2]:
DIC_NAME = {
    'blood_PLATELET_COUNT': 'Platelet Count', 
    'blood_RBC_DISTRIB_WIDTH': 'Red Blood Cell Distribution Width',
    'blood_RED_COUNT': 'Red Blood Cell Count',
    'blood_WHITE_COUNT': 'White Blood Cell Count', 
    'bmd_HEEL_TSCOREz': 'Heel T Score', 
    'body_BALDING1': 'Balding Type I', 
    'body_BMIz': 'BMI',
    'body_HEIGHTz': 'Height',
    'body_WHRadjBMIz': 'Waist-hip Ratio',
    'bp_DIASTOLICadjMEDz': 'Diastolic Blood Pressure',
    'cov_EDU_YEARS': 'Years of Education',
    'disease_ALLERGY_ECZEMA_DIAGNOSED': 'Eczema',
    'lung_FEV1FVCzSMOKE': 'FEV1-FVC Ratio',
    'lung_FVCzSMOKE': 'Forced Vital Capacity (FVC)',
    'mental_NEUROTICISM': 'Neuroticism',
    'other_MORNINGPERSON': 'Morning Person',
#     'pigment_SUNBURN': 'Sunburn Occasion',
    'repro_MENARCHE_AGE': 'Age at Menarche',
    'repro_MENOPAUSE_AGE': 'Age at Menopause',
    'repro_NumberChildrenEverBorn_Pooled': 'Number Children (Pooled)',
#     'cancer_BREAST' : 'Breast Cancer',
#     'cancer_PROSTATE' : 'Prostate Cancer',
    'disease_HYPOTHYROIDISM_SELF_REP' : 'Hypothyroidism',
    'biochemistry_AlkalinePhosphatase' : 'Alkaline Phosphatase',
    'biochemistry_AspartateAminotransferase' : 'Aspartate Aminotransferase',
    'biochemistry_Cholesterol' : 'Cholesterol', 
    'biochemistry_Creatinine' : 'Creatinine',
    'biochemistry_IGF1' : 'IGF1',
    'biochemistry_Phosphate' : 'Phosphate',
    'biochemistry_TotalBilirubin' : 'Total Bilirubin',
    'biochemistry_TotalProtein' : 'Total Protein',
    'biochemistry_VitaminD' : 'Vitamin D',
}

DIC_NAME_OTHER = {
    'biochemistry_AlanineAminotransferase' : 'Alanine Aminotransferase',
    'biochemistry_Albumin' : 'Albumin',
    'biochemistry_ApolipoproteinA' : 'Apolipoprotein A',
    'biochemistry_ApolipoproteinB' : 'Apolipoprotein B',
    'biochemistry_Calcium' : 'Calcium',
    'biochemistry_CreactiveProtein' : 'C-reactive Protein',
    'biochemistry_CystatinC' : 'Cystatin C',
    'biochemistry_DirectBilirubin' : 'Direct Bilirubin',
    'biochemistry_GammaGlutamyltransferase' : 'Gamma Glutamyl Transferase',
    'biochemistry_Glucose' : 'Glucose',
    'biochemistry_HDLcholesterol' : 'HDL Cholesterol',
    'biochemistry_HbA1c' : 'HbA1c',
    'biochemistry_LDLdirect' : 'LDL Direct',
    'biochemistry_SHBG' : 'SHBG',
    'biochemistry_Triglycerides' : 'Triglycerides',
    'biochemistry_Urate' : 'Urate',
    'biochemistry_Urea' : 'Urea',
    'blood_EOSINOPHIL_COUNT' : 'Eosinophil Count',
    'blood_HIGH_LIGHT_SCATTER_RETICULOCYTE_COUNT' : 'High Light Scatter Reticulocyte Count',
    'blood_LYMPHOCYTE_COUNT' : 'Lymphocyte Count',
    'blood_MEAN_CORPUSCULAR_HEMOGLOBIN' : 'Mean Corpular Hemoglobin',
    'blood_MEAN_PLATELET_VOL' : 'Mean Platelet Volume',
    'blood_MEAN_SPHERED_CELL_VOL' : 'Mean Sphered Cell Volume',
    'blood_MONOCYTE_COUNT' : 'Monocyte Count',
    'blood_PLATELET_DISTRIB_WIDTH' : 'Platelet Distribution Width',
    'body_BALDING4' : 'Balding Type IV',
    'bp_SYSTOLICadjMEDz' : 'Systolic Blood Pressure',
    'cov_EDU_COLLEGE' : 'College Education',
    'cov_SMOKING_STATUS' : 'Smoking Status',
    'disease_AID_ALL' : 'Auto Immune Traits',
    'disease_ASTHMA_DIAGNOSED' : 'Asthma',
    'disease_CARDIOVASCULAR' : 'Cardiovascular Diseases',
    'disease_DIABETES_ANY_DIAGNOSED' : 'Diabetes',
    'disease_ENDOCRINE_DIABETES' : 'Endocrine and Diabetes Diseases',
    'disease_HI_CHOL_SELF_REP' : 'High Cholesterol',
    'disease_HYPERTENSION_DIAGNOSED' : 'Hypertension',
    'disease_RESPIRATORY_ENT' : 'Respiratory and Ear-nose-throat Diseases',
    'disease_T2D' : 'Type 2 Diabetes',
    'disease_THYROID_ANY_SELF_REP' : 'Thyroid',
    'impedance_BASAL_METABOLIC_RATEz' : 'Basal Metabolic Rate',
    'repro_AgeFirstBirth_Female' : 'Age at First Birth (Female)',
}

In [3]:
df_info = pd.DataFrame(index=sorted(list(DIC_NAME)+list(DIC_NAME_OTHER)))
df_info['Trait Name'] = [DIC_NAME[x] if x in DIC_NAME else DIC_NAME_OTHER[x] for x in df_info.index]
df_info['Trait_Identifier'] = df_info.index
df_info['Indpt'] = [x in DIC_NAME for x in df_info.index]

sumstats_file = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/sumstats/@.nomhc.sumstats.gz'
df_info['N'] = [pd.read_csv(sumstats_file.replace('@', x), nrows=10, sep='\t')['N'].median().astype(int)
                for x in df_info.index]

# Heritability from LD-SPEC
res_file = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/ldspec_res_092223.prox_gene_fct_all_ld/@'
df_info['h2'] = 0
df_info['h2_se'] = 0
for trait in df_info.index:
    temp_df = pd.read_csv(res_file.replace('@', trait)+'.tau.tsv', sep='\t', index_col=0)
    df_info.loc[trait, ['h2', 'h2_se']] = temp_df.loc['AN:all', ['h2', 'h2_se']]
df_info['h2_z'] = df_info['h2'] / df_info['h2_se']
    
df_info.to_csv('/n/groups/price/martin/LDSPEC_data/results/tables/trait_info.tsv', sep='\t', index=False)
print('Avg. N = %d' % df_info['N'].mean())
print('Avg. N_indpt = %d' % df_info.loc[df_info['Indpt'], 'N'].mean())
print('min h2_z = %0.3f' % df_info['h2_z'].min())
display(df_info)

# Write trait list
with open('/n/groups/price/martin/LDSPEC_data/UKBB_trait/trait_list_paper.txt', 'w') as f:
    for trait in df_info.index:
        f.write('%s\n'%trait)        
with open('/n/groups/price/martin/LDSPEC_data/UKBB_trait/trait_list_paper_indpt.txt', 'w') as f:
    for trait in df_info.index[df_info['Indpt']]:
        f.write('%s\n'%trait)
with open('/n/groups/price/martin/LDSPEC_data/UKBB_trait/trait_list_paper_other.txt', 'w') as f:
    for trait in df_info.index[~df_info['Indpt']]:
        f.write('%s\n'%trait)

Avg. N = 305646
Avg. N_indpt = 298430
min h2_z = 5.948


Unnamed: 0,Trait Name,Trait_Identifier,Indpt,N,h2,h2_se,h2_z
biochemistry_AlanineAminotransferase,Alanine Aminotransferase,biochemistry_AlanineAminotransferase,False,317355,0.147316,0.010030,14.688030
biochemistry_Albumin,Albumin,biochemistry_Albumin,False,292543,0.189497,0.011304,16.763759
biochemistry_AlkalinePhosphatase,Alkaline Phosphatase,biochemistry_AlkalinePhosphatase,True,318768,0.294509,0.036139,8.149350
biochemistry_ApolipoproteinA,Apolipoprotein A,biochemistry_ApolipoproteinA,False,291023,0.261099,0.021496,12.146281
biochemistry_ApolipoproteinB,Apolipoprotein B,biochemistry_ApolipoproteinB,False,318292,0.172199,0.023906,7.203151
...,...,...,...,...,...,...,...
other_MORNINGPERSON,Morning Person,other_MORNINGPERSON,True,301336,0.138086,0.006923,19.945305
repro_AgeFirstBirth_Female,Age at First Birth (Female),repro_AgeFirstBirth_Female,False,123910,0.241422,0.013692,17.632939
repro_MENARCHE_AGE,Age at Menarche,repro_MENARCHE_AGE,True,176122,0.295766,0.014629,20.217575
repro_MENOPAUSE_AGE,Age at Menopause,repro_MENOPAUSE_AGE,True,104413,0.145326,0.017618,8.248823


### Main single-SNP annotations and the 165 annotation baseline model

In [15]:
# [Name, type, reference]
DIC_NAME_AN = {
    'BivFlnk' : ['Flanking bivalent TSS/enhancer', 'b', 'Roadmap Consortium 2015', 'bed'],
    'Conserved_LindbladToh' : ['Conserved (Lindblad-Toh)', 'b', 'Ward and Kellis 2012', 'bed'],
    'Conserved_Vertebrate_phastCons46way' : ['Conserved in vertebrates (phastCons)', 'b', 'Siepel et al. 2005', 'bed'],
    'Conserved_Mammal_phastCons46way' : ['Conserved in mammals (phastCons)', 'b', 'Siepel et al. 2005', 'bed'],
    'Conserved_Primate_phastCons46way' : ['Conserved in primates (phastCons)', 'b', 'Siepel et al. 2005', 'bed'],
    'Coding_UCSC' : ['Coding', 'b', 'UCSC', 'bed'],
    'UTR_3_UCSC' : ['3UTR', 'b', 'UCSC', 'bed'],
    'UTR_5_UCSC' : ['5UTR', 'b', 'UCSC', 'bed'],
    'Intron_UCSC' : ['Intron', 'b', 'UCSC', 'bed'],
    'Enhancer_Andersson' : ['FANTOM5 enhancer', 'b', 'Andersson et al. 2014', 'bed'],
    'SuperEnhancer_Hnisz' : ['Super enhancer (Hnisz)', 'b', 'Hnisz et al. 2013', 'bed'],
    'Enhancer_Hoffman' : ['Enhancer', 'b', 'Hoffman et al. 2013', 'bed'],
    'WeakEnhancer_Hoffman' : ['Weak enhancer', 'b', 'Hoffman et al. 2013', 'bed'],
    'Promoter_UCSC' : ['Promoter', 'b', 'UCSC', 'bed'],
    'PromoterFlanking_Hoffman' : ['Promoter flanking', 'b', 'Hoffman et al. 2013', 'bed'],
    'DHS_Trynka' : ['DHS', 'b', 'Trynka et al. 2013', 'bed'],
    'DHS_peaks_Trynka' : ['DHS peaks', 'b', 'Trynka et al. 2013', 'bed'], # In baseline-LF but not in description
    'FetalDHS_Trynka' : ['Fetal DHS', 'b', 'Trynka et al. 2013', 'bed'],
#     'H3K27ac_Hnisz' : ['H3K27ac (Hnisz)', 'b', 'Hnisz et al. 2013'], 
    'H3K27ac_Hnisz' : ['H3K27ac', 'b', 'Hnisz et al. 2013', 'bed'], # Originally H3K27ac (Hnisz)
    'H3K27ac_PGC2' : ['H3K27ac (PGC2)', 'b', 'PGC 2014', 'bed'],
    'H3K4me1_Trynka' : ['H3K4me1', 'b', 'Trynka et al. 2013', 'bed'],
    'H3K4me1_peaks_Trynka' : ['H3K4me1 peaks', 'b', 'Trynka et al. 2013', 'bed'], # In baseline-LF but not in description
    'H3K4me3_Trynka' : ['H3K4me3', 'b', 'Trynka et al. 2013', 'bed'],
    'H3K4me3_peaks_Trynka' : ['H3K4me3 peaks', 'b', 'Trynka et al. 2013', 'bed'], # In baseline-LF but not in description
    'H3K9ac_Trynka' : ['H3K9ac', 'b', 'Trynka et al. 2013', 'bed'],
    'H3K9ac_peaks_Trynka' : ['H3K9ac peaks', 'b', 'Trynka et al. 2013', 'bed'], # In baseline-LF but not in description
    'TFBS_ENCODE' : ['TFBS', 'b', 'ENCODE', 'bed'],
    'Transcribed_Hoffman' : ['Transcribed', 'b', 'Hoffman et al. 2013', 'bed'],
    'TSS_Hoffman' : ['TSS', 'b', 'Hoffman et al. 2013', 'bed'],
    'CTCF_Hoffman' : ['CTCF', 'b', 'Hoffman et al. 2013', 'bed'],
    'DGF_ENCODE' : ['DGF', 'b', 'ENCODE', 'bed'],
    'Repressed_Hoffman' : ['Repressed', 'b', 'Hoffman et al. 2013', 'bed'],
    'Vahedi_Tcell_SE' : ['Super enhancer (Vahedi)', 'b', 'Vahedi et al. 2015', 'bed'],
    'Vahedi_Tcell_TE' : ['Typical enhancer', 'b', 'Vahedi et al. 2015', 'bed'],
    'GerpRS_g4' : ['Conserved (GERP RS >= 4)', 'b', 'Davydov et al. 2010', 'CADD'], # from CADD
    'GerpN' : ['Conserved (GERP NS)', 'c', 'Davydov et al. 2010', 'CADD'], # from CADD
    'alleleage' : ['MAF-adjusted predicted allele age', 'c', 'Rasmussen et al. 2014', 'bed'],
    'LLD_AFR' : ['MAF-adjusted LLD-AFR', 'c', 'Gazal et al. 2017', '1000G'],
    'nucleotide_div' : ['Nucleotide diversity', 'c', '', 'recompute'],
    'recomb_rate' : ['Recombination rate', 'c', 'HapMap2 map', 'recompute'],
    'Backgrd_Selection_Stat' : ['McVicker B statistics', 'c', 'McVicker et al. 2009', 'bed'],
    'CpG' : ['CpG content', 'c', '', 'CADD'], # from CADD
    # Non-synonymous removed
    # Synonymous removed
    'CADD_g20' : ['Deleterious (CADD >= 20)', 'b', 'Kircher et al. 2014', 'CADD'],
    'snpeff_nonsynonymous_variant' : ['Non-synonymous', 'b', 'Cingolani et al. 2012', 'SnpEff'],
    'snpeff_synonymous_variant' : ['Synonymous', 'b', 'Cingolani et al. 2012', 'SnpEff'],
}

In [16]:
def get_type(line):
    if line == 'b':
        return 'Binary'
    if line == 'c':
        return 'Continuous'

df_info = pd.DataFrame(index=DIC_NAME_AN)
df_info['Name'] = [DIC_NAME_AN[x][0] for x in df_info.index]
df_info['Identifier'] = df_info.index
df_info['Type'] = [get_type(DIC_NAME_AN[x][1]) for x in df_info.index]
df_info['n_snp_common'] = 0
df_info['n_snp_lf'] = 0
df_info['Reference/dataset'] = [DIC_NAME_AN[x][2] for x in df_info.index]
df_info['Source'] = [DIC_NAME_AN[x][3] for x in df_info.index]

# n_snp
temp_df = pd.read_csv(
    '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/ldspec_res_092223.prox_gene_fct_all_ld/'
    'blood_PLATELET_COUNT.tau.tsv', sep='\t', index_col=0
)
for row in df_info.index:
    if df_info.loc[row, 'Type'] == 'Binary':
        df_info.loc[row, 'n_snp_common'] = temp_df.loc['AN:%s_common'%row, 'n_snp']
        df_info.loc[row, 'n_snp_lf'] = temp_df.loc['AN:%s_lf'%row, 'n_snp']
df_info['n_snp_common'] = df_info['n_snp_common'].astype(int)
df_info['n_snp_lf'] = df_info['n_snp_lf'].astype(int)

# Version of the baseline model
df_ldsc = pd.read_excel(
    '/n/groups/price/martin/LDSPEC_data/results/Gazal_NG_2018_table.xlsx', sheet_name='S1', skiprows=1
)
temp_dic = {x:y for x,y in zip(df_ldsc['Main annotation'], df_ldsc['Version of the baseline model'])}
for x in temp_dic:
    if temp_dic[x]=='New annotation':
        temp_dic[x] = 'Gazal et al. 2018'
for x in ['Flanking bivalent TSS/enhancer', 'FANTOM5 enhancer', 'Super enhancer (Hnisz)', 'Weak enhancer', 
          'Promoter flanking', 'DHS peaks', 'H3K27ac', 'H3K4me1 peaks', 'H3K4me3 peaks', 'H3K9ac peaks']:
    temp_dic[x] = 'Finucane et al. 2015'
for x in ['Super enhancer (Vahedi)', 'Typical enhancer']:
    temp_dic[x] = 'Liu et al. 2016'
for x in ['Recombination rate', 'CpG content']:
    temp_dic[x] = 'Gazal et al. 2017'
for x in ['Conserved in vertebrates (phastCons)', 'Conserved in mammals (phastCons)', 
          'Conserved in primates (phastCons)']:
    temp_dic[x] = 'Gazal et al. 2018'
df_info['Version of the baseline model'] = [
    temp_dic[x] if x in temp_dic else 'New annotation' for x in df_info['Name']
]

df_info.to_csv('/n/groups/price/martin/LDSPEC_data/results/tables/annot_info.tsv', sep='\t', index=False)
print('n_annot=%d' % df_info.shape[0])
for src in set(df_info['Source']):
    print('%-10s n_annot=%d: %s' % (
        src, (df_info['Source']==src).sum(), ', '.join(df_info.loc[df_info['Source']==src, 'Name'])
    ))
display(df_info)

n_annot=45
bed        n_annot=36: Flanking bivalent TSS/enhancer, Conserved (Lindblad-Toh), Conserved in vertebrates (phastCons), Conserved in mammals (phastCons), Conserved in primates (phastCons), Coding, 3UTR, 5UTR, Intron, FANTOM5 enhancer, Super enhancer (Hnisz), Enhancer, Weak enhancer, Promoter, Promoter flanking, DHS, DHS peaks, Fetal DHS, H3K27ac, H3K27ac (PGC2), H3K4me1, H3K4me1 peaks, H3K4me3, H3K4me3 peaks, H3K9ac, H3K9ac peaks, TFBS, Transcribed, TSS, CTCF, DGF, Repressed, Super enhancer (Vahedi), Typical enhancer, MAF-adjusted predicted allele age, McVicker B statistics
recompute  n_annot=2: Nucleotide diversity, Recombination rate
1000G      n_annot=1: MAF-adjusted LLD-AFR
CADD       n_annot=4: Conserved (GERP RS >= 4), Conserved (GERP NS), CpG content, Deleterious (CADD >= 20)
SnpEff     n_annot=2: Non-synonymous, Synonymous


Unnamed: 0,Name,Identifier,Type,n_snp_common,n_snp_lf,Reference/dataset,Source,Version of the baseline model
BivFlnk,Flanking bivalent TSS/enhancer,BivFlnk,Binary,85347,60877,Roadmap Consortium 2015,bed,Finucane et al. 2015
Conserved_LindbladToh,Conserved (Lindblad-Toh),Conserved_LindbladToh,Binary,157154,125931,Ward and Kellis 2012,bed,Finucane et al. 2015
Conserved_Vertebrate_phastCons46way,Conserved in vertebrates (phastCons),Conserved_Vertebrate_phastCons46way,Binary,186689,147685,Siepel et al. 2005,bed,Gazal et al. 2018
Conserved_Mammal_phastCons46way,Conserved in mammals (phastCons),Conserved_Mammal_phastCons46way,Binary,136807,112826,Siepel et al. 2005,bed,Gazal et al. 2018
Conserved_Primate_phastCons46way,Conserved in primates (phastCons),Conserved_Primate_phastCons46way,Binary,124082,105972,Siepel et al. 2005,bed,Gazal et al. 2018
Coding_UCSC,Coding,Coding_UCSC,Binary,90901,73406,UCSC,bed,Finucane et al. 2015
UTR_3_UCSC,3UTR,UTR_3_UCSC,Binary,73957,52747,UCSC,bed,Finucane et al. 2015
UTR_5_UCSC,5UTR,UTR_5_UCSC,Binary,34355,25514,UCSC,bed,Finucane et al. 2015
Intron_UCSC,Intron,Intron_UCSC,Binary,2545824,1647772,UCSC,bed,Finucane et al. 2015
Enhancer_Andersson,FANTOM5 enhancer,Enhancer_Andersson,Binary,28257,18461,Andersson et al. 2014,bed,Finucane et al. 2015


### SNP-pair annotation info

In [17]:
DIC_NAME_pAN = {
    # Proximity-based
    'proxy_0_100' : ['Proximal 0-100bp', '0-100bp', 'proximity'],
    'proxy_100_1000' : ['Proximal 100bp-1kb', '100bp-1kb', 'proximity'], 
    'proxy_1000_10000' : ['Proximal 1-10kb', '1-10kb', 'proximity'],
    # Gene-based
    'exon' : ['Same-exon', 'Same exon', 'gene'],
    'exonic_gene' : ['Same-gene exonic', 'Exonic region of same gene', 'gene'],
    'cS2G_promoter' : ['Same-gene promoter', 'Promoter of same gene', 'gene'],
    'protein_domain' : ['Same-protein-domain', 'Same protein domain of same gene', 'gene'],
    'gene' : ['Same-gene', 'Same gene', 'gene'],
    # fct-100
    'H3K27ac_Hnisz_proxy_0_100': ['H3K27ac-100', 'H3K27ac (0-100bp)', 'fct-100'],
    'H3K27ac_PGC2_proxy_0_100': ['H3K27ac (PGC2)-100', 'H3K27ac (PGC2) (0-100bp)', 'fct-100'],
    'H3K4me1_Trynka_proxy_0_100': ['H3K4me1-100', 'H3K4me1 (0-100bp)', 'fct-100'],
    'Intron_UCSC_proxy_0_100': ['Intron-100', 'Intron (0-100bp)', 'fct-100'],
    'Repressed_Hoffman_proxy_0_100': ['Repressed-100', 'Repressed (0-100bp)', 'fct-100'],
    'SuperEnhancer_Hnisz_proxy_0_100' : ['Super enhancer-100', 'Super enhancer (0-100bp)', 'fct-100'],
    'Transcribed_Hoffman_proxy_0_100': ['Transcribed-100', 'Transcribed (0-100bp)', 'fct-100'],
    # fct-1k
    'DGF_ENCODE_proxy_0_1000': ['DGF-1k', 'DGF (0-1kb)', 'fct-1k'],
    'DHS_Trynka_proxy_0_1000': ['DHS-1k', 'DHS (0-1kb)', 'fct-1k'],
    'DHS_peaks_Trynka_proxy_0_1000': ['DHS peaks-1k', 'DHS peaks (0-1kb)', 'fct-1k'],
    'Enhancer_Hoffman_proxy_0_1000': ['Enhancer-1k', 'Enhancer (0-1kb)', 'fct-1k'],
    'FetalDHS_Trynka_proxy_0_1000': ['Fetal DHS-1k', 'Fetal DHS (0-1kb)', 'fct-1k'],
    'H3K27ac_Hnisz_proxy_0_1000': ['H3K27ac-1k', 'H3K27ac (0-1kb)', 'fct-1k'],
    'H3K27ac_PGC2_proxy_0_1000': ['H3K27ac (PGC2)-1k', 'H3K27ac (PGC2) (0-1kb)', 'fct-1k'],
    'H3K4me1_Trynka_proxy_0_1000': ['H3K4me1-1k', 'H3K4me1 (0-1kb)', 'fct-1k'],
    'H3K4me1_peaks_Trynka_proxy_0_1000': ['H3K4me1 peaks-1k', 'H3K4me1 peaks (0-1kb)', 'fct-1k'],
    'H3K4me3_Trynka_proxy_0_1000': ['H3K4me3-1k', 'H3K4me3 (0-1kb)', 'fct-1k'],
    'H3K9ac_Trynka_proxy_0_1000': ['H3K9ac-1k', 'H3K9ac (0-1kb)', 'fct-1k'],
    'Intron_UCSC_proxy_0_1000': ['Intron-1k', 'Intron (0-1kb)', 'fct-1k'],
    'Promoter_UCSC_proxy_0_1000': ['Promoter-1k', 'Promoter (0-1kb)', 'fct-1k'],
    'Repressed_Hoffman_proxy_0_1000': ['Repressed-1k', 'Repressed (0-1kb)', 'fct-1k'],
    'SuperEnhancer_Hnisz_proxy_0_1000': ['Super enhancer-1k', 'Super enhancer (0-1kb)', 'fct-1k'],
    'TFBS_ENCODE_proxy_0_1000': ['TFBS-1k', 'TFBS (0-1kb)', 'fct-1k'],
    'Transcribed_Hoffman_proxy_0_1000': ['Transcribed-1k', 'Transcribed (0-1kb)', 'fct-1k'],
    'Vahedi_Tcell_SE_proxy_0_1000': ['Super enhancer (Vahedi)-1k', 'Super enhancer (Vahedi) (0-1kb)', 'fct-1k'], 
    'Vahedi_Tcell_TE_proxy_0_1000': ['Typical enhancer-1k', 'Typical enhancer (0-1kb)', 'fct-1k'], 
}

In [18]:
df_info = pd.DataFrame(index=DIC_NAME_pAN)
df_info['Name'] = [DIC_NAME_pAN[x][0] for x in df_info.index]
df_info['Identifier'] = df_info.index
df_info['Type'] = [DIC_NAME_pAN[x][2] for x in df_info.index]
df_info['Description'] = [DIC_NAME_pAN[x][1] for x in df_info.index]
df_info['n_pair'] = 0
df_info['avg_dist'] = 0
for lbin in ['n100_p0', 'p0_p100']:
    for mbin in ['common', 'lf']:
        df_info['n_pair_ld_%s_maf_%s_block' % (lbin, mbin)] = 0
        df_info['avg_dist_ld_%s_maf_%s_block' % (lbin, mbin)] = 0
        df_info['avgr_ld_%s_maf_%s_block' % (lbin, mbin)] = 0

# n_pair & avg_dist
STATS_FILE = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/pannot/stats/@.dist.tsv'
for row in df_info.index:
    for lbin in ['n100_p0', 'p0_p100']:
        for mbin in ['common', 'lf']:
            temp_str = '%s_ld_%s_maf_%s_block' % (row, lbin, mbin)
            fpath = STATS_FILE.replace('@', temp_str)
            if os.path.exists(fpath):
                temp_df = pd.read_csv(fpath, sep='\t')
                df_info.loc[row, 'n_pair_ld_%s_maf_%s_block' % (lbin, mbin)] = temp_df['count'].sum()
                temp_df['mid'] = 0.5 * (temp_df['left'] + temp_df['right'])
                temp_df['ct_prop'] = temp_df['count'] / temp_df['count'].sum()
                df_info.loc[row, 'avg_dist_ld_%s_maf_%s_block' % (lbin, mbin)] = \
                    (temp_df['mid'] * temp_df['ct_prop']).sum()
            else:
                print('Missing dist: %s' % fpath)
                
df_info['n_pair'] = df_info[
    ['n_pair_ld_%s_maf_%s_block'%(x,y) for x in ['n100_p0', 'p0_p100'] for y in ['common', 'lf']]
].sum(axis=1)
for lbin in ['n100_p0', 'p0_p100']:
    for mbin in ['common', 'lf']:
        df_info['avg_dist'] += df_info['n_pair_ld_%s_maf_%s_block' % (lbin, mbin)] * \
            df_info['avg_dist_ld_%s_maf_%s_block' % (lbin, mbin)]
df_info['avg_dist'] = df_info['avg_dist'] / df_info['n_pair']
            
# avgr 
AVGR_FILE = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/pannot/main.avgr'
temp_df = pd.read_csv(AVGR_FILE, sep='\t', header=None)
temp_df.index = temp_df[0]
for row in df_info.index:
    for lbin in ['n100_p0', 'p0_p100']:
        for mbin in ['common', 'lf']:
            temp_str = 'pAN:%s_ld_%s_maf_%s_block' % (row, lbin, mbin)
            if temp_str not in temp_df.index:
                print('Missing avgr: %s' % temp_str)
                continue
            df_info.loc[row, 'avgr_ld_%s_maf_%s_block' % (lbin, mbin)] = temp_df.loc[temp_str, 1]
            
df_info.to_csv('/n/groups/price/martin/LDSPEC_data/results/tables/pannot_info.tsv', sep='\t', index=False)
display(df_info)

Unnamed: 0,Name,Identifier,Type,Description,n_pair,avg_dist,n_pair_ld_n100_p0_maf_common_block,avg_dist_ld_n100_p0_maf_common_block,avgr_ld_n100_p0_maf_common_block,n_pair_ld_n100_p0_maf_lf_block,avg_dist_ld_n100_p0_maf_lf_block,avgr_ld_n100_p0_maf_lf_block,n_pair_ld_p0_p100_maf_common_block,avg_dist_ld_p0_p100_maf_common_block,avgr_ld_p0_p100_maf_common_block,n_pair_ld_p0_p100_maf_lf_block,avg_dist_ld_p0_p100_maf_lf_block,avgr_ld_p0_p100_maf_lf_block
proxy_0_100,Proximal 0-100bp,proxy_0_100,proximity,0-100bp,3462411,47.114204,1349654,48.36697,-0.529833,607654,48.052099,-0.158873,1170633,46.370436,0.691593,334470,42.958262,0.530413
proxy_100_1000,Proximal 100bp-1kb,proxy_100_1000,proximity,100bp-1kb,27519642,546.126766,11211713,546.372874,-0.503829,4979020,546.794773,-0.139396,9074666,545.727482,0.643986,2254243,545.034636,0.459026
proxy_1000_10000,Proximal 1-10kb,proxy_1000_10000,proximity,1-10kb,253406873,5449.44062,102791868,5443.433006,-0.434025,47047595,5467.484448,-0.10886,83419977,5451.326182,0.548051,20147433,5430.148905,0.394316
exon,Same-exon,exon,gene,Same exon,805641,3646.400723,299255,3573.231316,-0.42277,182810,3849.917811,-0.062931,243488,3630.006356,0.528913,80088,3505.095957,0.320233
exonic_gene,Same-gene exonic,exonic_gene,gene,Exonic region of same gene,1781611,53996.64957,642276,50143.563262,-0.35992,442400,56670.27105,-0.047069,531101,55201.111017,0.449225,165834,57929.763649,0.266648
cS2G_promoter,Same-gene promoter,cS2G_promoter,gene,Promoter of same gene,1242157,46491.565972,472800,43308.034846,-0.384565,256997,50785.667877,-0.079439,404313,46048.668791,0.49234,108047,51865.813257,0.325346
protein_domain,Same-protein-domain,protein_domain,gene,Same protein domain of same gene,185818,47679.704671,52261,39765.752425,-0.361509,69807,57385.098378,-0.034545,41690,43406.685956,0.494421,22060,43791.61718,0.293549
gene,Same-gene,gene,gene,Same gene,1889012835,390693.959592,714746107,382433.944981,-0.150772,357342324,356852.790462,-0.032653,652746369,403845.250068,0.166108,164178035,448023.420293,0.106039
H3K27ac_Hnisz_proxy_0_100,H3K27ac-100,H3K27ac_Hnisz_proxy_0_100,fct-100,H3K27ac (0-100bp),1364866,46.886599,524564,48.197646,-0.535085,241009,47.775757,-0.165095,462382,46.245096,0.698096,136911,42.464722,0.54051
H3K27ac_PGC2_proxy_0_100,H3K27ac (PGC2)-100,H3K27ac_PGC2_proxy_0_100,fct-100,H3K27ac (PGC2) (0-100bp),922286,46.749896,352525,48.061308,-0.531089,165067,47.627857,-0.162887,311561,46.115006,0.695237,93133,42.353811,0.532742


### Table 1 : example pannots

In [4]:
def n_pair_str(x):
    if x > 1e7: # integer + M
        return '%dM' % (x/1e6)
    else: # two sig. fig. + M
        return '%.2gM' % (x/1e6)
def dist_str(x):
    if x < 1000: # integer + bp
        return '%dbp' % x
    elif x > 1e4: # integer + kb
        return '%dkb' % (x/1e3)
    else: # two sig. figure + kb
        return '%0.2gkb' % (x/1e3)
    
DF_PANNOT = pd.read_csv('/n/groups/price/martin/LDSPEC_data/results/tables/pannot_info.tsv', sep='\t')
print("n_pannot=%d" % DF_PANNOT.shape[0])
DF_PANNOT['n_pair'] = [n_pair_str(x) for x in DF_PANNOT['n_pair']]
DF_PANNOT['avg_dist'] = [dist_str(x) for x in DF_PANNOT['avg_dist']]

DF_PANNOT = DF_PANNOT[['Name', 'n_pair', 'avg_dist']]
DF_PANNOT.columns = ['Name', '# SNP pairs', 'Avg. dist.']
print(DF_PANNOT.to_latex(index=False))

print('100bp pannots n_pannot=%d' % sum([x.endswith('100') for x in DF_PANNOT['Name']]) )
print((', '.join(DF_PANNOT.loc[[x.endswith('100') for x in DF_PANNOT['Name']], 'Name'])))
print('')
print('1kb pannots n_pannot=%d' % sum([x.endswith('1k') for x in DF_PANNOT['Name']]) )
print((', '.join(DF_PANNOT.loc[[x.endswith('1k') for x in DF_PANNOT['Name']], 'Name'])))

n_pannot=34
\begin{tabular}{lll}
\toprule
                       Name & \# SNP pairs & Avg. dist. \\
\midrule
           Proximal 0-100bp &        3.5M &       47bp \\
         Proximal 100bp-1kb &         27M &      546bp \\
            Proximal 1-10kb &        253M &      5.4kb \\
                  Same-exon &       0.81M &      3.6kb \\
           Same-gene exonic &        1.8M &       53kb \\
         Same-gene promoter &        1.2M &       46kb \\
        Same-protein-domain &       0.19M &       47kb \\
                  Same-gene &       1889M &      390kb \\
                H3K27ac-100 &        1.4M &       46bp \\
         H3K27ac (PGC2)-100 &       0.92M &       46bp \\
                H3K4me1-100 &        1.4M &       46bp \\
                 Intron-100 &        1.3M &       46bp \\
              Repressed-100 &        1.6M &       45bp \\
         Super enhancer-100 &       0.61M &       46bp \\
            Transcribed-100 &        1.1M &       44bp \\
                    

### Heritability models

In [20]:
# 165 single-SNP annotations and 62 SNP-pair annotations 
with open(
    '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/'
    'ldspec_res_092223.prox_gene_fct_all_ld/blood_PLATELET_COUNT.pickle', 'br'
) as f:
    temp_df = pickle.load(f)
    
df_annot = temp_df['summary']['tau'][['annot', 'type', 'n_snp']].copy()
temp_list = ['AN:all'] + [x for x in df_annot.index if ('mbin' in x) & ('common' in x)] + \
    [x for x in df_annot.index if ('mbin' in x) & ('lf' in x)]
temp_list += [x for x in df_annot.index if (x not in temp_list) & (df_annot.loc[x, 'type']=='non-binary')]
temp_list += [x for x in df_annot.index if (x not in temp_list)]
df_annot = df_annot.loc[temp_list]

df_annot.loc[df_annot['type']=='non-binary', 'n_snp'] = -1
df_annot['n_snp'] = df_annot['n_snp'].astype(int)
df_annot.to_csv('/n/groups/price/martin/LDSPEC_data/results/tables/baseline_165.tsv', sep='\t', index=False)
display(df_annot)

df_pannot = temp_df['summary']['omega'][['pannot', 'n_pair']].copy()
df_pannot.to_csv('/n/groups/price/martin/LDSPEC_data/results/tables/baseline_sp.tsv', sep='\t', index=False)
display(df_pannot)

Unnamed: 0,annot,type,n_snp
AN:all,AN:all,binary,14820648
AN:mbin0_common,AN:mbin0_common,binary,650304
AN:mbin1_common,AN:mbin1_common,binary,650306
AN:mbin2_common,AN:mbin2_common,binary,650305
AN:mbin3_common,AN:mbin3_common,binary,650302
...,...,...,...
AN:WeakEnhancer_Hoffman_lf,AN:WeakEnhancer_Hoffman_lf,binary,86752
AN:snpeff_nonsynonymous_variant_common,AN:snpeff_nonsynonymous_variant_common,binary,16964
AN:snpeff_nonsynonymous_variant_lf,AN:snpeff_nonsynonymous_variant_lf,binary,22060
AN:snpeff_synonymous_variant_common,AN:snpeff_synonymous_variant_common,binary,14458


Unnamed: 0,pannot,n_pair
pAN:proxy_0_100_ld_p0_p100_maf_common_block,pAN:proxy_0_100_ld_p0_p100_maf_common_block,2343178
pAN:proxy_0_100_ld_p0_p100_maf_lf_block,pAN:proxy_0_100_ld_p0_p100_maf_lf_block,670340
pAN:proxy_100_1000_ld_p0_p100_maf_common_block,pAN:proxy_100_1000_ld_p0_p100_maf_common_block,18149332
pAN:proxy_100_1000_ld_p0_p100_maf_lf_block,pAN:proxy_100_1000_ld_p0_p100_maf_lf_block,4508486
pAN:proxy_1000_10000_ld_p0_p100_maf_common_block,pAN:proxy_1000_10000_ld_p0_p100_maf_common_block,166839954
...,...,...
pAN:Transcribed_Hoffman_proxy_0_1000_ld_n100_p0_maf_lf_block,pAN:Transcribed_Hoffman_proxy_0_1000_ld_n100_p...,2683116
pAN:Vahedi_Tcell_SE_proxy_0_1000_ld_n100_p0_maf_common_block,pAN:Vahedi_Tcell_SE_proxy_0_1000_ld_n100_p0_ma...,450748
pAN:Vahedi_Tcell_SE_proxy_0_1000_ld_n100_p0_maf_lf_block,pAN:Vahedi_Tcell_SE_proxy_0_1000_ld_n100_p0_ma...,247626
pAN:Vahedi_Tcell_TE_proxy_0_1000_ld_n100_p0_maf_common_block,pAN:Vahedi_Tcell_TE_proxy_0_1000_ld_n100_p0_ma...,525678


### names.txt

In [4]:
# DF_TRAIT, DF_ANNOT, DF_PANNOT, DF_DIST
DF_TRAIT = pd.read_csv('/n/groups/price/martin/LDSPEC_data/results/tables/trait_info.tsv', sep='\t')
DF_TRAIT.index = DF_TRAIT['Trait_Identifier']
TRAIT_LIST = list(DF_TRAIT.index)
print("n_trait=%d" % DF_TRAIT.shape[0])

DF_ANNOT = pd.read_csv('/n/groups/price/martin/LDSPEC_data/results/tables/annot_info.tsv', sep='\t')
DF_ANNOT.index = DF_ANNOT['Identifier']
print("n_annot=%d" % DF_ANNOT.shape[0])

DF_PANNOT = pd.read_csv('/n/groups/price/martin/LDSPEC_data/results/tables/pannot_info.tsv', sep='\t')
DF_PANNOT.index = DF_PANNOT['Identifier']
print("n_pannot=%d" % DF_PANNOT.shape[0])

n_trait=70
n_annot=45
n_pannot=34


In [5]:
# DIC_NAME
DIC_NAME = {
    'ldsc' : 'S-LDSC',
    'md_bsl' : 'baseline',
    'md_prox_gene_fct_all_ld' : 'baseline-SP',
    'md_prox_all_ld' : 'baseline-SP-proximity',
    'md_gene_all_ld' : 'baseline-SP-gene',
    'md_fct_all_ld' : 'baseline-SP-functional',
    'common' : 'common',
    'common_short' : 'C',
    'common_common' : 'common',
    'common_common_short' : 'C',
    'common_block' : 'common',
    'common_block_short' : 'C',
    'lf' : 'low-freq',
    'lf_short' : 'LF',
    'lf_lf' : 'low-freq',
    'lf_lf_short' : 'LF',
    'lf_block' : 'low-freq',
    'lf_block_short' : 'LF',
    'h2' : 'heritability',
    'scv' : 'sum of causal effect size variance',
    'h2p' : 'SNP-pair heritability',
    'h2_enrich' : 'heritability enrichment',
    'scv_enrich' : 'SCV enrichment',
    'h2_shrink' : 'heritability shrinkage',
    'h2_enrich_shrink' : 'heritability enrichment shrinkage',
    'tau' : 'tau',
    'omega' : 'omega',
    'cov' : 'SNP-pair effect covariance',
    'cor' : 'SNP-pair effect correlation',
    'ecov' : 'excess SNP-pair effect covariance',
    'ecor' : 'excess SNP-pair effect correlation',
    'loglss' : '$logl_{ss}$',
    'sqe' : 'SSE', 
    'abe' : 'SAE',
}

DIC_NAME.update({x : DF_TRAIT.loc[x, 'Trait Name'] for x in TRAIT_LIST})
DIC_NAME.update({x : DF_ANNOT.loc[x, 'Name'] for x in DF_ANNOT.index})
DIC_NAME.update({'%s.flanking'%x: '%s + 500bp'%DF_ANNOT.loc[x, 'Name'] for x in DF_ANNOT.index})
DIC_NAME.update({x : DF_PANNOT.loc[x, 'Name'] for x in DF_PANNOT.index})
DIC_NAME.update({'AN:mbin%d_common'%x : 'Common-mbin %d'%x for x in range(10)})
DIC_NAME.update({'AN:mbin%d_lf'%x : 'LF-mbin %d'%x for x in range(5)})
DIC_NAME.update({'AN:all' : 'All SNPs'})
for mbin in ['common', 'lf']:
    DIC_NAME.update({'AN:%s_%s'%(x,mbin): '%s (%s)'%(DF_ANNOT.loc[x, 'Name'], DIC_NAME['%s_short'%mbin])
                     for x in DF_ANNOT.index})
temp_dic = {
    ('p0_p100', 'common') : 'common pos-LD',
    ('p0_p100', 'lf') : 'low-freq pos-LD',
    ('n100_p0', 'common') : 'common neg-LD',
    ('n100_p0', 'lf') : 'low-freq neg-LD',
}
for lbin in ['n100_p0', 'p0_p100']:
    for mbin in ['lf', 'common']:
        DIC_NAME.update({
            'pAN:%s_ld_%s_maf_%s_block'%(x,lbin,mbin) : 
            '%s (%s)'%(DF_PANNOT.loc[x, 'Name'], temp_dic[(lbin, mbin)])
            for x in DF_PANNOT.index
        })
        
with open('../names.tsv', 'w') as f:
    for term in DIC_NAME:
        f.write('%s\t%s\n'%(term, DIC_NAME[term]))