In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import time
import os
import gdreg
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from statsmodels.stats.multitest import multipletests


# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Trait info

In [117]:
# Trait identifier -> display name.
# Insertion order is significant: it becomes the row order of the trait table
# built from this dict.
DIC_NAME = {
    # Blood cell traits
    'blood_PLATELET_COUNT': 'Platelet Count',
    'blood_RBC_DISTRIB_WIDTH': 'Red Blood Cell Distribution Width',
    'blood_RED_COUNT': 'Red Blood Cell Count',
    'blood_WHITE_COUNT': 'White Blood Cell Count',
    # Anthropometric / physiological traits
    'bmd_HEEL_TSCOREz': 'Heel T Score',
    'body_BALDING1': 'Balding Type I',
    'body_BMIz': 'BMI',
    'body_HEIGHTz': 'Height',
    'body_WHRadjBMIz': 'Waist-hip Ratio',
    'bp_DIASTOLICadjMEDz': 'Diastolic Blood Pressure',
    # Other traits
    'cov_EDU_YEARS': 'Years of Education',
    'disease_ALLERGY_ECZEMA_DIAGNOSED': 'Eczema',
    'lung_FEV1FVCzSMOKE': 'FEV1-FVC Ratio',
    'lung_FVCzSMOKE': 'Forced Vital Capacity (FVC)',
    'mental_NEUROTICISM': 'Neuroticism',
    'other_MORNINGPERSON': 'Morning Person',
    'pigment_SUNBURN': 'Sunburn Occasion',
    'repro_MENARCHE_AGE': 'Age at Menarche',
    'repro_MENOPAUSE_AGE': 'Age at Menopause',
    'repro_NumberChildrenEverBorn_Pooled': 'Number Children (Pooled)',
    # Diseases
    'cancer_BREAST': 'Breast Cancer',
    'cancer_PROSTATE': 'Prostate Cancer',
    'disease_HYPOTHYROIDISM_SELF_REP': 'Hypothyroidism',
    # Biochemistry markers
    'biochemistry_AlkalinePhosphatase': 'Alkaline Phosphatase',
    'biochemistry_AspartateAminotransferase': 'Aspartate Aminotransferase',
    'biochemistry_Cholesterol': 'Cholesterol',
    'biochemistry_Creatinine': 'Creatinine',
    'biochemistry_IGF1': 'IGF1',
    'biochemistry_Phosphate': 'Phosphate',
    'biochemistry_TotalBilirubin': 'Total Bilirubin',
    'biochemistry_TotalProtein': 'Total Protein',
    'biochemistry_VitaminD': 'Vitamin D',
}

In [118]:
# Build the trait summary table: one row per key of DIC_NAME, in dict order.
trait_ids = list(DIC_NAME)
df_info = pd.DataFrame(
    {
        'Trait Name': [DIC_NAME[trait] for trait in trait_ids],
        'Trait_Identifier': trait_ids,
    },
    index=trait_ids,
)

# '@' in the template is replaced by the trait identifier.
sumstats_file = '/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/UKBB_trait/@.sumstats.gz'

# Sample size per trait = median of the 'N' column over the first 10 rows
# of that trait's summary-statistics file (only those rows are read).
median_n = []
for trait in trait_ids:
    sumstats_head = pd.read_csv(sumstats_file.replace('@', trait), nrows=10, sep='\t')
    median_n.append(sumstats_head['N'].median())
df_info['N.ukbimp'] = pd.Series(median_n, index=df_info.index).astype(int)

# The index is not written; the identifier is kept via 'Trait_Identifier'.
df_info.to_csv('/n/groups/price/martin/data_GDREG/results/tables/trait_info.tsv', sep='\t', index=False)

### Single-SNP annotation info

In [2]:
# Single-SNP annotation identifier -> [Name, type, reference]
#   Name      : display name written to the annotation table. It is also the key
#               used to look up the baseline-model version in the Gazal et al.
#               table ('Main annotation' column), so names must be unique.
#   type      : 'b' (binary) or 'c' (continuous); expanded by get_type().
#   reference : citation / dataset string ('' when none is recorded here).
DIC_NAME_AN = {
    # Fixed: this entry previously reused the 'Conserved (phastCons - Primate)' label
    # (copy-paste from the phastCons row below), which duplicated a Name and made the
    # version lookup return the primate-conservation row for BivFlnk.
    # NOTE(review): label taken from the Roadmap chromHMM state name for BivFlnk;
    # confirm it matches the 'Main annotation' spelling in the Gazal et al. table.
    'BivFlnk' : ['Flanking Bivalent TSS/Enh', 'b', 'Roadmap Consortium 2015'],
    'Conserved_LindbladToh' : ['Conserved (Lindblad-Toh)', 'b', 'Ward and Kellis 2012'],
    'Conserved_Vertebrate_phastCons46way' : ['Conserved (phastCons - Vertebrate)', 'b', 'Siepel et al. 2005'],
    'Conserved_Mammal_phastCons46way' : ['Conserved (phastCons - Mammal)', 'b', 'Siepel et al. 2005'],
    'Conserved_Primate_phastCons46way' : ['Conserved (phastCons - Primate)', 'b', 'Siepel et al. 2005'],
    'Coding_UCSC' : ['Coding', 'b', 'UCSC'],
    'UTR_3_UCSC' : ['3UTR', 'b', 'UCSC'],
    'UTR_5_UCSC' : ['5UTR', 'b', 'UCSC'],
    'Intron_UCSC' : ['Intron', 'b', 'UCSC'],
    'Enhancer_Andersson' : ['FANTOM5 Enhancer', 'b', 'Andersson et al. 2014'],
    'SuperEnhancer_Hnisz' : ['Super Enhancer (Hnisz)', 'b', 'Hnisz et al. 2013'],
    'Enhancer_Hoffman' : ['Enhancer', 'b', 'Hoffman et al. 2013'],
    'WeakEnhancer_Hoffman' : ['Weak Enhancer', 'b', 'Hoffman et al. 2013'],
    'Promoter_UCSC' : ['Promoter', 'b', 'UCSC'],
    'PromoterFlanking_Hoffman' : ['Promoter Flanking', 'b', 'Hoffman et al. 2013'],
    'DHS_Trynka' : ['DHS', 'b', 'Trynka et al. 2013'],
    'FetalDHS_Trynka' : ['Fetal DHS', 'b', 'Trynka et al. 2013'],
    'H3K27ac_Hnisz' : ['H3K27ac (Hnisz)', 'b', 'Hnisz et al. 2013'],
    'H3K27ac_PGC2' : ['H3K27ac (PGC2)', 'b', 'PGC 2014'],
    'H3K4me1_Trynka' : ['H3K4me1', 'b', 'Trynka et al. 2013'],
    'H3K4me3_Trynka' : ['H3K4me3', 'b', 'Trynka et al. 2013'],
    'H3K9ac_Trynka' : ['H3K9ac', 'b', 'Trynka et al. 2013'],
    'TFBS_ENCODE' : ['TFBS', 'b', 'ENCODE'],
    'Transcribed_Hoffman' : ['Transcribed', 'b', 'Hoffman et al. 2013'],
    'TSS_Hoffman' : ['TSS', 'b', 'Hoffman et al. 2013'],
    'CTCF_Hoffman' : ['CTCF', 'b', 'Hoffman et al. 2013'],
    'DGF_ENCODE' : ['DGF', 'b', 'ENCODE'],
    'Repressed_Hoffman' : ['Repressed', 'b', 'Hoffman et al. 2013'],
    'Vahedi_Tcell_SE' : ['Super Enhancer (Vahedi)', 'b', 'Vahedi et al. 2015'],
    'Vahedi_Tcell_TE' : ['Typical Enhancer', 'b', 'Vahedi et al. 2015'],
    'GerpRS_g4' : ['Conserved (GERP RS >= 4)', 'b', 'Davydov et al. 2010'], # from CADD
    'GerpN' : ['Conserved (GERP NS)', 'c', 'Davydov et al. 2010'], # from CADD
    'alleleage' : ['MAF-adjusted predicted allele age', 'c', 'Rasmussen et al. 2014'],
    # MAF-adjusted LLD-AFR removed
    'nucleotide_div' : ['Nucleotide diversity', 'c', ''],
    'recomb_rate' : ['Recombination Rate', 'c', 'HapMap2 map'],
    'Backgrd_Selection_Stat' : ['McVicker B statistics', 'c', 'McVicker et al. 2009'],
    'CpG' : ['CpG-Content', 'c', ''], # from CADD
    # Non-synonymous removed
    # Synonymous removed
    'CADD_g20' : ['Deleterious (CADD >= 20)', 'b', 'Kircher et al. 2014'],
    'snpeff_downstream_gene_variant' : ['Downstream Gene', 'b', 'Cingolani et al. 2012'],
    'snpeff_non_coding_transcript_exon_variant' : ['Non-coding Exon', 'b', 'Cingolani et al. 2012'],
    'snpeff_nonsynonymous_variant' : ['Non-synonymous', 'b', 'Cingolani et al. 2012'],
    'snpeff_splice_region_variant' : ['Splice Region', 'b', 'Cingolani et al. 2012'],
    'snpeff_synonymous_variant' : ['Synonymous', 'b', 'Cingolani et al. 2012'],
    'snpeff_upstream_gene_variant' : ['Upstream Gene', 'b', 'Cingolani et al. 2012'],
}

In [4]:
def get_type(line):
    """Expand a one-letter annotation type code into its display label.

    Parameters
    ----------
    line : str
        Type code: 'b' or 'c'.

    Returns
    -------
    str or None
        'Binary' for 'b', 'Continuous' for 'c', and None for any other value.
    """
    for code, label in (('b', 'Binary'), ('c', 'Continuous')):
        if line == code:
            return label
    return None

# Build the single-SNP annotation table: one row per key of DIC_NAME_AN, in dict order.
annot_ids = list(DIC_NAME_AN)
df_info = pd.DataFrame(
    {
        'Name': [DIC_NAME_AN[annot][0] for annot in annot_ids],
        'Identifier': annot_ids,
        'Type': [get_type(DIC_NAME_AN[annot][1]) for annot in annot_ids],
        'Reference/dataset': [DIC_NAME_AN[annot][2] for annot in annot_ids],
    },
    index=annot_ids,
)

# Version of the baseline model, read from sheet 'S1' of the Gazal et al. table
# and indexed by its 'Main annotation' column (the column itself is kept).
version_col = 'Version of the baseline model'
df_ldsc = pd.read_excel(
    '/n/groups/price/martin/data_GDREG/results/Gazal_NG_2018_table.xlsx',
    sheet_name='S1',
    skiprows=1,
).set_index('Main annotation', drop=False)

# Rows the spreadsheet marks as 'New annotation' are relabelled with the paper,
# so that 'New annotation' below only flags annotations absent from that table.
is_new_in_table = df_ldsc[version_col] == 'New annotation'
df_ldsc.loc[is_new_in_table, version_col] = 'Gazal et al. 2018'

# Match on display name. 'Non-synonymous' and 'Synonymous' are always treated
# as new, even when the spreadsheet lists an annotation with the same name.
always_new = ['Non-synonymous', 'Synonymous']
baseline_versions = []
for annot_name in df_info['Name']:
    if annot_name in df_ldsc.index and annot_name not in always_new:
        baseline_versions.append(df_ldsc.loc[annot_name, version_col])
    else:
        baseline_versions.append('New annotation')
df_info[version_col] = baseline_versions

df_info.to_csv('/n/groups/price/martin/data_GDREG/results/tables/annot_info.tsv', sep='\t', index=False)

### SNP-pair annotation info

In [3]:
# SNP-pair annotation identifier -> [Name, Description].
# Both strings are written verbatim to the SNP-pair annotation table, so the
# descriptions are kept grammatically consistent ("All pairs of ...").
DIC_NAME_pAN = {
    'proxy_0_100' : ['Proximal (0-0.1 kb)', 'All pairs of SNPs within 100 bp'],
    'proxy_100_1000' : ['Proximal (0.1-1 kb)', 'All pairs of SNPs between 100 bp and 1 kb'],
    'proxy_1000_10000' : ['Proximal (1-10 kb)', 'All pairs of SNPs between 1 kb and 10 kb'],
    'ldp5_proxy_10000' : ['High-LD', 'All pairs of SNPs with LD >= 0.5 and dist < 10 kb'],
    'exon' : ['Exon', 'All pairs of SNPs in the same exon'],
    'gene' : ['Gene', 'All pairs of SNPs in the same gene'],
    # Fixed typo: "All pair of" -> "All pairs of" (this entry and the next).
    'exonic_gene' : ['Exonic-gene', 'All pairs of exonic SNPs in the same gene body'],
    'protein_domain' : ['Protein-domain', 'All pairs of SNPs in the same protein domain and same gene'],
    'cS2G_all' : ['cS2G-all', 'All pairs of SNPs linked to the same gene by cS2G'],
    'cS2G_promoter' : ['cS2G-promoter', 'All pairs of promoter SNPs linked to the same gene by cS2G'],
    # Fixed: stray double space after "cS2G,".
    'cS2G_other' : [
        'cS2G-other',
        'All pairs of other SNPs linked to the same gene by cS2G, excluding exonic or promoter SNPs'],
}

In [4]:
# Build the SNP-pair annotation table: one row per key of DIC_NAME_pAN, in dict order.
pannot_ids = list(DIC_NAME_pAN)
df_info = pd.DataFrame(
    {
        'Name': [DIC_NAME_pAN[pannot][0] for pannot in pannot_ids],
        'Identifier': pannot_ids,
        'Description': [DIC_NAME_pAN[pannot][1] for pannot in pannot_ids],
    },
    index=pannot_ids,
)

# The index is not written; the identifier is kept via the 'Identifier' column.
df_info.to_csv('/n/groups/price/martin/data_GDREG/results/tables/pannot_info.tsv', sep='\t', index=False)