In [2]:
import pandas as pd

### Gencode exon & gene

In [14]:
# Raw GENCODE annotation table. The later cells read `df_gencode` (filtering
# on column 2 and parsing column 8), so it has to be loaded here; with this
# read commented out, a fresh kernel fails with a NameError further down.
# header=None keeps integer column labels, which the later cells index by
# position. skiprows=5 skips the first five lines of the file (presumably the
# GTF header comment lines -- TODO confirm if the input file changes).
df_gencode = pd.read_csv(
    "/n/groups/price/martin/data_GDREG/gene_annotation/gencode.v41lift37.basic.annotation.gtf.gz",
    skiprows=5, sep='\t', header=None
)

# Gene list whose 'ENSG' column is used later to restrict genes and exons.
df_gene_cs2g = pd.read_csv(
    "/n/groups/price/martin/data_GDREG/gene_annotation/cS2G/list_genes_qc.txt.gz", sep='\t'
)

In [42]:
def get_gene_info(line, term='gene_id'):
    """Extract the value of one attribute from a ';'-separated attribute string.

    Each field is expected to look like `key "value"`. The key must match
    `term` exactly (not merely start with it), and only the key is removed,
    so a value that happens to contain the key text is returned intact.

    Parameters
    ----------
    line : str
        Attribute string, e.g. 'gene_id "X"; gene_type "Y"; gene_name "Z";'.
    term : str
        Attribute key to look up; one of ['gene_id', 'gene_type', 'gene_name'].

    Returns
    -------
    str or None
        The value with double quotes and surrounding whitespace removed, or
        None when no field has the key `term`.
    """
    for field in line.split(';'):
        # Split each field once on whitespace: first token is the key,
        # the remainder (if any) is the quoted value.
        parts = field.strip().split(None, 1)
        if not parts or parts[0] != term:
            continue
        value = parts[1] if len(parts) > 1 else ''
        return value.replace('"', '').strip()
    return None

# Keep the rows whose feature column (column 2) is 'gene' and parse the
# attribute string (column 8) into explicit columns.
df_gene = df_gencode.loc[df_gencode[2]=='gene'].copy()
# Only the gene ID is cut at the first '.', which drops its version suffix.
# GENE_TYPE and GENE_NAME are kept verbatim, as in the exon table: cutting
# them at '.' would truncate any gene name that itself contains a dot.
df_gene['ENSG'] = [get_gene_info(x, term='gene_id').split('.')[0] for x in df_gene[8]]
df_gene['GENE_TYPE'] = [get_gene_info(x, term='gene_type') for x in df_gene[8]]
df_gene['GENE_NAME'] = [get_gene_info(x, term='gene_name') for x in df_gene[8]]
print("Get gene annotations", df_gene.shape[0])

# Drop records on chrX, chrY and chrM, then expose the coordinate columns
# under readable names.
df_gene = df_gene.loc[~df_gene[0].isin(['chrX', 'chrY', 'chrM'])].copy()
df_gene['CHR'] = df_gene[0]
df_gene['START'] = df_gene[3]
df_gene['END'] = df_gene[4]
print("Retain annots in CHRs 1-22", df_gene.shape[0])

# Reassign rather than mutate in place so re-running the cell is predictable.
df_gene = df_gene.drop_duplicates(
    subset=['CHR', 'START', 'END', 'ENSG', 'GENE_TYPE', 'GENE_NAME']
)
print("Drop duplicates", df_gene.shape[0])

# Restrict to the genes listed in the 'ENSG' column of df_gene_cs2g.
gene_set = set(df_gene_cs2g['ENSG'])
df_gene = df_gene.loc[df_gene['ENSG'].isin(gene_set)]
print("Restrict to Gazel et al. 19,995 genes: ", df_gene.shape[0])

# Write only the named columns as a tab-separated file.
df_gene = df_gene[['CHR', 'START', 'END', 'ENSG', 'GENE_TYPE', 'GENE_NAME']].copy()
df_gene.to_csv('/n/groups/price/martin/data_GDREG/gene_annotation/ENSG_gene_annot_v41.txt',
               sep='\t', index=False)

Get gene annotations 63678
Retain annots in CHRs 1-22 60472
Drop duplicates 60472
Restrict to Gazel et al. 19,995 genes:  19606


In [47]:
def get_exon_info(line, term='exon_id'):
    """Extract the value of one attribute from a ';'-separated attribute string.

    Each field is expected to look like `key "value"`. The key must match
    `term` exactly (not merely start with it), and only the key is removed,
    so a value that happens to contain the key text is returned intact.

    Parameters
    ----------
    line : str
        Attribute string, e.g. 'gene_id "X"; exon_id "Y"; gene_name "Z";'.
    term : str
        Attribute key to look up; one of
        ['exon_id', 'gene_id', 'gene_type', 'gene_name'].

    Returns
    -------
    str or None
        The value with double quotes and surrounding whitespace removed, or
        None when no field has the key `term`.
    """
    for field in line.split(';'):
        # Split each field once on whitespace: first token is the key,
        # the remainder (if any) is the quoted value.
        parts = field.strip().split(None, 1)
        if not parts or parts[0] != term:
            continue
        value = parts[1] if len(parts) > 1 else ''
        return value.replace('"', '').strip()
    return None

# Select the rows whose feature column (column 2) is 'exon' and parse the
# attribute string (column 8). The two IDs are cut at the first '.'; the gene
# type and gene name are stored exactly as parsed.
df_exon = df_gencode.loc[df_gencode[2] == 'exon'].copy()
df_exon['ENSE'] = [get_exon_info(attr, term='exon_id').split('.')[0] for attr in df_exon[8]]
df_exon['ENSG'] = [get_exon_info(attr, term='gene_id').split('.')[0] for attr in df_exon[8]]
df_exon['GENE_TYPE'] = [get_exon_info(attr, term='gene_type') for attr in df_exon[8]]
df_exon['GENE_NAME'] = [get_exon_info(attr, term='gene_name') for attr in df_exon[8]]
print("Get exon annotations", df_exon.shape[0])

# Remove chrX / chrY / chrM records and add named coordinate columns.
excluded_chroms = ['chrX', 'chrY', 'chrM']
df_exon = df_exon.loc[~df_exon[0].isin(excluded_chroms)].copy()
df_exon = df_exon.assign(CHR=df_exon[0], START=df_exon[3], END=df_exon[4])
print("Retain annots in CHRs 1-22", df_exon.shape[0])

# The same column list defines both the de-duplication key and the output.
exon_columns = ['CHR', 'START', 'END', 'ENSE', 'ENSG', 'GENE_TYPE', 'GENE_NAME']
df_exon = df_exon.drop_duplicates(subset=exon_columns)
print("Drop duplicates", df_exon.shape[0])

# Keep exons whose gene ID appears in the 'ENSG' column of df_gene_cs2g.
gene_set = set(df_gene_cs2g['ENSG'])
in_gene_set = df_exon['ENSG'].isin(gene_set)
df_exon = df_exon.loc[in_gene_set]
print("Restrict exons to Gazel et al. 19,995 genes: ", df_exon.shape[0])

# Write the named columns as a tab-separated file without the index.
df_exon = df_exon[exon_columns].copy()
df_exon.to_csv(
    '/n/groups/price/martin/data_GDREG/gene_annotation/ENSE_exon_annot_v41.txt',
    sep='\t',
    index=False,
)

Get exon annotations 840632
Retain annots in CHRs 1-22 805376
Drop duplicates 411792
Restrict exons to Gazel et al. 19,995 genes:  297521


### cS2G

In [5]:
# Path template for the per-chromosome score files; '@' is replaced by the
# chromosome number before reading.
ANNOT_FILE = '/n/groups/price/martin/data_GDREG/gene_annotation/cS2G/cS2G_UKBB/cS2G.@.SGscore.gz'

# Only chromosome 22 is read here; df_annot holds the table of the last
# chromosome in the list.
for CHR in [22]:
    df_annot = pd.read_csv(
        ANNOT_FILE.replace('@', str(CHR)),
        sep='\t',
    )

In [9]:
# Display the annotation table loaded above for a quick visual check.
df_annot

Unnamed: 0,SNP,GENE,cS2G,INFO
0,22:16258211_A_G,POTEH,1.0,|Exon=1
1,22:16258290_CATT_C,POTEH,1.0,|Exon=1
2,22:16258294_T_G,POTEH,1.0,|Exon=1
3,22:16266920_C_CAG,POTEH,1.0,|Exon=1
4,22:16279319_GCAAAAACGTATGTAATT_G,POTEH,1.0,|Exon=1
...,...,...,...,...
113011,22:51239281_G_C,RPL23AP82,1.0,|Exon=1
113012,22:51239296_T_C,RPL23AP82,1.0,|Exon=1
113013,22:51239304_C_T,RPL23AP82,1.0,|Exon=1
113014,22:51239586_T_G,RPL23AP82,1.0,|Exon=1


In [8]:
# List the distinct INFO strings present in the loaded annotation table.
df_annot['INFO'].unique()

array(['|Exon=1', '|Promoter=1', '|GTeX_Finemapped=1', '|ABC=1',
       '|EpiMap=1', '|GTeX_Finemapped=1|eQTLGen_Finemapped=1',
       '|Promoter=1|ABC=1|Cicero=1',
       '|Promoter=1|GTeX_Finemapped=1|ABC=1|Cicero=1',
       '|Promoter=1|ABC=1', '|ABC=1|Cicero=1',
       '|GTeX_Finemapped=1|ABC=1|Cicero=1', '|GTeX_Finemapped=1|ABC=1',
       '|Exon=1|ABC=1', '|Exon=1|GTeX_Finemapped=1',
       '|eQTLGen_Finemapped=1', '|Cicero=1',
       '|eQTLGen_Finemapped=1|ABC=1', '|GTeX_Finemapped=1|EpiMap=1',
       '|Exon=1|EpiMap=1', '|Exon=1|eQTLGen_Finemapped=1|EpiMap=1',
       '|Exon=1|eQTLGen_Finemapped=1', '|EpiMap=1|ABC=1',
       '|Promoter=1|eQTLGen_Finemapped=1', '|Promoter=1|Cicero=1',
       '|Promoter=1|eQTLGen_Finemapped=1|Cicero=1', '|Exon=1|Cicero=1',
       '|Promoter=1|GTeX_Finemapped=1|eQTLGen_Finemapped=1|ABC=1',
       '|Promoter=1|GTeX_Finemapped=1|ABC=1',
       '|GTeX_Finemapped=1|eQTLGen_Finemapped=1|ABC=1',
       '|Exon=1|GTeX_Finemapped=1|eQTLGen_Finemapped=1',
   