# Using Ensembl REST API

In [1]:
import requests, sys
from pprint import pprint
#import xmltodict
import re
import numpy as np
import pandas as pd

In [2]:
genes_df = pd.read_csv('DriverMapTPM.tsv', delimiter='\t')

In [15]:
genes_comp = list(genes_df.iloc[0:9,0])
genes_list = []

for gene in genes_comp:
    gene_name = gene.split('|')
    genes_list.append(gene_name[1])


In [16]:
genes_list

['CENPU',
 'PEX14',
 'RAD52',
 'CLYBL',
 'RANBP1',
 'KDF1',
 'TBC1D30',
 'TMEM258',
 'LRG1']

In [None]:
# This function is performed to get list of available Homo Sapiens tissue types from Ensembl
 
server = "http://grch37.rest.ensembl.org"
ext = "/eqtl/tissue/homo_sapiens?"
 
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
 
if not r.ok:
  r.raise_for_status()
  sys.exit()
 
tissue_dict = r.json() # total of 44 tissue types

pprint(tissue_dict)

In [20]:
# convert tissue dictionary to tissue list
dlist = []
for i in tissue_dict.items():
    dlist.append(i)

tissue_list = []    
for i in range(len(dlist)):
    tissue_list.append(dlist[i][0])
tissue_list.sort()
tissue_brain = tissue_list[6:14]
tissue_brain

['Brain_Anterior_cingulate_cortex_BA24',
 'Brain_Caudate_basal_ganglia',
 'Brain_Cerebellar_Hemisphere',
 'Brain_Cerebellum',
 'Brain_Cortex',
 'Brain_Frontal_Cortex_BA9',
 'Brain_Hippocampus',
 'Brain_Hypothalamus']

In [22]:
failed_genes = []
tissue_gene_values_dict = {}
for gene in genes_list:
    for tissue in tissue_brain:
        statistic = 'p-value'
        server = "http://grch37.rest.ensembl.org"
        ext = f"/eqtl/id/homo_sapiens/{gene}?;tissue={tissue};statistic={statistic}"
 
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        print(f'Retrieving {gene} for tissue {tissue}')
        
        if not r.ok:
            failed_genes.append(gene)
            print('Gene not found, passing')
            pass
        
        tissue_gene_values_dict[gene] = r.json()
        


Retrieving CENPU for tissue Brain_Anterior_cingulate_cortex_BA24
Retrieving CENPU for tissue Brain_Caudate_basal_ganglia
Retrieving CENPU for tissue Brain_Cerebellar_Hemisphere
Retrieving CENPU for tissue Brain_Cerebellum
Retrieving CENPU for tissue Brain_Cortex
Retrieving CENPU for tissue Brain_Frontal_Cortex_BA9
Retrieving CENPU for tissue Brain_Hippocampus
Retrieving CENPU for tissue Brain_Hypothalamus
Retrieving PEX14 for tissue Brain_Anterior_cingulate_cortex_BA24
Retrieving PEX14 for tissue Brain_Caudate_basal_ganglia
Retrieving PEX14 for tissue Brain_Cerebellar_Hemisphere
Retrieving PEX14 for tissue Brain_Cerebellum
Retrieving PEX14 for tissue Brain_Cortex
Retrieving PEX14 for tissue Brain_Frontal_Cortex_BA9
Retrieving PEX14 for tissue Brain_Hippocampus
Retrieving PEX14 for tissue Brain_Hypothalamus
Retrieving RAD52 for tissue Brain_Anterior_cingulate_cortex_BA24
Retrieving RAD52 for tissue Brain_Caudate_basal_ganglia
Retrieving RAD52 for tissue Brain_Cerebellar_Hemisphere
Retri

In [None]:
tissue_gene_values_dict = tissue_gene_values_dict.pop('KDF1')

In [50]:
tissue_gene_values_dict.keys()

dict_keys(['CENPU', 'PEX14', 'RAD52', 'CLYBL', 'RANBP1', 'TBC1D30', 'TMEM258', 'LRG1'])

In [53]:
# getting list of specific values from the {tissue_gene_values_dict}
# collecting values and negative log10(p-value) for that gene

gene_values_dict = {}

for gene in tissue_gene_values_dict:
    gene_values_dict[gene] = {}
    for i in range(len(tissue_gene_values_dict[gene])):
        #print(gene,tissue_gene_values_dict[gene][i]['minus_log10_p_value'])
        gene_values_dict[gene][i] = [tissue_gene_values_dict[gene][i]['minus_log10_p_value'], 
                                  tissue_gene_values_dict[gene][i]['value']]

gene_values_dict

{'CENPU': {0: [0.379286040666131, 0.417555260396652],
  1: [1.23707613772727, 0.0579327123602391],
  2: [0.249487900480773, 0.563004802586724],
  3: [0.0475910534256745, 0.896208268784655],
  4: [0.35977622910325, 0.436740805645047],
  5: [0.969193340755486, 0.10735113963415],
  6: [0.494415439506154, 0.320320372244829],
  7: [0.00875470538651121, 0.980043369096568],
  8: [0.997257891913443, 0.100633391212299],
  9: [1.493685998024, 0.032085883476349],
  10: [1.493685998024, 0.032085883476349],
  11: [1.493685998024, 0.032085883476349],
  12: [1.493685998024, 0.032085883476349],
  13: [1.51363028570009, 0.0306457118881589],
  14: [1.49337842044267, 0.0321086155034973],
  15: [1.49337842044267, 0.0321086155034973],
  16: [1.49307005196111, 0.0321314221506685],
  17: [1.49307005196111, 0.0321314221506685],
  18: [1.49286403356586, 0.0321466681062445],
  19: [0.749490957455418, 0.178036497785358],
  20: [1.81515088782251, 0.0153055560522258],
  21: [1.82668598067795, 0.0149043835890737],


In [60]:
# because dictionary contains multiple location expression of the gene in chosen tissue,
# we collect all values and take the mean
mean_pvalues = {}
mean_values = {}
for gene in gene_values_dict:
    temp_dict = gene_values_dict[gene]
    list_pvalues = []
    list_values = []
    for i in range(len(temp_dict.keys())):
        list_pvalues.append(temp_dict[i][0])
        list_values.append(temp_dict[i][1])
    mean_pvalues[gene] = np.mean(list_pvalues)
    mean_values[gene] = np.mean(list_values)   
    
print(f'Mean of p-values: {mean_pvalues}')
print(f'Mean of values: {mean_values}')

Mean of p-values: {'CENPU': 0.6046874207143056, 'PEX14': 0.3612288053977717, 'RAD52': 0.4831311476544303, 'CLYBL': 0.4857805199442888, 'RANBP1': 0.5025294018180532, 'TBC1D30': 0.28769339766171936, 'TMEM258': 0.40369389546815276, 'LRG1': 0.4522683899591349}
Mean of values: {'CENPU': 0.43619547997982155, 'PEX14': 0.5411148082417763, 'RAD52': 0.46716437210488704, 'CLYBL': 0.4781123920492219, 'RANBP1': 0.4976978548905414, 'TBC1D30': 0.608018286938663, 'TMEM258': 0.5397171452895307, 'LRG1': 0.48367725393903804}
