# Using Ensembl REST API

In [1]:
import requests, sys
from pprint import pprint
#import xmltodict
import re
import numpy as np
import pandas as pd

In [3]:
genes_df = pd.read_csv('Data/DriverMapTPM.tsv', delimiter='\t')

In [4]:
genes_comp = list(genes_df.iloc[0:10,0])
genes_list = []

for gene in genes_comp:
    gene_name = gene.split('|')
    genes_list.append(gene_name[1])


In [5]:
genes_list

['CENPU',
 'PEX14',
 'RAD52',
 'CLYBL',
 'RANBP1',
 'KDF1',
 'TBC1D30',
 'TMEM258',
 'LRG1',
 'HYAL1']

In [6]:
# This function is performed to get list of available Homo Sapiens tissue types from Ensembl
 
server = "http://grch37.rest.ensembl.org"
ext = "/eqtl/tissue/homo_sapiens?"
 
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
 
if not r.ok:
  r.raise_for_status()
  sys.exit()
 
tissue_dict = r.json() # total of 44 tissue types

pprint(tissue_dict)

{'Adipose_Subcutaneous': 1,
 'Adipose_Visceral_Omentum': 1,
 'Adrenal_Gland': 1,
 'Artery_Aorta': 1,
 'Artery_Coronary': 1,
 'Artery_Tibial': 1,
 'Brain_Anterior_cingulate_cortex_BA24': 1,
 'Brain_Caudate_basal_ganglia': 1,
 'Brain_Cerebellar_Hemisphere': 1,
 'Brain_Cerebellum': 1,
 'Brain_Cortex': 1,
 'Brain_Frontal_Cortex_BA9': 1,
 'Brain_Hippocampus': 1,
 'Brain_Hypothalamus': 1,
 'Brain_Nucleus_accumbens_basal_ganglia': 1,
 'Brain_Putamen_basal_ganglia': 1,
 'Breast_Mammary_Tissue': 1,
 'Cells_EBV-transformed_lymphocytes': 1,
 'Cells_Transformed_fibroblasts': 1,
 'Colon_Sigmoid': 1,
 'Colon_Transverse': 1,
 'Esophagus_Gastroesophageal_Junction': 1,
 'Esophagus_Mucosa': 1,
 'Esophagus_Muscularis': 1,
 'Heart_Atrial_Appendage': 1,
 'Heart_Left_Ventricle': 1,
 'Liver': 1,
 'Lung': 1,
 'Muscle_Skeletal': 1,
 'Nerve_Tibial': 1,
 'Ovary': 1,
 'Pancreas': 1,
 'Pituitary': 1,
 'Prostate': 1,
 'Skin_Not_Sun_Exposed_Suprapubic': 1,
 'Skin_Sun_Exposed_Lower_leg': 1,
 'Small_Intestine_Terminal

In [7]:
# convert tissue dictionary to tissue list
dlist = []
for i in tissue_dict.items():
    dlist.append(i)

tissue_list = []    
for i in range(len(dlist)):
    tissue_list.append(dlist[i][0])
tissue_list.sort()
tissue_brain = tissue_list[6:14]
tissue_brain

['Brain_Anterior_cingulate_cortex_BA24',
 'Brain_Caudate_basal_ganglia',
 'Brain_Cerebellar_Hemisphere',
 'Brain_Cerebellum',
 'Brain_Cortex',
 'Brain_Frontal_Cortex_BA9',
 'Brain_Hippocampus',
 'Brain_Hypothalamus']

In [13]:
failed_genes = []
tissue_gene_values_dict = {}
for gene in genes_list:
    for tissue in tissue_brain:
        statistic = 'p-value'
        server = "http://grch37.rest.ensembl.org"
        ext = f"/eqtl/id/homo_sapiens/{gene}?;tissue={tissue};statistic={statistic}"
 
        r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
        print(f'Retrieving {gene} for tissue {tissue}')
        
        if not r.ok:
            if gene not in failed_genes:
                failed_genes.append(gene)
            print('Gene not found, passing')
            pass
        
        tissue_gene_values_dict[gene] = r.json()
        


Retrieving CENPU for tissue Brain_Anterior_cingulate_cortex_BA24
Retrieving CENPU for tissue Brain_Caudate_basal_ganglia
Retrieving CENPU for tissue Brain_Cerebellar_Hemisphere
Retrieving CENPU for tissue Brain_Cerebellum
Retrieving CENPU for tissue Brain_Cortex
Retrieving CENPU for tissue Brain_Frontal_Cortex_BA9
Retrieving CENPU for tissue Brain_Hippocampus
Retrieving CENPU for tissue Brain_Hypothalamus
Retrieving PEX14 for tissue Brain_Anterior_cingulate_cortex_BA24
Retrieving PEX14 for tissue Brain_Caudate_basal_ganglia
Retrieving PEX14 for tissue Brain_Cerebellar_Hemisphere
Retrieving PEX14 for tissue Brain_Cerebellum
Retrieving PEX14 for tissue Brain_Cortex
Retrieving PEX14 for tissue Brain_Frontal_Cortex_BA9
Retrieving PEX14 for tissue Brain_Hippocampus
Retrieving PEX14 for tissue Brain_Hypothalamus
Retrieving RAD52 for tissue Brain_Anterior_cingulate_cortex_BA24
Retrieving RAD52 for tissue Brain_Caudate_basal_ganglia
Retrieving RAD52 for tissue Brain_Cerebellar_Hemisphere
Retri

In [14]:
failed_genes

['KDF1']

In [15]:
for i in failed_genes:
    tissue_gene_values_dict.pop(i)

In [16]:
tissue_gene_values_dict.keys()

dict_keys(['CENPU', 'PEX14', 'RAD52', 'CLYBL', 'RANBP1', 'TBC1D30', 'TMEM258', 'LRG1', 'HYAL1'])

In [23]:
tissue_gene_values_dict['HYAL1'] # for some reason this gene doesn't have info

[]

In [17]:
# getting list of specific values from the {tissue_gene_values_dict}
# collecting values and negative log10(p-value) for that gene

gene_values_dict = {}

for gene in tissue_gene_values_dict:
    gene_values_dict[gene] = {}
    for i in range(len(tissue_gene_values_dict[gene])):
        #print(gene,tissue_gene_values_dict[gene][i]['minus_log10_p_value'])
        gene_values_dict[gene][i] = [tissue_gene_values_dict[gene][i]['minus_log10_p_value'], 
                                  tissue_gene_values_dict[gene][i]['value']]


In [18]:
# because dictionary contains multiple location expression of the gene in chosen tissue,
# we collect all values and take the mean
mean_pvalues = {}
mean_values = {}
for gene in gene_values_dict:
    temp_dict = gene_values_dict[gene]
    list_pvalues = []
    list_values = []
    for i in range(len(temp_dict.keys())):
        list_pvalues.append(temp_dict[i][0])
        list_values.append(temp_dict[i][1])
    mean_pvalues[gene] = np.mean(list_pvalues)
    mean_values[gene] = np.mean(list_values)   
    
print(f'Mean of p-values: {mean_pvalues}')
print(f'Mean of values: {mean_values}')

Mean of p-values: {'CENPU': 0.6046874207143056, 'PEX14': 0.3612288053977717, 'RAD52': 0.4831311476544303, 'CLYBL': 0.4857805199442888, 'RANBP1': 0.5025294018180532, 'TBC1D30': 0.28769339766171936, 'TMEM258': 0.40369389546815276, 'LRG1': 0.4522683899591349, 'HYAL1': nan}
Mean of values: {'CENPU': 0.43619547997982155, 'PEX14': 0.5411148082417763, 'RAD52': 0.46716437210488704, 'CLYBL': 0.4781123920492219, 'RANBP1': 0.4976978548905414, 'TBC1D30': 0.608018286938663, 'TMEM258': 0.5397171452895307, 'LRG1': 0.48367725393903804, 'HYAL1': nan}


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [19]:
means_pv_df = pd.DataFrame.from_dict(mean_pvalues, orient='index')
means_pv_df = means_pv_df.rename(columns={0:'Mean p-value'})
means_v_df = pd.DataFrame.from_dict(mean_values, orient='index')
means_v_df = means_v_df.rename(columns={0:'Mean value'})
means_df = pd.merge(means_pv_df, means_v_df, left_index=True, right_index=True)
means_df

Unnamed: 0,Mean p-value,Mean value
CENPU,0.604687,0.436195
PEX14,0.361229,0.541115
RAD52,0.483131,0.467164
CLYBL,0.485781,0.478112
RANBP1,0.502529,0.497698
TBC1D30,0.287693,0.608018
TMEM258,0.403694,0.539717
LRG1,0.452268,0.483677
HYAL1,,
