In [1]:
import requests
import json
import pandas as pd
import numpy as np

#### This notebook reads taxon names stored in an Excel file and calls the OLS (Ontology Lookup Service) API to get the corresponding NCBI Taxon identifiers. Each taxon name is searched against the main NCBI Taxon label and also against the synonym fields; the first hit returned is used.

### example code

In [86]:
## make request
## Example: search the NCBI Taxon ontology through the OLS "select" endpoint.
url = "http://www.ebi.ac.uk/ols/api/select"
payload = "Homo sapiens"
param = {"q": payload, "ontology": "ncbitaxon"}
## requests has no default timeout: without one, a stalled connection would
## block the kernel forever. 30 s is generous for a single lookup.
r = requests.get(url, params=param, timeout=30)
print(r.status_code, r.reason)

200 OK


In [87]:
## The fields used later in this notebook live on the first returned document:
##   response -> docs[0] -> "iri", "short_form", "label"
## Here the complete decoded JSON body of the example query is shown.
response_body = r.json()
print(response_body)


{'responseHeader': {'status': 0, 'QTime': 2, 'params': {'hl': 'true', 'fl': 'iri,ontology_name,ontology_prefix,short_form,label,id,type,obo_id', 'start': '0', 'fq': ['ontology_name: (ncbitaxon)', 'is_obsolete:false'], 'rows': '10', 'hl.simple.pre': '<b>', 'bq': 'type:ontology^10.0 is_defining_ontology:true^100.0 label_s:"homo sapiens"^1000  label_autosuggest_e:"homo sapiens"^500 synonym_s:"homo sapiens" synonym_autosuggest_e:"homo sapiens"^100', 'q': '(Homo AND sapiens)', 'defType': 'edismax', 'hl.simple.post': '</b>', 'qf': 'label synonym label_autosuggest_e label_autosuggest synonym_autosuggest_e synonym_autosuggest shortform_autosuggest iri', 'hl.fl': ['label_autosuggest', 'label', 'synonym_autosuggest', 'synonym'], 'wt': 'json'}}, 'response': {'numFound': 10, 'start': 0, 'docs': [{'id': 'ncbitaxon:class:http://purl.obolibrary.org/obo/NCBITaxon_9606', 'iri': 'http://purl.obolibrary.org/obo/NCBITaxon_9606', 'short_form': 'NCBITaxon_9606', 'obo_id': 'NCBITaxon:9606', 'label': 'Homo sa

### run the script with data

In [89]:
## parse the file and get taxon name
path='path_to_file.xlsx'
with pd.ExcelFile(path) as xls: ## context manager closes the workbook handle after reading
    df = pd.read_excel(xls, 'List') ## reads the sheet List

species = df['Species'] ## get the content of the column Species

OLS_SELECT_URL = "http://www.ebi.ac.uk/ols/api/select"
OLS_TIMEOUT_S = 30 ## seconds; requests waits indefinitely when no timeout is given
NO_MATCH = ("", "") ## blank pair that keeps iris/redu aligned with the rows of df


def lookup_ncbitaxon(taxon_name, session):
    """Look up one taxon name in the NCBI Taxon ontology via the OLS select endpoint.

    Parameters
    ----------
    taxon_name : str
        Text sent as the ``q`` query parameter.
    session : requests.Session
        Session used to issue the GET request (re-uses the connection between rows).

    Returns
    -------
    tuple of (str, str) or None
        ``(iri, "<id>|<label>")`` built from the first document of the response,
        ``("", "")`` when the service answered 200 but found nothing,
        ``None`` when the HTTP status was not 200.

    Raises
    ------
    requests.RequestException
        On connection errors and timeouts.
    """
    r = session.get(
        OLS_SELECT_URL,
        params={"q": taxon_name, "ontology": "ncbitaxon"},
        timeout=OLS_TIMEOUT_S,
    )
    if r.status_code != 200:
        return None

    response = r.json()["response"] ## decode the body once instead of once per field
    if response["numFound"] <= 0 or not response["docs"]:
        return NO_MATCH

    top_hit = response["docs"][0]
    ## short_form looks like "NCBITaxon_9606"; keep only the part after the underscore
    taxon_id = top_hit["short_form"].split("_")[1]
    return top_hit["iri"], taxon_id + "|" + top_hit["label"]


iris = []
redu = []
resolved = {} ## query text -> result, so repeated species names trigger a single request

with requests.Session() as session:
    for sp in species:
        ## Empty Excel cells are read as NaN, which is truthy: test for it
        ## explicitly, otherwise the literal text "nan" would be sent to OLS.
        query = "" if pd.isna(sp) else str(sp).strip()

        if not query:
            result = NO_MATCH
        elif query in resolved:
            result = resolved[query]
        else:
            try:
                result = lookup_ncbitaxon(query, session)
            except requests.RequestException as err:
                ## one failing lookup must not discard the rows already processed
                print(f"OLS lookup failed for {query!r}: {err}")
                result = None

            if result is None:
                ## failed lookups are left blank and not cached, so a later
                ## occurrence of the same name is tried again
                result = NO_MATCH
            else:
                resolved[query] = result

        iris.append(result[0])
        redu.append(result[1])



In [90]:
## add the column to the df
## DataFrame.insert raises ValueError when the column already exists, which made
## this cell fail on a second run. Remove results of a previous run first.
for column in ("NCBI URI", "NCBI REDU"):
    if column in df.columns:
        del df[column]

## computed after the clean-up so the position is correct on every run
species_index = df.columns.get_loc("Species")
df.insert(species_index+1, "NCBI URI", iris) ## insert NCBI URI column after the Species column
df.insert(species_index+2, "NCBI REDU", redu) ## insert NCBI REDU column after the NCBI URI column

#df.head()


In [85]:
## save the file
## Writes the annotated table to a new workbook in the working directory.
## NOTE(review): to_excel also writes the DataFrame index as the first column
## by default - confirm whether that extra column is wanted.
output_path = "species_with_ncbitaxon_ids.xlsx"
df.to_excel(output_path)