#### Entity identification and Wikidata linking with MeSH terms

Based on scispacy_linking_via_umls.ipynb


In [None]:
## Install required dependencies
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz # Medium language model
!pip install wikidataintegrator
!pip install Wikidata 

In [None]:
# Load libraries
import scispacy
import spacy
import en_core_sci_md
import urllib.parse
import pandas as pd
from wikidataintegrator import wdi_core
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from functools import lru_cache
import requests
import json
import os
import sys
import stat
import re
import numpy as np
import requests

In [None]:
#fetch article abstracts from EuroPMC articles api
def get_pmc_meta(pid):
    """Fetch the abstract of one article from the Europe PMC REST search API.

    Parameters
    ----------
    pid : str or int
        Article identifier. A value containing ``PMC`` is sent as the query
        unchanged; anything else is queried as ``ext_id:<pid> src:med``.

    Returns
    -------
    str
        The ``abstractText`` of the matching record. An empty string is
        returned when the record has no abstract, when nothing matched, or
        when the request/response could not be processed (in that last case
        an ``ERROR IN PMC ID:`` message is printed).
    """
    pid = str(pid)
    # pmc metadata (ids, mesh terms, title, abstract). Pre-filled so the final
    # lookup cannot raise KeyError when the request fails or returns no hits.
    pmeta = {'pmid': '', 'pmcid': '', 'mesh': [], 'pmc_title': '', 'pmc_abstract': ''}

    # Percent-encode the id itself; plain PMC/PubMed ids are left unchanged.
    quoted_pid = urllib.parse.quote(pid, safe='')
    if re.search('PMC', pid) is None:
        pq = 'ext_id%3A' + quoted_pid + '%20src%3Amed'
    else:
        pq = quoted_pid
    url = ('https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + pq
           + '&resultType=core&synonym=TRUE&cursorMark=*&pageSize=1000&format=json')

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()['resultList']['result']
        # NOTE(review): every record overwrites the previous one, so when the
        # query matches several records the last one wins. Confirm that this
        # is intended rather than taking the first hit.
        for rslt in results:
            mesh_terms = []  # pmc mesh terms
            if 'meshHeadingList' in rslt:
                for heading in rslt['meshHeadingList']['meshHeading']:
                    if 'meshQualifierList' in heading:
                        # The descriptor is repeated once per qualifier.
                        for qualifier in heading['meshQualifierList']['meshQualifier']:
                            mesh_terms.append(heading['descriptorName'])
                            mesh_terms.append(qualifier['qualifierName'])
                    else:
                        mesh_terms.append(heading['descriptorName'])
            pmeta = {
                'pmid': rslt.get('pmid', ''),
                'pmcid': rslt.get('pmcid', ''),
                'mesh': mesh_terms,
                'pmc_title': rslt.get('title', ''),
                'pmc_abstract': rslt.get('abstractText', ''),
            }
    except (requests.RequestException, KeyError, TypeError, ValueError) as err:
        # Best effort: report the problem and fall through to the default.
        print('ERROR IN PMC ID:' + pid, repr(err))
    return pmeta['pmc_abstract']


In [None]:
# Load the en_core_sci_md model once; `nlp` is the shared pipeline object that
# the following cells extend and that get_pmcid_annotations() calls on text.
nlp = en_core_sci_md.load()

In [None]:
#Add mesh terms
# NOTE(review): this cell mutates `nlp` in place, so re-running it without
# re-creating `nlp` adds both components a second time.
# The abbreviation detector is added to the pipeline before the linker.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)
# name="mesh" requests the MeSH linker data (the cell output shows the
# mesh_linking_model files being downloaded on first use).
# NOTE(review): resolve_abbreviations=True presumably relies on the
# AbbreviationDetector added above - confirm against the scispacy docs.
linker = EntityLinker(resolve_abbreviations=True, name="mesh")
nlp.add_pipe(linker)

https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmp2c4mogq1
Finished download, copying /tmp/tmp2c4mogq1 to cache at /root/.scispacy/datasets/b28c5ae2b3052b66e3df4d9e8082fd6138060d0369555a603bf103facbc8a175.cdcb8550ec06b33ef35938f3ffb30ca58f6082bc649ce9c8069d041eb33c22b6.tfidf_vectors_sparse.npz
https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/nmslib_index.bin not found in cache, downloading to /tmp/tmpwxqnuuam
Finished download, copying /tmp/tmpwxqnuuam to cache at /root/.scispacy/datasets/6812e57b9f4b0e14d6f9974a745e136fb47b5c2a2d955635a4d13675f6add07d.62b9b370bfb8c9433ba8fb69c1fb83405116079c4f741698b8159319d01833c0.nmslib_index.bin
https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmp7io73io8
Finished download, copying /tmp/tmp7io73io8 to cache at /root/.scispacy/datasets/418d053a

In [None]:
# Function by github.com/lubianat with some slight alterations by me
@lru_cache(maxsize=None)
def get_wikidata_item(wikidata_property, value):
    """Return the QID of the first Wikidata item whose property equals value.

    Parameters
    ----------
    wikidata_property : str
        Wikidata property id used as ``wdt:<property>`` (e.g. ``"P486"``).
    value : hashable
        Literal to match. ``None``, NaN and ``""`` are treated as missing.

    Returns
    -------
    str or None
        The last path segment of the item URI (e.g. ``"Q903634"``), or
        ``None`` when the value is missing or the query has no bindings.

    Results, including ``None``, are memoised by ``lru_cache``.
    """
    # A missing identifier cannot match anything; skip the network round-trip
    # (``value != value`` is the NaN check).
    if value is None or value != value or value == "":
        return None

    # Escape backslashes and quotes so the value stays a single SPARQL literal.
    literal = str(value).replace("\\", "\\\\").replace('"', '\\"')
    query_result = wdi_core.WDItemEngine.execute_sparql_query(
        f'SELECT distinct ?item WHERE {{ ?item wdt:{wikidata_property} "{literal}" }}'
    )
    try:
        match = query_result["results"]["bindings"][0]
    except (KeyError, IndexError, TypeError):
        # No bindings (or an unexpected payload shape): nothing was found.
        return None

    # The binding is an entity URI; the QID is its final path segment.
    return match["item"]["value"].rsplit("/", 1)[-1]

In [None]:
# Function to fetch the first search result for a term from Wikidata
# source https://stackoverflow.com/questions/51419785/extract-data-from-wikidata-in-python
API_ENDPOINT = "https://www.wikidata.org/w/api.php"
def get_arbitrary_wdataids(term):
  """Return the id of the first Wikidata search hit for ``term``.

  Parameters
  ----------
  term : str
      Free-text label passed to the ``wbsearchentities`` action.

  Returns
  -------
  str or None
      The ``id`` of the first entry in the response's ``search`` list, or
      ``None`` when the term is empty, the list is empty, or the response
      carries no ``search`` key.

  Connection problems and HTTP error statuses are raised, not swallowed.
  """
  if not term:
    return None
  # The raw term is passed as-is: requests URL-encodes `params` itself, so
  # quoting it beforehand would double-encode spaces and break multi-word
  # searches.
  params = {
      'action': 'wbsearchentities',
      'format': 'json',
      'language': 'en',
      'search': term
  }
  # A timeout keeps a stalled connection from hanging the notebook.
  r = requests.get(API_ENDPOINT, params=params, timeout=30)
  r.raise_for_status()
  try:
    wdataid = r.json()['search'][0]['id']
  except (IndexError, KeyError):
    # Empty result list, or a payload without a 'search' key.
    wdataid = None
  return wdataid

In [None]:
#mesh term to wikidata
def get_wdt_items_from_mesh_entities(doc):
  """Tabulate the entities of ``doc`` with their MeSH and Wikidata ids.

  Parameters
  ----------
  doc : object
      Processed document exposing ``.ents``; every entity provides ``text``,
      ``start_char``, ``end_char`` and ``_.kb_ents``, a list of
      ``(id, score)`` candidates of which the first is used as the match.

  Returns
  -------
  pandas.DataFrame
      One row per entity with the columns ``label``, ``start_pos``,
      ``end_pos``, ``mesh_id``, ``meshid_match_score``,
      ``arbitrary_wdata_id`` and ``wdata_id``. Entities without a candidate
      get ``mesh_id`` None and ``meshid_match_score`` ``"NA"``.
  """
  # The same surface form often occurs several times in one text; look each
  # one up only once.
  wdata_search_cache = {}
  identified = []
  for ent in doc.ents:
    term = str(ent)
    if term not in wdata_search_cache:
      wdata_search_cache[term] = get_arbitrary_wdataids(term)
    arbitrary_wdata_id = wdata_search_cache[term]

    candidates = ent._.kb_ents
    if candidates:
      # Id and score are both read from the first candidate so that they
      # describe the same match.
      best_id = candidates[0][0]
      match_score = candidates[0][1]
    else:
      best_id = None
      match_score = "NA"
    identified.append([ent.text, ent.start_char, ent.end_char, best_id, match_score, arbitrary_wdata_id])

  entity_df = pd.DataFrame.from_records(
      identified,
      columns=['label', 'start_pos', 'end_pos', 'mesh_id', 'meshid_match_score', 'arbitrary_wdata_id'])

  # P486 is queried with the MeSH id; rows without one are not sent to Wikidata.
  entity_df['wdata_id'] = entity_df['mesh_id'].apply(
      lambda x: get_wikidata_item("P486", x) if pd.notna(x) else None)

  return entity_df

In [None]:
#Wrapper function for EuroPMC
def get_pmcid_annotations(pmcid):
  """Annotate the abstract of one Europe PMC article.

  The abstract returned by ``get_pmc_meta(pmcid)`` is run through the shared
  ``nlp`` pipeline, and the resulting document is handed to
  ``get_wdt_items_from_mesh_entities``, whose return value is passed back
  unchanged.
  """
  abstract = get_pmc_meta(pmcid)
  return get_wdt_items_from_mesh_entities(nlp(abstract))


### Testing


In [None]:
# Smoke test on a single article. This performs live requests (Europe PMC and
# Wikidata), so it needs network access; the returned table is the cell output.
get_pmcid_annotations("PMC7448226")

Unnamed: 0,label,start_pos,end_pos,mesh_id,meshid_match_score,arbitary_wdata_id,wdata_id
0,Exosomes,12,20,D055354,0.721988,Q903634,Q903634
1,membranous vesicles,42,61,,,,
2,RNA,69,72,D012313,0.760737,Q11053,Q11053
3,content,73,80,,,Q1260632,
4,exosomes,99,107,D055354,0.721988,Q903634,Q903634
...,...,...,...,...,...,...,...
84,bioindicator,1910,1922,D000074062,0.790585,Q864438,
85,diagnosis,1931,1940,D003933,0.87343,Q177719,Q16644043
86,prognosis,1945,1954,,,Q592442,
87,solid tumors,1958,1970,,,,
