#### Environment setup

This section downloads the language model and defines the helper functions used below; it takes a while to run.

In [None]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz # Medium sized language model
!pip install wikidataintegrator

In [None]:
import scispacy
import spacy
import pandas as pd
from wikidataintegrator import wdi_core
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from functools import lru_cache

In [None]:
# Load the medium-sized scispacy model installed in the setup cell.
# `nlp` is the pipeline object that the following cells extend and call.
import en_core_sci_md
nlp = en_core_sci_md.load()

In [None]:
# Extend the pipeline with two scispacy components, in this order:
#   1. an abbreviation detector,
#   2. an entity linker named "umls", created with resolve_abbreviations=True.
# NOTE(review): the detector is presumably added first so the linker can use the
# abbreviations it finds — confirm against the scispacy documentation.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)
linker = EntityLinker(resolve_abbreviations=True, name="umls")
nlp.add_pipe(linker)



In [None]:
# Function by github.com/lubianat with some slight alterations by github.com/jvfe
@lru_cache(maxsize=None)
def get_wikidata_item(wikidata_property, value):
    """Look up the Wikidata item whose property has a given value.

    Sends a SPARQL query through
    ``wdi_core.WDItemEngine.execute_sparql_query`` selecting items where
    ``wdt:<wikidata_property>`` equals the string literal ``value``. Calls are
    memoised with ``lru_cache``, so a repeated property/value pair does not
    trigger another query.

    Args:
        wikidata_property (str): The property ID to search on, e.g. "P2892".
        value (str): The value the property must hold. ``None`` means there
            is nothing to look up.

    Returns:
        str or None: The QID of the first matching item (e.g. "Q1995327"),
        or ``None`` when ``value`` is ``None`` or no item was found.
    """
    # A missing identifier would otherwise be interpolated into the query as
    # the literal string "None" and sent to the endpoint; skip the round trip.
    if value is None:
        return None

    query_result = wdi_core.WDItemEngine.execute_sparql_query(
        f'SELECT distinct ?item WHERE {{ ?item wdt:{wikidata_property} "{value}" }}'
    )
    try:
        match = query_result["results"]["bindings"][0]
    except (IndexError, KeyError, TypeError):
        # Empty bindings or an unexpected response shape both mean "no item
        # found". Unlike a bare ``except``, this lets KeyboardInterrupt and
        # unrelated errors propagate.
        return None

    # The bound value is a "/"-separated entity URI
    # (e.g. http://www.wikidata.org/entity/Q1995327); the QID is its fifth part.
    entity_uri = match["item"]["value"]
    return entity_uri.split("/")[4]

In [None]:
def get_wdt_items_from_umls_entities(doc):
    """Tabulate a document's entities with their UMLS IDs and Wikidata QIDs.

    For every entity in ``doc.ents`` the first candidate in
    ``ent._.kb_ents`` supplies the UMLS ID (``None`` when the entity has no
    candidates). Each available ID is then looked up on Wikidata with
    ``get_wikidata_item`` using property P2892 (Wikidata's "UMLS CUI").

    Args:
        doc: A processed document exposing ``ents``; every entity must provide
            ``text``, ``start_char``, ``end_char`` and ``_.kb_ents``.

    Returns:
        pandas.DataFrame: One row per entity with the columns ``label``,
        ``start_pos``, ``end_pos``, ``umls_id`` and ``qid``. ``qid`` is
        ``None`` for entities without a UMLS ID or without a Wikidata match.
    """
    identified = []
    for ent in doc.ents:
        try:
            # Candidates are (id, ...) sequences; keep the ID of the first one.
            best_id = ent._.kb_ents[0][0]
        except IndexError:
            # The entity has no linked candidates.
            best_id = None
        identified.append([ent.text, ent.start_char, ent.end_char, best_id])

    entity_df = pd.DataFrame.from_records(
        identified,
        columns=['label', 'start_pos', 'end_pos', 'umls_id'],
    )

    # Only query Wikidata for entities that actually have a UMLS ID; a missing
    # ID would otherwise be searched for as the literal string "None".
    entity_df['qid'] = entity_df['umls_id'].apply(
        lambda umls_id: None if pd.isna(umls_id) else get_wikidata_item("P2892", umls_id)
    )

    return entity_df

### Testing it out

In [None]:
# Sample passage that defines two abbreviations in parentheses ("SBMA", "AR")
# and then reuses "SBMA" on its own.
text = """
Spinal and bulbar muscular atrophy (SBMA) is an
inherited motor neuron disease caused by the expansion
of a polyglutamine tract within the androgen receptor (AR).
SBMA can be caused by this easily.
"""

# Run the pipeline built above on the sample passage.
doc = nlp(text)

In [None]:
# Display the entity table (label, character offsets, UMLS ID, Wikidata QID) for the sample.
get_wdt_items_from_umls_entities(doc)

Unnamed: 0,label,start_pos,end_pos,umls_id,qid
0,Spinal,1,7,C0521329,
1,bulbar muscular atrophy,12,35,C1839259,Q1995327
2,SBMA,37,41,C1839259,Q1995327
3,inherited,49,58,C0439660,
4,motor neuron disease,59,79,C0085084,Q3221083
5,expansion,94,103,C0007595,
6,polyglutamine tract,109,128,C0032500,
7,androgen receptor,140,157,C0034786,
8,AR,159,161,C0034786,
9,SBMA,164,168,C1839259,Q1995327
