In [1]:
# Adapted from https://github.com/lubianat/ann/blob/main/prototypes/notebooks/scispacy_linking_via_umls.ipynb

import scispacy
import spacy
import pandas as pd
from wikidataintegrator import wdi_core
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from functools import lru_cache

import en_core_sci_sm
nlp = en_core_sci_sm.load()



In [2]:
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)
linker = EntityLinker(resolve_abbreviations=True, name="umls")
nlp.add_pipe(linker)




https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmpfmm6z43j
Finished download, copying /tmp/tmpfmm6z43j to cache at /home/lubianat/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f8966c4178b892190a302b21836f.330707f4efe774134872b9f77f0e3208c1d30f50800b3b39a6b8ec21d9adf1b7.umls_semantic_type_tree.tsv


In [3]:

# Function by github.com/lubianat with some slight alterations by github.com/jvfe
@lru_cache(maxsize=None)
def get_wikidata_item(wikidata_property, value):
    """Gets a Wikidata item for a determined property-value pair


      Args:
        property (str): The property to search
        value (str): The value of said property
      
      Returns:
        str: A Wikidata item or a "None" value if no item found.
    """
    query_result = wdi_core.WDItemEngine.execute_sparql_query(
        f'SELECT distinct ?item WHERE {{ ?item wdt:{wikidata_property} "{value}" }}'
    )
    try:
        match = query_result["results"]["bindings"][0]
    except:
        return None
    qid = match["item"]["value"]

    qid = qid.split("/")[4]
    return qid

In [24]:
def get_wdt_items_from_umls_entities(doc):
    """Create a table from the UMLS entities and link them to WDT
    """
    identified = []
    for ent in doc.ents:
        print(ent)
        try:
            best_id = ent._.kb_ents[0][0]
        except IndexError:
            best_id = None
        identified.append([ent.text, ent.start_char, ent.end_char, best_id])

    entity_df = pd.DataFrame.from_records(identified, 
                                        columns=['label', 'start_pos', 'end_pos', 'umls_id'])
    
    entity_df['qid'] = entity_df['umls_id'].apply(lambda x: get_wikidata_item("P2892", x))

    return entity_df

IndentationError: unexpected indent (<ipython-input-24-fc436cd97859>, line 4)

In [20]:
text = """
Name-calling in the hippocampus (and beyond): coming to terms with neuron types and properties
D."""

doc = nlp(text)

  extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
  extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]


In [21]:
doc.ents

(Name-calling, hippocampus, neuron types, properties)

In [22]:
get_wdt_items_from_umls_entities(doc)

Name-calling
hippocampus
neuron types
properties


KeyboardInterrupt: 