In [1]:
import json
import spacy
import stanfordnlp
import pandas as pd
import warnings
import os
import sys
from spacy_stanfordnlp import StanfordNLPLanguage

# NLP preprocessing pipeline: an English StanfordNLP pipeline wrapped in
# StanfordNLPLanguage, so that later cells can call `nlp(text)` and read
# token attributes such as `.pos_` from the result.
# NOTE: filterwarnings('ignore') silences *all* Python warnings for the rest of
# the kernel session, not only the ones raised while the models load.
warnings.filterwarnings('ignore')
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

# Put ../data_processing on sys.path so the project-local helper module can be
# imported. The path is relative, so it is resolved against the kernel's current
# working directory; the membership check avoids appending it again on re-runs.
module_path = os.path.abspath(os.path.join('../data_processing'))
if module_path not in sys.path:
    sys.path.append(module_path)
from data_processing_utils import read_spacy_docs, write_spacy_docs 

# Load the knowledge-base lexicon. Later cells index it by concept and read the
# "class_label" and "text_representations" fields of each entry.
with open("../data/preprocessed_data/Life_Biology_kb_lexicon.json", "r") as f:
    lexicon = json.load(f)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

In [54]:
# Flatten the lexicon into two parallel lists: every distinct text
# representation and the class label of the first concept it was seen under.
texts = []
labels = []
# Companion set for O(1) duplicate checks; testing membership against the
# growing `texts` list made this loop quadratic in the number of entries.
seen_texts = set()
for concept in lexicon:
    class_label = lexicon[concept]["class_label"]
    for tr in lexicon[concept]["text_representations"]:
        if tr in seen_texts:
            # Same surface form already recorded: keep its first label.
            continue
        seen_texts.add(tr)
        texts.append(tr)
        labels.append(class_label)


In [55]:
# Run every lexicon text through the pipeline, keeping the order of `texts`
# so the results stay aligned with `labels`.
spacy_texts = list(map(nlp, texts))
print(len(spacy_texts))
# Optional on-disk cache of the parsed docs (the next cell holds the matching reload):
#write_spacy_docs(spacy_texts, "../data/term_extraction/summary/lexicon_entries_spacy")

9167


In [4]:
#spacy_texts = read_spacy_docs("../data/term_extraction/summary/lexicon_entries_spacy", nlp)

In [79]:
def determine_term_type(term):
    """ Labels a term as an 'entity' or an 'event' using hand-written rules.

    The rules are tried in order and the first one that fires decides the label:

    1. the lowercased term text contains an event keyword ("process", "cycle")
    2. any token ends with a nominalization suffix (case-sensitive match)
    3. any token is tagged with the VERB part of speech

    If no rule fires the term is an entity.

    Parameters
    ----------
    term: spacy.tokens.doc.Doc
        Spacy preprocessed representation of the term

    Returns
    -------
    str ('entity' | 'event')
        The class of the term
    """

    event_keywords = ("process", "cycle")
    nominal_suffixes = ("ation", "ition", "ption", "ing", "sis", "lism", "ment", "sion")

    # Rule 1: nouns that nevertheless name an event.
    lowered_text = term.text.lower()
    if any(keyword in lowered_text for keyword in event_keywords):
        return "event"

    # Rule 2: a token whose ending marks a nominalized event.
    # str.endswith accepts a tuple and is true if any suffix matches.
    if any(token.text.endswith(nominal_suffixes) for token in term):
        return "event"

    # Rule 3: a verb anywhere in the term implies an event.
    if any(token.pos_ == "VERB" for token in term):
        return "event"

    # Nothing fired: fall back to entity.
    return "entity"

In [80]:
# Score the rule-based classifier against the lexicon labels, collecting the
# docs it gets right and wrong and printing a short report for each miss.
errors, correct = [], []
for term, label in zip(spacy_texts, labels):
    pred = determine_term_type(term)

    # Lexicon labels are capitalized ("Entity"/"Event"); predictions are lowercase.
    if pred == label.lower():
        correct.append(term)
        continue

    # Miss: show the term, its POS tags, the gold label and the prediction,
    # one per line, followed by a blank separator line.
    print(term, [t.pos_ for t in term], label, pred, "", sep="\n")
    errors.append(term)

acquired immune deficiency syndrome
['VERB', 'ADJ', 'NOUN', 'NOUN']
Entity
event

ATP hydrolyzing transport protein
['PROPN', 'NOUN', 'NOUN', 'NOUN']
Entity
event

atp hydrolyzing transport protein
['NOUN', 'NOUN', 'NOUN', 'NOUN']
Entity
event

start codon
['VERB', 'NOUN']
Entity
event

abnormal cloned embryo
['ADJ', 'VERB', 'NOUN']
Entity
event

abortion
['NOUN']
Event
entity

abrade
['PROPN']
Event
entity

absorption spectrum
['NOUN', 'NOUN']
Entity
event

accessory pigment
['NOUN', 'NOUN']
Entity
event

acid-rain
['NOUN', 'PUNCT', 'NOUN']
Event
entity

fall
['NOUN']
Event
entity

acid reflux
['NOUN', 'NOUN']
Event
entity

acrosomal reaction
['ADJ', 'NOUN']
Event
entity

action potential
['NOUN', 'NOUN']
Event
entity

energy-of-activation
['NOUN', 'PUNCT', 'ADP', 'PUNCT', 'NOUN']
Entity
event

activation energy
['NOUN', 'NOUN']
Entity
event

free-energy-of-activation
['ADJ', 'PUNCT', 'NOUN', 'PUNCT', 'ADP', 'PUNCT', 'NOUN']
Entity
event

transport
['NOUN']
Event
entity

move
['NOUN']

In [78]:
# Summary: number of hits, number of misses, and accuracy (hits / total).
n_correct = len(correct)
n_errors = len(errors)
print(n_correct)
print(n_errors)
print(n_correct / (n_correct + n_errors))

7587
1580
0.8276426311770481
