In [10]:
import json
import spacy
import stanfordnlp
import pandas as pd
import warnings
import os
import sys
from spacy_stanfordnlp import StanfordNLPLanguage

# NLP preprocessing pipeline: a StanfordNLP English pipeline wrapped in
# StanfordNLPLanguage; later cells call nlp(text) to parse raw strings.
warnings.filterwarnings("ignore")  # suppress all warnings from here on
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

# Make the sibling data_processing directory importable (added to sys.path at
# most once), then pull in the doc (de)serialization helpers from it.
module_path = os.path.abspath("../data_processing")
if module_path not in sys.path:
    sys.path.append(module_path)
from data_processing_utils import read_spacy_docs, write_spacy_docs

# Load the knowledge-base lexicon; later cells read each entry's
# "text_representations" and "class_label" fields.
with open("../data/preprocessed_data/Life_Biology_kb_lexicon.json", "r") as lexicon_file:
    lexicon = json.load(lexicon_file)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

In [17]:
# Flatten the lexicon into two parallel lists: every text representation of
# every concept, and (repeated once per representation) the class label of the
# concept it came from.
texts = []
labels = []
for concept, entry in lexicon.items():
    representations = entry["text_representations"]
    texts.extend(representations)
    labels.extend([entry["class_label"]] * len(representations))

In [27]:
# Run the pipeline over every lexicon text representation, then hand the parsed
# docs to write_spacy_docs together with the output path.
spacy_texts = list(map(nlp, texts))
write_spacy_docs(
    spacy_texts,
    "../data/term_extraction/summary/lexicon_entries_spacy",
)

In [13]:
# Rebind spacy_texts from the same path the write_spacy_docs call uses.
# NOTE(review): presumably this reloads the previously parsed docs so the slow
# nlp(...) pass can be skipped, and returns them in the order they were written
# (the evaluation pairs them positionally with `labels`) — confirm against
# data_processing_utils.read_spacy_docs.
spacy_texts = read_spacy_docs("../data/term_extraction/summary/lexicon_entries_spacy", nlp)

In [50]:
# Rule-based Event/Entity baseline over the parsed lexicon terms, scored
# against the class labels taken from the lexicon.
errors = []
correct = []
nominal_endings = ["tion", "ing", "cycle", "sis", "lism", "ment", "ence", "sion"]
max_term_tokens = 3  # terms with more tokens than this are left out of the evaluation


def predict_class_label(term, endings):
    """Guess whether a parsed term names an "Event" or an "Entity".

    The term is predicted to be an "Event" if any of the following holds,
    checked in this order; otherwise it is an "Entity":

    * some token's ``pos_`` is ``"VERB"``;
    * the term's ``text`` contains the substring ``"process"`` (case-sensitive);
    * some token's ``text`` ends with one of ``endings``.

    Args:
        term: Parsed term; iterating it must yield tokens exposing ``pos_``
            and ``text``, and the term itself must expose ``text``.
        endings: Tuple of suffix strings (``str.endswith`` requires a tuple,
            not a list).

    Returns:
        The string ``"Event"`` or ``"Entity"``.
    """
    if any(token.pos_ == "VERB" for token in term):
        return "Event"
    if "process" in term.text:
        return "Event"
    if any(token.text.endswith(endings) for token in term):
        return "Event"
    return "Entity"


# str.endswith accepts a str or a tuple of str, so freeze the list once up front.
ending_suffixes = tuple(nominal_endings)

# NOTE(review): zip stops at the shorter input, so if spacy_texts and labels
# ever differ in length the unmatched tail is silently dropped — confirm they
# line up one-to-one.
for term, label in zip(spacy_texts, labels):
    if len(term) > max_term_tokens:
        continue
    pred = predict_class_label(term, ending_suffixes)

    if pred == label:
        correct.append(term)
    else:
        # Only missed Events are printed (term, then its POS tags) for manual
        # inspection; every misclassification is still counted in `errors`.
        if label == "Event":
            print(term)
            print(" ".join(t.pos_ for t in term))
        errors.append(term)
print(len(errors))

abrade
PROPN
acid-rain
NOUN PUNCT NOUN
fall
NOUN
acid reflux
NOUN NOUN
transport
NOUN
move
NOUN
transport
NOUN
active transport
ADJ NOUN
transport
NOUN
transport
NOUN
primary active transport
ADJ ADJ NOUN
adaptive immunity
ADJ NOUN
specific-immunity
NOUN
specific immunity
ADJ NOUN
adulthood
NOUN
aerobic
NOUN
age structure
NOUN NOUN
agonistic behavior
ADJ NOUN
red-tide
ADJ PUNCT NOUN
algal-bloom
NOUN PUNCT NOUN
harmful-algal-bloom
ADV
algae bloom
NOUN NOUN
hab
INTJ
allee effect
PROPN NOUN
allergy
NOUN
splice
NOUN
altruism
NOUN
anabolic reactions
ADJ NOUN
biosynthetic-pathway
NOUN PUNCT NOUN
anabolic pathway
ADJ NOUN
anabolic-pathway
ADJ
anaphase
PROPN
anaphase-I
NOUN PUNCT NUM
anaphase i
NOUN X
anaphase-II
PROPN PUNCT NUM
anaphase ii
PROPN NUM
anatomy
NOUN
anchor
NOUN
anchor
NOUN
animal cell growth
NOUN NOUN NOUN
animal growth
NOUN NOUN
animal husbandry
NOUN NOUN
culture
NOUN
culture
NOUN
animal transport
NOUN NOUN
anneal
NOUN
antibiotic resistance
ADJ NOUN
be resistant
AUX ADJ
present


In [40]:
# Number of evaluated terms (those not skipped for having more than 3 tokens)
# whose predicted label matched the lexicon label.
# NOTE(review): this cell's execution count (40) is lower than that of the
# classification cell (50), so the recorded output may come from an earlier
# version of the rules — re-run top to bottom before trusting the number.
print(len(correct))

7338
