# RE Data Pipeline Development

1. Extract Relation Triples
  1. Take existing lexicon of all nominalizations/synonyms for each base noun/verb concept. Condense these into unique lemmatized versions. (cache for re-use)
  2. Take all taxonomy, process, and structure relations & lemmatize arguments for relations (cache for re-use)
  3. Create dictionary mapping relation type (meronym, etc.) -> relation (has-part, etc.) -> dictionary with entity/event pairs as keys
  4. Find way to extract no-relation terms
2. Tag Sentences w/ Relations
  1. Take all extracted biology sentences and spacy process (cache for re-use)
  2. For each relation, check each sentence for existence of term pair. If exists, add sentence with <e1> and <e2> tags denoting location of terms to appropriate list in dictionary.

Result will be json file corresponding to dictionary mapping relation type (meronym, etc.) -> relation (has-part, etc.) -> entity/event pairs as keys -> list of sentences expressing such a relation

In [129]:
import pandas as pd
from io import StringIO

import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# fix for importing utils
import os
import sys
module_path = os.path.abspath(os.path.join('../utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
from utils import write_spacy_docs, read_spacy_docs

data_dir = "../data/relation_extraction"

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

# 1. Extract Relation Triples

First we condense all equivalent lexicon representations for a concept and spacy pre-process these lexical forms 

In [147]:
# read in nouns lexicon
with open(f"{data_dir}/raw_data/kb_lexicon_nouns.txt", "r") as f:
    lexicon = f.read()
lexicon = lexicon.replace(' | "n"', '').replace('"', '')
lexicon = pd.read_csv(StringIO(lexicon), sep="\s*\|\s*", header=None, names=["concept", "relation", "text"])

# create mapping from kb concept to unique text representations
lexicon = lexicon[~lexicon.text.str.contains("Concept-Word-Frame")]
lexicon = lexicon.groupby("concept")["text"].apply(lambda x: list(set(x))).reset_index()

# spacy process terms
spacy_terms = []
lemmas = []
for concept in lexicon.concept:
    terms = list(lexicon.loc[lexicon.concept == concept, "text"])[0]
    spacy_terms_tmp = [nlp(term) for term in terms]
    lemma_terms = list([" ".join([tok.lemma_ for tok in t]) for t in spacy_terms_tmp])
    spacy_terms += spacy_terms_tmp
    lemmas.append(lemma_terms)
    
write_spacy_docs(spacy_terms, f"{data_dir}/processed_data/kb_terms_spacy")
lexicon["lemmas"] = lemmas
lexicon

Unnamed: 0,concept,text,lemmas
0,"1,3-Bisphosphoglycerate","[PGAP, 1,3-bisphosphoglycerate]","[pgap, 1,3-bisphosphoglycerate]"
1,3-Prime-End,"[3'-end, 3 prime end]","[3 '- end, 3 prime end]"
2,5-Prime-End,"[5'-end, 5 prime end, five-prime-end]","[5 '- end, 5 prime end, five - prime - end]"
3,5-Prime-Nucleotide,"[5' base, 5'-nucleotide, five prime base]","[5 ' base, 5 '- nucleotide, five prime base]"
4,A-Antigen,"[A antigen, carbohydrate-a]","[a antigen, carbohydrate-a]"
...,...,...,...
2727,Zoned-Reserve,[zoned reserve],[zone reserve]
2728,Zoospore,[zoospore],[zoospore]
2729,Zygomycete,"[zygomycota, zygomycete]","[zygomycota, zygomycete]"
2730,Zygosporangium,[zygosporangium],[zygosporangium]


Next we extract all relations accounting for lexicon variability 

In [203]:
import itertools
import re
import string


relations_db = {}

def extract_lemmas(lexicon, concept, instance):
    if concept in lexicon.concept:
        instance_texts = lexicon.at[lexicon.concept == concept, "text"]
        instance_lemmas = lexicon.at[lexicon.concept == concept, "lemma"]
    else:
        instance_texts = []
        instance_lemmas = []
    if instance not in instance_texts:
        instance_texts.append(instance)
        instance_lemmas.append(" ".join([tok.lemma_ for tok in nlp(instance)]))
    instance_pairs = {}
    for lemma, text in zip(instance_lemmas, instance_texts):
        if lemma in instance_pairs:
            instance_pairs[lemma].append(text)
        else:
            instance_pairs[lemma] = [text]
    return instance_pairs

# extract taxonomy relations
with open(f"{data_dir}/raw_data/taxonomy.txt", "r") as f:
    taxonomy = f.readlines()
relations_db = {}
print(len(taxonomy))
for i, r in enumerate(taxonomy):
    
    if i % 100 == 0:
        print(i)
    
    c1, e1, relation, c2, e2 = [tok.strip() for tok in r.split("|")]
    
    if relation not in relations_db:
        relations_db[relation] = {}
        
    e1_pairs = extract_lemmas(lexicon, c1, e1)
    e2_pairs = extract_lemmas(lexicon, c2, e2)
        
    for pair in itertools.product(e1_pairs.keys(), e2_pairs.keys()):
        relations_db[relation][" | ".join(pair)] = {"sentences": [], "e1_representations": e1_pairs[pair[0]], "e2_representations": e2_pairs[pair[1]]}
    
# structure relations 
with open(f"{data_dir}/raw_data/structure_relations.txt", "r") as f:
    structure = f.readlines()
    
meronym_relations = ["has-part", "has-region", "element", "possesses", "material"]
spatial_relations = ["is_at", "is_inside", "is_outside", "abuts", "between"]
print(len(structure))
for i, r in enumerate(structure):
    
    if i % 100 == 0:
        print(i)
    
    _, _, c1, e1, relation, c2, e2 = [tok.strip() for tok in r.split("|")]
    
    c1 = c1.strip("_").replace("_ABOX_", "").rstrip(string.digits)
    c2 = c2.strip("_").replace("_ABOX_", "").rstrip(string.digits)
    
    if relation not in meronym_relations and relation not in spatial_relations:
        continue
        
    if relation not in relations_db:
        relations_db[relation] = {}
        
    e1_pairs = extract_lemmas(lexicon, c1, e1)
    e2_pairs = extract_lemmas(lexicon, c2, e2)
    
    for pair in itertools.product(e1_pairs.keys(), e2_pairs.keys()):
        relations_db[relation][" | ".join(pair)] = {"sentences": [], "e1_representations": e1_pairs[pair[0]], "e2_representations": e2_pairs[pair[1]]}
        
relations_db["no-relation"] = {}
print(relations_db)

6750
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
44402
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800

In [163]:
print(relations_db)

{'subclass-of': {('chemical', 'substance'): {'sentences': [], 'e1_representations': ['chemical'], 'e2_representations': ['substance']}, ('chemical', 'object'): {'sentences': [], 'e1_representations': ['chemical'], 'e2_representations': ['object']}, ('role', 'thing'): {'sentences': [], 'e1_representations': ['role'], 'e2_representations': ['thing']}, ('mixture', 'substance'): {'sentences': [], 'e1_representations': ['mixture'], 'e2_representations': ['substance']}, ('aggregate', 'object'): {'sentences': [], 'e1_representations': ['aggregate'], 'e2_representations': ['object']}, ('region', 'object'): {'sentences': [], 'e1_representations': ['region'], 'e2_representations': ['object']}, ('community', 'aggregate'): {'sentences': [], 'e1_representations': ['community'], 'e2_representations': ['aggregate']}, ('animate thing', 'object'): {'sentences': [], 'e1_representations': ['animate thing'], 'e2_representations': ['object']}, ('animate thing', 'organic entity'): {'sentences': [], 'e1_repr

# 2. Tag Sentences w/ Relations

First, we pre-process the sentences with spacy.

In [113]:
stax_bio_sentences = pd.read_csv(f"{data_dir}/raw_data/final_bio_parsed.csv")
exclude_sections = ["Preface", "Chapter Outline", "Index", "Chapter Outline", "Critical Thinking Questions", "Visual Connection Questions", 
                    "Key Terms", "Review Questions", "The Periodic Table of Elements", "Measurements and the Metric System"]
stax_bio_sentences = stax_bio_sentences[~(stax_bio_sentences.section_name.isin(exclude_sections))]
docs = [nlp(sent) for sent in stax_bio_sentences.sentence]
write_spacy_docs(docs, f"{data_dir}/processed_data/openstax_biology_sentences_spacy")

In [114]:
with open(f"{data_dir}/raw_data/life_bio_selected_sentences.txt", "r") as f:
    life_bio_sentences = f.readlines()
docs = [nlp(sent) for sent in life_bio_sentences]
write_spacy_docs(docs, f"{data_dir}/processed_data/life_biology_sentences_spacy")

Next, we tag sentences with relation pairs extracted earlier.

In [208]:
import spacy
from collections import defaultdict
def locate_relations(text, terms, relations, nlp=None):
    
    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)
        
    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if type(text) != spacy.tokens.doc.Doc:
        text = nlp(text)

    normalized_text = [token.lemma_ for token in text]
    tokenized_text = [token.text for token in text]

    # storage variables
    tagged_text = ['O'] * len(text)
    found_terms = defaultdict(lambda: {"text": [], "indices": [], "tag": []})

    # iterate through terms from longest to shortest
    terms = sorted(terms, key=len)[::-1]
    for term in terms:
        normalized_term = [token.lemma_ for token in term]

        for ix in range(len(text) - len(term)):

            if normalized_text[ix:ix + len(term)] == normalized_term:
                # only add term if not part of larger term
                if tagged_text[ix:ix + len(term)] == ["O"] * len(term):
                    lemma = " ".join(normalized_term)
                    found_terms[lemma]["text"].append(" ".join([token.text for token in term]))
                    found_terms[lemma]["indices"].append(ix)
                    found_terms[lemma]["tag"].append(" ".join([token.tag_ for token in term]))
                    tagged_text = tag_bioes(tagged_text, ix, len(term))
                    
    # extract relations
    for i in range(len(found_terms) - 1):
        for j in range(i + 1, len(found_terms)):
            terms = list(found_terms.keys())
            # sort alphabetically so no-relations will have same ordering across all texts 
            term1, term2 = sorted([terms[i], terms[j]])
            relation_count = 0
            for relation in relations:
                if " | ".join((term1, term2)) in relations[relation]: 
                    relation_text = " ".join(tokenized_text)
                    term_text = found_terms[term1]["text"][0]
                    relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
                    term_text = found_terms[term2]["text"][0] 
                    relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
                    relation_count += 1
                    relations[relation][" | ".join((term1, term2))]["sentences"].append(relation_text)
                elif " | ".join((term2, term1)) in relations[relation]: 
                    relation_text = " ".join(tokenized_text)
                    term_text = found_terms[term1]["text"][0]
                    relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
                    term_text = found_terms[term2]["text"][0] 
                    relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
                    relation_count += 1
                    relations[relation][" | ".join((term2, term1))]["sentences"].append(relation_text)
            if relation_count == 0:
                relation_text = " ".join(tokenized_text)
                term_text = found_terms[term1]["text"][0]
                relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
                term_text = found_terms[term2]["text"][0] 
                relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
                    
                if " | ".join((term1, term2)) not in relations["no-relation"]:
                    relations["no-relation"][" | ".join((term1, term2))] = {"sentences": [relation_text], "e1_representations": [], "e2_representations": []}
                else:
                    relations["no-relation"][" | ".join((term1, term2))]["sentences"].append(relation_text)
                
    
    return relations

def tag_bioes(tags, match_index, term_length):
    """ Updates tags for a text using the BIOES tagging scheme.

    B = beginning of term phrase
    I = interior of term phrase
    O = non-term
    E = end of term phrase
    S = singleton term

    Parameters
    ----------
    tags: list of str
        List of current BIOES tags for given tokenized text that we will be updating
    match_index: int
        Index at which the term was matched in the text
    term_length: int
        Number of tokens that compose the term ('cell wall' -> 2)

    Returns
    -------
    list of str
        Updated list of BIOES tags for the given tokenized text

    Examples
    --------

    >>> tag_bioes(['O', 'O', 'O', 'O'], 1, 2)
    ['O', 'B', 'E', 'O']

    >>> tag_bioes(['O', 'O', 'O', 'O'], 0, 1)
    ['S', 'O', 'O', 'O']

    >>> tag_bioes(['O', 'O', 'O', 'O'], 1, 3)
    ['O', 'B', 'I', 'E']

    """

    if term_length == 1:
        tags[match_index] = "S"
    else:
        for i in range(term_length):
            if i == 0:
                tags[match_index + i] = "B"
            elif i == term_length - 1:
                tags[match_index + i] = "E"
            else:
                tags[match_index + i] = "I"
    return tags

terms = read_spacy_docs(f"{data_dir}/processed_data/kb_terms_spacy")
sentences1 = read_spacy_docs(f"{data_dir}/processed_data/openstax_biology_sentences_spacy")
sentences2 = read_spacy_docs(f"{data_dir}/processed_data/life_biology_sentences_spacy")
sentences = sentences1 + sentences2

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

In [209]:
relations_db["no-relation"] = {}
for sentence in sentences:
    
    relations_db = locate_relations(sentence, terms, relations_db, nlp)
    

In [210]:
import json
with open(f"{data_dir}/processed_data/relations_db.json", "w") as f:
    json.dump(relations_db, f, indent=4)