# RE Data Pipeline Development

1. Extract Relation Triples
  1. Take existing lexicon of all nominalizations/synonyms for each base noun/verb concept. Condense these into unique lemmatized versions. (cache for re-use)
  2. Take all taxonomy, process, and structure relations & lemmatize arguments for relations (cache for re-use)
  3. Create dictionary mapping relation type (meronym, etc.) -> relation (has-part, etc.) -> dictionary with entity/event pairs as keys
  4. Find way to extract no-relation terms
2. Tag Sentences w/ Relations
  1. Take all extracted biology sentences and spacy process (cache for re-use)
  2. For each relation, check each sentence for existence of term pair. If exists, add sentence with <e1> and <e2> tags denoting location of terms to appropriate list in dictionary.

Result will be json file corresponding to dictionary mapping relation type (meronym, etc.) -> relation (has-part, etc.) -> entity/event pairs as keys -> list of sentences expressing such a relation

In [225]:
import pandas as pd
import json
from io import StringIO
import spacy
from collections import defaultdict

import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# fix for importing utils
import os
import sys
module_path = os.path.abspath(os.path.join('../utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
from utils import write_spacy_docs, read_spacy_docs, tag_bioes
from data_processing import get_closest_match

data_dir = "../data/relation_extraction"

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

# 1. Extract Relation Triples

First we condense all equivalent lexicon representations for a concept and spacy pre-process these lexical forms 

In [216]:
def process_lexicon(lexicon_file):
    """ Takes in a lexicon file consisting of concept text representation pairs and turns this into a list
    of Spacy processed terms and a lexicon csv mapping KB concepts to lists of text representations and their lemma forms.
    
    """
    
    with open(lexicon_file, "r") as f:
        lexicon = f.read()
    
    # get rid of extra column and read in as dataframe
    lexicon = lexicon.replace(' | "n"', '').replace('"', '')
    lexicon = pd.read_csv(StringIO(lexicon), sep="\s*\|\s*", header=None, names=["concept", "relation", "text"])

    # create mapping from kb concept to unique text representations
    lexicon = lexicon[~lexicon.text.str.contains("Concept-Word-Frame")]
    lexicon = lexicon.groupby("concept")["text"].apply(lambda x: list(set(x))).reset_index()

    # spacy process terms to get lemmas
    spacy_terms = []
    lemmas = []
    for concept in lexicon.concept:
        terms = list(lexicon.loc[lexicon.concept == concept, "text"])[0]
        spacy_terms_tmp = [nlp(term) for term in terms]
        lemma_terms = list([" ".join([tok.lemma_ for tok in t]) for t in spacy_terms_tmp])
        spacy_terms += spacy_terms_tmp
        lemmas.append(lemma_terms)

    lexicon["lemmas"] = lemmas
    return spacy_terms, lexicon

terms, lexicon = process_lexicon(f"{data_dir}/raw_data/kb_lexicon_nouns.txt")
write_spacy_docs(terms, f"{data_dir}/processed_data/kb_terms_spacy")
lexicon.to_csv(f"{data_dir}/processed_data/lexicon.csv", index=False)

In [226]:
lexicon

Unnamed: 0,concept,text,lemmas
0,"1,3-Bisphosphoglycerate","[PGAP, 1,3-bisphosphoglycerate]","[pgap, 1,3-bisphosphoglycerate]"
1,3-Prime-End,"[3'-end, 3 prime end]","[3 '- end, 3 prime end]"
2,5-Prime-End,"[5'-end, 5 prime end, five-prime-end]","[5 '- end, 5 prime end, five - prime - end]"
3,5-Prime-Nucleotide,"[5' base, 5'-nucleotide, five prime base]","[5 ' base, 5 '- nucleotide, five prime base]"
4,A-Antigen,"[A antigen, carbohydrate-a]","[a antigen, carbohydrate-a]"
...,...,...,...
2727,Zoned-Reserve,[zoned reserve],[zone reserve]
2728,Zoospore,[zoospore],[zoospore]
2729,Zygomycete,"[zygomycota, zygomycete]","[zygomycota, zygomycete]"
2730,Zygosporangium,[zygosporangium],[zygosporangium]


Next we extract all relations accounting for lexicon variability 

In [224]:
import itertools
import re
import string

def extract_lemmas(lexicon, concept, instance):
    """ For a given concept, text representation in a relation this function 
    extracts all lemma representations of that concept and the associated text representations.
    """
    
    # pull out equivalent lemma, text representations from lexicon
    if concept in lexicon.concept:
        instance_texts = lexicon.at[lexicon.concept == concept, "text"]
        instance_lemmas = lexicon.at[lexicon.concept == concept, "lemma"]
    else:
        instance_texts = []
        instance_lemmas = []
        
    # add particular text from relation if not present
    if instance not in instance_texts:
        instance_texts.append(instance)
        instance_lemmas.append(" ".join([tok.lemma_ for tok in nlp(instance)]))
        
    # create a dictionary mapping all lemma forms to corresponding text forms for this concept
    instance_pairs = {}
    for lemma, text in zip(instance_lemmas, instance_texts):
        if lemma in instance_pairs:
            instance_pairs[lemma].append(text)
        else:
            instance_pairs[lemma] = [text]
            
    return instance_pairs

def parse_relations(file, relation_type, lexicon, relations_db, include_relations=None):
    """ Parses all relations from a text file containing relations and adds them to
    the relations database.
    
    1. Parse each relation into concepts, relations, and text representation
    2. Extract all unique lemma representations and associated text representations for each entity/event pair
    3. Add each unique relation pair to the relations database
    
    """
    with open(file) as f:
        data = f.readlines()
    
    for i, r in enumerate(data):
    
        if i % 100 == 0:
            print(i)
    
        if relation_type == "taxonomy":
            c1, e1, relation, c2, e2 = [tok.strip() for tok in r.split("|")]
        elif relation_type == "structure":
            _, _, c1, e1, relation, c2, e2 = [tok.strip() for tok in r.split("|")]

            c1 = c1.strip("_").replace("_ABOX_", "").rstrip(string.digits)
            c2 = c2.strip("_").replace("_ABOX_", "").rstrip(string.digits)
        
        if include_relations and relation not in include_relations:
            continue
    
        if relation not in relations_db:
            relations_db[relation] = {}
        
        e1_pairs = extract_lemmas(lexicon, c1, e1)
        e2_pairs = extract_lemmas(lexicon, c2, e2)
        
        for pair in itertools.product(e1_pairs.keys(), e2_pairs.keys()):
            relations_db[relation][" -> ".join(pair)] = {"sentences": [], 
                                                         "e1_representations": e1_pairs[pair[0]], 
                                                         "e2_representations": e2_pairs[pair[1]]}
    
    return relations_db
    
meronym_relations = ["has-part", "has-region", "element", "possesses", "material"]
spatial_relations = ["is-at", "is-inside", "is-outside", "abuts", "between"]
taxonomy_relations = ["subclass-of", "instance-of"]
include_relations = meronym_relations + spatial_relations + taxonomy_relations

relations_db = {"no-relation": {}}
for relation_type in ["taxonomy", "structure"]:
    file = f"{data_dir}/raw_data/{relation_type}_relations.txt"
    relations_db = parse_relations(file, relation_type, lexicon, relations_db, include_relations)
    
with open(f"{data_dir}/processed_data/relations_db_intermediate.json", "w") as f:
    json.dump(relations_db, f, indent=4)
relations_db

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
1300

{'no-relation': {},
 'subclass-of': {'chemical -> substance': {'sentences': [],
   'e1_representations': ['chemical'],
   'e2_representations': ['substance']},
  'chemical -> object': {'sentences': [],
   'e1_representations': ['chemical'],
   'e2_representations': ['object']},
  'role -> thing': {'sentences': [],
   'e1_representations': ['role'],
   'e2_representations': ['thing']},
  'mixture -> substance': {'sentences': [],
   'e1_representations': ['mixture'],
   'e2_representations': ['substance']},
  'aggregate -> object': {'sentences': [],
   'e1_representations': ['aggregate'],
   'e2_representations': ['object']},
  'region -> object': {'sentences': [],
   'e1_representations': ['region'],
   'e2_representations': ['object']},
  'community -> aggregate': {'sentences': [],
   'e1_representations': ['community'],
   'e2_representations': ['aggregate']},
  'animate thing -> object': {'sentences': [],
   'e1_representations': ['animate thing'],
   'e2_representations': ['object']

# 2. Tag Sentences w/ Relations

First, we pre-process the sentences with spacy.

In [113]:
stax_bio_sentences = pd.read_csv(f"{data_dir}/raw_data/final_bio_parsed.csv")
exclude_sections = ["Preface", "Chapter Outline", "Index", "Chapter Outline", "Critical Thinking Questions", 
                    "Visual Connection Questions", "Key Terms", "Review Questions", 
                    "The Periodic Table of Elements", "Measurements and the Metric System"]
stax_bio_sentences = stax_bio_sentences[~(stax_bio_sentences.section_name.isin(exclude_sections))]
docs = [nlp(sent) for sent in stax_bio_sentences.sentence]
write_spacy_docs(docs, f"{data_dir}/processed_data/openstax_biology_sentences_spacy")

In [114]:
with open(f"{data_dir}/raw_data/life_bio_selected_sentences.txt", "r") as f:
    life_bio_sentences = f.readlines()
docs = [nlp(sent) for sent in life_bio_sentences]
write_spacy_docs(docs, f"{data_dir}/processed_data/life_biology_sentences_spacy")

In [232]:
indices = ((3, 4), (5, 6))
[i for ind in indices for i in ind]

[3, 4, 5, 6]

In [242]:
test = "7.1.1.16.6	Chapters 27 and 28 cover the green plants (D)."
re.sub("^(\d*\.*)+\s*", "", test)

'Chapters 27 and 28 cover the green plants (D).'

Next, we tag sentences with relation pairs extracted earlier.

In [208]:


def add_relation(term1, term2, found_terms, tokenized_text, relation_type, relations_db):
    
    term_pair = " -> ".join((term1, term2))
    for relation in relations_db:
        if term_pair in relations_db[relation]: 
            
        relation_text = " ".join(tokenized_text)
        term_text = found_terms[term1]["text"][0]
        relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
        term_text = found_terms[term2]["text"][0] 
        relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
        relation_count += 1
        relations[relation][" | ".join((term1, term2))]["sentences"].append(relation_text)
    
def tag_relations(text, terms, relations_db, nlp=None):
    
    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)
        
    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if type(text) != spacy.tokens.doc.Doc:
        text = nlp(text)

    normalized_text = [token.lemma_ for token in text]
    tokenized_text = [token.text for token in text]

    # storage variables

                    
    # extract relations
    for i in range(len(found_terms) - 1):
        for j in range(i + 1, len(found_terms)):
            terms = list(found_terms.keys())
            # sort alphabetically so no-relations will have same ordering across all texts 
            term1, term2 = sorted([terms[i], terms[j]])
            relation_count = 0
            for relation in relations:
                if " | ".join((term1, term2)) in relations[relation]: 
                    relation_text = " ".join(tokenized_text)
                    term_text = found_terms[term1]["text"][0]
                    relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
                    term_text = found_terms[term2]["text"][0] 
                    relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
                    relation_count += 1
                    relations[relation][" | ".join((term1, term2))]["sentences"].append(relation_text)
                elif " | ".join((term2, term1)) in relations[relation]: 
                    relation_text = " ".join(tokenized_text)
                    term_text = found_terms[term1]["text"][0]
                    relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
                    term_text = found_terms[term2]["text"][0] 
                    relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
                    relation_count += 1
                    relations[relation][" | ".join((term2, term1))]["sentences"].append(relation_text)
            if relation_count == 0:
                relation_text = " ".join(tokenized_text)
                term_text = found_terms[term1]["text"][0]
                relation_text = relation_text.replace(term_text, f"<e1> {term_text} </e1>")
                term_text = found_terms[term2]["text"][0] 
                relation_text = relation_text.replace(term_text, f"<e2> {term_text} </e2>")
                    
                if " | ".join((term1, term2)) not in relations["no-relation"]:
                    relations["no-relation"][" | ".join((term1, term2))] = {"sentences": [relation_text], "e1_representations": [], "e2_representations": []}
                else:
                    relations["no-relation"][" | ".join((term1, term2))]["sentences"].append(relation_text)
                
    
    return relations

terms = read_spacy_docs(f"{data_dir}/processed_data/kb_terms_spacy", nlp)
sentences1 = read_spacy_docs(f"{data_dir}/processed_data/openstax_biology_sentences_spacy", nlp)
sentences2 = read_spacy_docs(f"{data_dir}/processed_data/life_biology_sentences_spacy", nlp)
sentences = sentences1 + sentences2

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

In [209]:
relations_db["no-relation"] = {}
for sentence in sentences:
    
    relations_db = locate_relations(sentence, terms, relations_db, nlp)
    

In [210]:
import json
with open(f"{data_dir}/processed_data/relations_db.json", "w") as f:
    json.dump(relations_db, f, indent=4)

# Explore Relations

In [267]:
with open(f"{data_dir}/processed_data/relations_db.json", "r") as f:
    db = json.load(f)

df = {"relation": [], "num_word_pairs": [], "word_pair": [], "num_sentences": []}

for rt in db:
    num_word_pairs = len(db[rt])
    for wp in db[rt]:
        df["relation"].append(rt)
        df["num_word_pairs"].append(num_word_pairs)
        df["word_pair"].append(wp)
        df["num_sentences"].append(len(db[rt][wp]["sentences"]))
df = pd.DataFrame(df)
df

Unnamed: 0,relation,num_word_pairs,word_pair,num_sentences
0,no-relation,85446,light microscope -> end,1
1,no-relation,85446,end -> light microscope,1
2,no-relation,85446,light microscope -> alga,1
3,no-relation,85446,alga -> light microscope,1
4,no-relation,85446,end -> alga,2
...,...,...,...,...
100431,is-outside,166,cytoplasm -> mitochondrion,2
100432,is-outside,166,extranuclear gene -> nucleus,0
100433,is-outside,166,extracellular side -> wall cell inside isotoni...,0
100434,is-outside,166,extra cellular matrix -> mate type alpha,0


In [271]:
df["zero"] = df["num_sentences"] == 0
df_avg = df.groupby(["relation", "zero"]).agg({"num_word_pairs": ["mean", "count"],
                                     "num_sentences": ["sum", "mean"]})
df_avg

Unnamed: 0_level_0,Unnamed: 1_level_0,num_word_pairs,num_word_pairs,num_sentences,num_sentences
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,sum,mean
relation,zero,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
abuts,False,110,18,146,8.111111
abuts,True,110,92,0,0.0
element,False,497,76,530,6.973684
element,True,497,421,0,0.0
has-part,False,4281,465,4714,10.137634
has-part,True,4281,3816,0,0.0
has-region,False,1561,117,861,7.358974
has-region,True,1561,1444,0,0.0
is-at,False,246,36,316,8.777778
is-at,True,246,210,0,0.0


In [269]:
df.groupby(["relation", "zero"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,num_word_pairs,word_pair,num_sentences
relation,zero,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abuts,False,18,18,18
abuts,True,92,92,92
element,False,76,76,76
element,True,421,421,421
has-part,False,465,465,465
has-part,True,3816,3816,3816
has-region,False,117,117,117
has-region,True,1444,1444,1444
is-at,False,36,36,36
is-at,True,210,210,210


In [264]:
data = pd.read_csv(f"{data_dir}/raw_data/final_bio_parsed.csv")
data[(data.sentence.str.contains("nucleus") & (data.sentence.str.contains("cytosol")))].sentence.iat[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [265]:
from spacy.lang.en.stop_words import STOP_WORDS

In [266]:
STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron