# Relation Data EDA

This notebook computes summary statistics and performs sanity checks on the relation data processing pipeline (raw_data -> relations database and sentences tagged with key terms).

In [22]:
import pandas as pd
import pickle
import json
from collections import defaultdict
import spacy
import stanfordnlp
import pandas as pd
from spacy_stanfordnlp import StanfordNLPLanguage
import warnings
import matplotlib.pyplot as plt
import os
import sys
import numpy as np
from ipywidgets import interact

# nlp preprocessing pipeline
# NOTE: this silences *all* Python warnings for the rest of the kernel session,
# not only the ones emitted while the models load.
warnings.filterwarnings('ignore')
# Build the English StanfordNLP pipeline and wrap it in StanfordNLPLanguage;
# the resulting `nlp` object is what gets passed to read_spacy_docs / tag_terms.
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

# fix for importing utils
# The helpers live in ../data_processing (relative to the notebook's working
# directory), so that folder is appended to sys.path once before importing.
module_path = os.path.abspath(os.path.join('../data_processing'))
if module_path not in sys.path:
    sys.path.append(module_path)
from data_processing_utils import read_spacy_docs, tag_terms

# Base directory used when writing the relation-extraction diagnostics files.
data_dir = '../data/relation_extraction'

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/mattboggess/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/mattboggess/sta

# What concepts actually match the text?


In [2]:
# Read the preprocessed sentences of both textbooks (Biology 2e, then Life
# Biology) into one sequence, and the knowledge-base key terms separately.
# NOTE(review): presumably read_spacy_docs returns a list of spaCy docs, since
# the result is extended with += and later iterated/len()'d - confirm in
# data_processing_utils.
bio_sentences = read_spacy_docs("../data/preprocessed_data/Biology_2e_sentences_spacy", nlp)
bio_sentences += read_spacy_docs("../data/preprocessed_data/Life_Biology_sentences_spacy", nlp)
kb_terms = read_spacy_docs("../data/preprocessed_data/Life_Biology_kb_key_terms_spacy", nlp)

In [3]:
from collections import Counter

# Tally, across every textbook sentence, how many times each KB term is tagged.
term_counts = Counter()
for i, sentence in enumerate(bio_sentences):
    # Lightweight progress report every 500 sentences.
    if i % 500 == 0:
        print(f"tagging sentence {i}/{len(bio_sentences)}")
    # Only the third element returned by tag_terms (per-term info) is needed.
    _, _, term_info = tag_terms(sentence, kb_terms, nlp)
    # Each term contributes one count per entry in its "indices" list.
    per_sentence_counts = {}
    for term in term_info:
        per_sentence_counts[term] = len(term_info[term]["indices"])
    for term, occurrences in per_sentence_counts.items():
        term_counts[term] += occurrences

tagging sentence 0/44162
tagging sentence 500/44162
tagging sentence 1000/44162
tagging sentence 1500/44162
tagging sentence 2000/44162
tagging sentence 2500/44162
tagging sentence 3000/44162
tagging sentence 3500/44162
tagging sentence 4000/44162
tagging sentence 4500/44162
tagging sentence 5000/44162
tagging sentence 5500/44162
tagging sentence 6000/44162
tagging sentence 6500/44162
tagging sentence 7000/44162
tagging sentence 7500/44162
tagging sentence 8000/44162
tagging sentence 8500/44162
tagging sentence 9000/44162
tagging sentence 9500/44162
tagging sentence 10000/44162
tagging sentence 10500/44162
tagging sentence 11000/44162
tagging sentence 11500/44162
tagging sentence 12000/44162
tagging sentence 12500/44162
tagging sentence 13000/44162
tagging sentence 13500/44162
tagging sentence 14000/44162
tagging sentence 14500/44162
tagging sentence 15000/44162
tagging sentence 15500/44162
tagging sentence 16000/44162
tagging sentence 16500/44162
tagging sentence 17000/44162
tagging s

In [10]:
# Load the KB lexicon: concept name -> entry holding "lemma_representations"
# and "text_representations" (the two keys read below).
with open("../data/preprocessed_data/Life_Biology_kb_lexicon.json", "r") as f:
    lexicon = json.load(f)

# Build the lookup set of tagged terms ONCE. The previous version rebuilt
# set(term_counts.keys()) for every single representation of every concept.
tagged_terms = set(term_counts)

# Split concepts by whether any of their lemma or text representations was
# tagged somewhere in the textbook sentences.
tagged_concepts = []
non_tagged_concepts = []
for concept, entry in lexicon.items():
    # Lemma forms are checked first; text forms are only consulted when no
    # lemma form matched (same short-circuit order as before).
    is_tagged = (
        any(lemma in tagged_terms for lemma in entry["lemma_representations"])
        or any(text in tagged_terms for text in entry["text_representations"])
    )
    if is_tagged:
        tagged_concepts.append({concept: entry})
    else:
        non_tagged_concepts.append({concept: entry})
print(f"{len(tagged_concepts)}/{len(non_tagged_concepts) + len(tagged_concepts)} bio kb concepts tagged in text")

# Export the concepts that never matched, with their surface forms, for
# manual inspection.
df = {"concept": [], "text": []}
for record in non_tagged_concepts:
    # Each record is a single-entry dict {concept_name: lexicon_entry}.
    (c, entry), = record.items()
    df["concept"].append(c)
    df["text"].append(entry["text_representations"])
pd.DataFrame(df).to_excel("../data/relation_extraction/diagnostics/not_tagged_concepts.xlsx", index=False)

3963/5941 bio kb concepts tagged in text


# Relations Database EDA

In [38]:
# Load the relations database. As indexed by the cells below, it is a nested
# mapping: relation name -> term pair string -> record with at least the keys
# "sentences" and "concept_pair".
with open("../data/relation_extraction/relations_db.json", "r") as f:
    rdb = json.load(f)

## Summary Statistics


In [39]:
# Flatten the relations database into one row per (relation, term pair),
# recording how many sentences were found for that pair.
pair_rows = [
    (relation, term_pair, len(pair_info["sentences"]))
    for relation, term_pairs in rdb.items()
    for term_pair, pair_info in term_pairs.items()
]
long_df = pd.DataFrame({
    "relation": [row[0] for row in pair_rows],
    "term_pair": [row[1] for row in pair_rows],
    "count_sentences": [row[2] for row in pair_rows],
    # A pair counts as "found" when at least one sentence is attached to it.
    "found_sentence": [row[2] > 0 for row in pair_rows],
})

In [41]:
# Per (relation, found_sentence) group: number of term pairs, plus the total
# and mean number of sentences attached to those pairs.
summary_df = long_df.groupby(["relation", "found_sentence"]).agg({"term_pair": "count",
                                                                  "count_sentences": ["sum", "mean"]})
# The group labels live in the index, so the index must be written out.
# Passing index=False produced a CSV of bare numbers with no relation or
# found_sentence columns to identify each row.
summary_df.to_csv("../data/relation_extraction/summary/relation_counts_summary.csv")
summary_df

Unnamed: 0_level_0,Unnamed: 1_level_0,term_pair,count_sentences,count_sentences
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean
relation,found_sentence,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
abuts,False,140,0,0.0
abuts,True,26,146,5.615385
element,False,443,0,0.0
element,True,92,629,6.836957
has-part,False,5704,0,0.0
has-part,True,863,6553,7.593279
has-region,False,2817,0,0.0
has-region,True,234,1405,6.004274
is-at,False,258,0,0.0
is-at,True,46,331,7.195652


## Sentence Count Distributions

In [19]:
from ipywidgets import interact

def plot_sentence_counts(relation):
    """Plot a histogram of sentences-per-term-pair for one relation.

    Only term pairs of ``relation`` with at least one sentence and fewer than
    30 sentences are included, so the long tail does not flatten the plot.

    Parameters
    ----------
    relation : str
        Relation name to filter ``long_df`` on (a key of ``rdb``).
    """
    # `@relation` binds the Python variable inside the query expression, so
    # relation names containing quotes cannot break the query string.
    subset = long_df.query("count_sentences > 0 & relation == @relation")
    subset = subset.query("count_sentences < 30")

    plt.hist(subset.count_sentences, bins=30)
    # Label the figure so it is readable on its own, whichever relation the
    # dropdown currently selects.
    plt.title(f"Sentences per term pair: {relation}")
    plt.xlabel("number of sentences")
    plt.ylabel("number of term pairs")
    plt.show()
    
# Let the reader pick a relation interactively; the options are all keys of
# rdb (which, per the loops elsewhere in this notebook, includes "no-relation").
interact(plot_sentence_counts, relation=list(rdb.keys()))

dict_keys(['no-relation', 'subclass-of', 'has-part', 'possesses', 'has-region', 'is-inside', 'is-at', 'element', 'abuts', 'is-outside'])


interactive(children=(Dropdown(description='relation', options=('no-relation', 'subclass-of', 'has-part', 'pos…

<function __main__.plot_sentence_counts(relation)>

## Multi-Label Term Pairs

In [35]:
# Map every term pair to (concept pair, [relations it appears under]),
# ignoring the "no-relation" bucket. The concept pair stored is the one from
# the first relation in which the term pair is encountered.
term_relation_mapping = {}
for relation, term_pairs in rdb.items():
    if relation == "no-relation":
        continue
    for tp, pair_info in term_pairs.items():
        cp = pair_info["concept_pair"]
        term_relation_mapping.setdefault(tp, (cp, []))[1].append(relation)

# Term pairs labelled with more than one relation.
multi_label = {
    pair: labelled
    for pair, labelled in term_relation_mapping.items()
    if len(labelled[1]) > 1
}
print(len(multi_label))
#print(np.unique(list(multi_label.values()), return_counts=True))
multi_label

247


{'substance -> object': ('Substance -> Tangible-Entity',
  ['subclass-of', 'has-part']),
 'mixture -> substance': ('Mixture -> Substance', ['subclass-of', 'has-part']),
 'membrane -> membrane': ('Biomembrane -> Membrane', ['subclass-of', 'is-at']),
 'segment of body -> anatomical structure': ('Body-Segment -> Anatomical-Structure',
  ['subclass-of', 'has-part']),
 'bone -> connective tissue': ('Bone -> Connective-Tissue',
  ['subclass-of', 'has-part']),
 'CDNA -> dna': ('CDNA -> DNA', ['subclass-of', 'abuts']),
 'circular dna -> dna': ('Circular-DNA -> DNA', ['subclass-of', 'has-part']),
 'plasmid -> dna': ('Circular-DNA -> DNA', ['subclass-of', 'has-part']),
 'gap junction -> pore': ('Cytoplasmic-Channel -> Pore',
  ['subclass-of', 'has-region']),
 'diet -> object': ('Diet -> Tangible-Entity', ['subclass-of', 'element']),
 'gene -> dna sequence': ('Gene -> DNA-Sequence', ['subclass-of', 'has-part']),
 'glycosidic linkage -> polar covalent bond': ('Glycosidic-Linkage -> Polar-Covalent-

# Sanity Check Relation Extraction 

- How many term pairs co-occur in a textbook sentence under plain string matching but are not matched by the pipeline?
- How many term pairs have no matching sentences at all? Do these cases seem reasonable?
- Which term pairs have too many matches?

In [36]:
# Re-read the sentences of both textbooks (the same two files loaded in the
# concept-matching section above), so this section can run without first
# executing that section's sentence-loading cell.
bio_sentences = read_spacy_docs("../data/preprocessed_data/Biology_2e_sentences_spacy", nlp)
bio_sentences += read_spacy_docs("../data/preprocessed_data/Life_Biology_sentences_spacy", nlp)

In [37]:
# Concept pairs for which at least one term pair was tagged by the pipeline.
found_concepts = set()
# Column-wise accumulator; one entry is appended to every list per term pair
# (except "synonym-tagged", which is filled in a second pass below).
df = {"sentences": [], 
      "relation": [], 
      "term-pair": [], 
      "concept-pair": [], 
      "tagged": [], 
      "synonym-tagged": [], 
      "count_sentences": [], 
      "textbook_match": [],
      "term1_found": [],
      "term2_found": []} 

# Convert every sentence to plain text exactly once. Previously each untagged
# term pair re-ran str() on all sentences, which made this cell impractically
# slow.
sentence_texts = [str(sentence) for sentence in bio_sentences]

# term -> set of indices into sentence_texts whose text contains the term.
_term_hits = {}


def _sentences_containing(term):
    """Return the (memoized) set of sentence indices whose text contains `term`."""
    if term not in _term_hits:
        _term_hits[term] = {idx for idx, text in enumerate(sentence_texts) if term in text}
    return _term_hits[term]


for relation in rdb:
    print(relation)
    if relation == "no-relation":
        continue
    term_pairs = rdb[relation]
    for i, term_pair in enumerate(term_pairs):
        if i % 100 == 0:
            print(f"Processing term pair {i}/{len(term_pairs)}")

        pair_info = term_pairs[term_pair]
        sentences = pair_info["sentences"]
        count_sentences = len(sentences)
        concept_pair = pair_info["concept_pair"]

        if count_sentences:
            # The pipeline tagged this pair, so both terms and a co-occurring
            # sentence are known to exist.
            tagged = True
            found_term1 = found_term2 = textbook_match = True
            found_concepts.add(concept_pair)
        else:
            # Fall back to plain substring search over the raw sentence text
            # to see whether the terms occur at all, and whether they co-occur.
            tagged = False
            terms = term_pair.split(" -> ")
            hits_term1 = _sentences_containing(terms[0])
            hits_term2 = _sentences_containing(terms[1])
            found_term1 = len(hits_term1) > 0
            found_term2 = len(hits_term2) > 0
            # sorted() restores textbook order for the co-occurring sentences.
            sentences = [sentence_texts[idx] for idx in sorted(hits_term1 & hits_term2)]
            textbook_match = len(sentences) > 0

        df["relation"].append(relation)
        df["term-pair"].append(term_pair)
        df["concept-pair"].append(concept_pair)
        df["tagged"].append(tagged)
        df["term1_found"].append(found_term1)
        df["term2_found"].append(found_term2)
        df["textbook_match"].append(textbook_match)
        # count_sentences is always the number of pipeline-tagged sentences,
        # even when "sentences" holds substring-search matches instead.
        df["count_sentences"].append(count_sentences)
        df["sentences"].append("\n".join(sentences))

# Second pass: a pair is "synonym-tagged" when some term pair sharing its
# concept pair was tagged. This needs the complete found_concepts set.
for cp in df["concept-pair"]:
    df["synonym-tagged"].append(cp in found_concepts)

df_df = pd.DataFrame(df)
df_df.to_excel(f"{data_dir}/diagnostics/relations_info.xlsx")
df_df.head()

no-relation
subclass-of
Processing term pair 0/19180


KeyboardInterrupt: 

In [None]:
def group_pairs(row):
    """Classify one term-pair row into a diagnostic group.

    The checks run in priority order and the first that applies wins:
    "Tagged", "TB_Match_Not_Tagged", "Synonym Tagged", then either
    "Terms Not Same Sentence" (both terms were found) or "Missing Term".

    `row` is any mapping-like object exposing the keys "tagged",
    "textbook_match", "synonym-tagged", "term1_found" and "term2_found".
    """
    if row["tagged"]:
        return "Tagged"
    if row["textbook_match"]:
        return "TB_Match_Not_Tagged"
    if row["synonym-tagged"]:
        return "Synonym Tagged"
    # Both terms occur somewhere in the text, just never in the same sentence.
    if row["term1_found"] and row["term2_found"]:
        return "Terms Not Same Sentence"
    return "Missing Term"

# Label every term pair with its diagnostic group on a fresh copy, leaving
# df_df itself untouched.
df_copy = df_df.assign(group=df_df.apply(group_pairs, axis=1))
df_copy.head()

In [None]:
# Count term pairs per (relation, group) and save the table as a flat CSV.
(
    df_copy
    .groupby(["relation", "group"])["sentences"]
    .count()
    .reset_index()
    .rename(columns={"sentences": "count"})
    .to_csv(f"{data_dir}/diagnostics/relations_summary.csv", index=False)
)