# Distant Supervision Relation Extraction EDA

This notebook explores approaches for relation extraction, formulated as follows:

    Given a set of sentences & key term pairs -> 
    Classify any relations present between each pair of terms 

# Distant Supervision

Here, we leverage an existing partial knowledge base (from Inquire in this case) and use it to train a model to predict further relations between key terms. This is known as distant supervision because we construct labels noisily, assuming every pair of terms exhibiting a relationship in the knowledge base will exhibit that relationship for all instances in the text.

Key Limitations:
- Does not generalize to new relations. Thus this might not generalize well to other textbooks/subject matters, especially if they have additional new relations that would need to be added.

Questions to Investigate:
- How many training examples are needed to reliably learn a relation?

In [1]:
import pandas as pd
import pickle
import json
from collections import defaultdict
import spacy

# Build the NLP pipeline used throughout the notebook: a StanfordNLP English
# pipeline wrapped in a spaCy-compatible Language object, so it can be called
# as `nlp(text)` and its tokens expose spaCy attributes such as `.lemma_`.
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage
# Presumably only needed once per machine to fetch the English models;
# left commented out so re-running the notebook does not re-download them.
#stanfordnlp.download('en')
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/spaCy.
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# fix for importing utils: the helpers are not an installed package, so the
# sibling `../utils` directory (resolved relative to the kernel's working
# directory) is appended to sys.path before importing from it.
import os
import sys
module_path = os.path.abspath(os.path.join('../utils'))
# Guard against appending the same path again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)
from utils import tag_text, write_spacy_docs, read_spacy_docs

# Directory holding the KB exports and textbook sentences read below, and
# where the processed/tagged outputs are written.
data_dir = '../data/relation_extraction'

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/mattboggess7/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/home/mattboggess7/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/home/mattboggess7/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/home/mattboggess7/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/home/mattboggess7/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/home/mattboggess7/sta

# Read Data

## KB Terms

In [3]:
# Each line of concepts.txt is pipe-delimited; the term text is the second
# field. Collect the unique, whitespace-stripped terms into a set.
with open(f"{data_dir}/concepts.txt", "r") as f:
    terms = {line.split('|')[1].strip() for line in f}
print(f"Number of Manually Extracted KB Terms: {len(terms)}")

Number of Manually Extracted KB Terms: 5933


## KB Relationships

In [4]:
def wrangle_relations(triples):
    """Group KB triples by relation and count the triples per relation.

    Args:
        triples: iterable of indexable triples whose element at index 1 is
            the relation name.

    Returns:
        A tuple ``(relations, relation_table)``:
          - ``relations``: ``defaultdict(list)`` mapping each relation name to
            the list of triples that use it (in input iteration order).
          - ``relation_table``: DataFrame with columns ``relations`` and
            ``triples_count``, sorted by ``triples_count`` descending and
            re-indexed from 0.
    """
    # Use the built-in `list` factory instead of a lambda: behaviour is the
    # same, but the returned mapping stays picklable.
    relations = defaultdict(list)
    for triple in triples:
        relations[triple[1]].append(triple)

    relation_table = pd.DataFrame({
        'relations': list(relations.keys()),
        'triples_count': [len(group) for group in relations.values()],
    })
    relation_table = (
        relation_table
        .sort_values('triples_count', ascending=False)
        .reset_index(drop=True)
    )
    return relations, relation_table

#### Structure Relations

In [5]:
# structure.txt is pipe-delimited. Keep fields 3, 4 and the last field of each
# line (whitespace-stripped) as a triple, de-duplicated via a set; the middle
# element is what wrangle_relations groups on as the relation name.
with open(f"{data_dir}/structure.txt", "r") as f:
    structure_triples = {
        (fields[3].strip(), fields[4].strip(), fields[-1].strip())
        for fields in (line.split("|") for line in f)
    }

# Per-relation grouping plus a count table, labelled with the relation family.
structure_relations, structure_info = wrangle_relations(structure_triples)
structure_info["relation_type"] = "structure"
print(f"Number of Unique KB Structure Relations: {structure_info.shape[0]}")
structure_info

Number of Unique KB Structure Relations: 44


Unnamed: 0,relations,triples_count,relation_type
0,has-part,4281,structure
1,has-region,1561,structure
2,possesses,784,structure
3,is-inside,604,structure
4,encloses,603,structure
5,element,497,structure
6,size,349,structure
7,is-between,267,structure
8,is-at,246,structure
9,does-not-enclose,169,structure


#### Process Relations

In [6]:
# process.txt is pipe-delimited. Keep fields 3, 4 and the last field of each
# line (whitespace-stripped) as a triple, de-duplicated via a set; the middle
# element is what wrangle_relations groups on as the relation name.
with open(f"{data_dir}/process.txt", "r") as f:
    process_triples = {
        (fields[3].strip(), fields[4].strip(), fields[-1].strip())
        for fields in (line.split("|") for line in f)
    }

# Per-relation grouping plus a count table, labelled with the relation family.
process_relations, process_info = wrangle_relations(process_triples)
process_info["relation_type"] = "process"
print(f"Number of Unique KB Process Relations: {process_info.shape[0]}")
process_info

Number of Unique KB Process Relations: 13


Unnamed: 0,relations,triples_count,relation_type
0,object,2520,process
1,subevent,1839,process
2,base,1784,process
3,result,1678,process
4,agent,1488,process
5,raw-material,827,process
6,next-event,613,process
7,first-subevent,330,process
8,instrument,255,process
9,donor,164,process


#### Combine Relations

In [7]:
# Merge the two relation -> triples mappings into one plain dict. Structure
# entries are applied last, so they win if a relation name appears in both.
relations = dict(process_relations)
relations.update(structure_relations)

# Flat list of every triple: structure triples first, then process triples.
relation_triples = [*structure_triples, *process_triples]

# One count table across both families, largest relations first.
relations_info = (
    pd.concat([structure_info, process_info])
    .sort_values("triples_count", ascending=False)
    .reset_index(drop=True)
)
relations_info

Unnamed: 0,relations,triples_count,relation_type
0,has-part,4281,structure
1,object,2520,process
2,subevent,1839,process
3,base,1784,process
4,result,1678,process
5,has-region,1561,structure
6,agent,1488,process
7,raw-material,827,process
8,possesses,784,structure
9,next-event,613,process


## Textbook Sentences

In [8]:
# Lines are tab-delimited. Drop the first field and re-join the remaining
# fields with single spaces to form the sentence text (the trailing newline
# of each line is kept as-is).
with open(f"{data_dir}/selected_textbook_sentences.txt", "r") as f:
    sentences = [" ".join(line.split("\t")[1:]) for line in f]
print(f"Number of Textbook Sentences: {len(sentences)}")

Number of Textbook Sentences: 18730


# Tag Terms & Relations


## Spacy Preprocessing

#### Pre-process Terms

In [9]:
# Run every KB term through the NLP pipeline, reporting progress every 1000
# terms, then persist the parsed docs under data_dir.
processed_terms = []
for i, term in enumerate(terms):
    if not i % 1000:
        print(f"{i}/{len(terms)} processed")
    term_doc = nlp(term)
    processed_terms.append(term_doc)

write_spacy_docs(processed_terms, f"{data_dir}/processed_terms")

0/5933 processed
1000/5933 processed
2000/5933 processed
3000/5933 processed
4000/5933 processed
5000/5933 processed


#### Pre-process Sentences

In [10]:
# Run every textbook sentence through the NLP pipeline, reporting progress
# every 1000 sentences, then persist the parsed docs under data_dir.
processed_sentences = []
for i, sentence in enumerate(sentences):
    if not i % 1000:
        print(f"{i}/{len(sentences)} processed")
    sentence_doc = nlp(sentence)
    processed_sentences.append(sentence_doc)

write_spacy_docs(processed_sentences, f"{data_dir}/processed_sentences")

0/18730 processed
1000/18730 processed
2000/18730 processed
3000/18730 processed
4000/18730 processed
5000/18730 processed
6000/18730 processed
7000/18730 processed
8000/18730 processed
9000/18730 processed
10000/18730 processed
11000/18730 processed
12000/18730 processed
13000/18730 processed
14000/18730 processed
15000/18730 processed
16000/18730 processed
17000/18730 processed
18000/18730 processed


#### Pre-process Relations

In [11]:
# Lemmatize both term slots (index 0 and 2) of every KB triple; the relation
# name at index 1 is kept unchanged. Progress is reported every 1000 triples.
processed_relations = []
for i, relation in enumerate(relation_triples):
    if not i % 1000:
        print(f"{i}/{len(relation_triples)} processed")
    first_term_lemmas = " ".join(tok.lemma_ for tok in nlp(relation[0]))
    second_term_lemmas = " ".join(tok.lemma_ for tok in nlp(relation[2]))
    processed_relations.append((first_term_lemmas, relation[1], second_term_lemmas))

0/22320 processed
1000/22320 processed
2000/22320 processed
3000/22320 processed
4000/22320 processed
5000/22320 processed
6000/22320 processed
7000/22320 processed
8000/22320 processed
9000/22320 processed
10000/22320 processed
11000/22320 processed
12000/22320 processed
13000/22320 processed
14000/22320 processed
15000/22320 processed
16000/22320 processed
17000/22320 processed
18000/22320 processed
19000/22320 processed
20000/22320 processed
21000/22320 processed
22000/22320 processed


## Tag Sentences w/ Terms & Relations

In [12]:
# Tag each parsed sentence against the processed terms and relations,
# reporting progress every 1000 sentences.
tagged_sentences = []
for i, sentence in enumerate(processed_sentences):
    if not i % 1000:
        print(f"{i}/{len(processed_sentences)} processed")
    tagged_sentences += [
        tag_text(sentence, processed_terms, processed_relations, nlp=nlp)
    ]

# Save the tagging results as JSON so later cells can reload them.
with open(f"{data_dir}/tagged_sentences.json", "w") as f:
    json.dump(tagged_sentences, f)

0/18730 processed
1000/18730 processed
2000/18730 processed
3000/18730 processed
4000/18730 processed
5000/18730 processed
6000/18730 processed
7000/18730 processed
8000/18730 processed
9000/18730 processed
10000/18730 processed
11000/18730 processed
12000/18730 processed
13000/18730 processed
14000/18730 processed
15000/18730 processed
16000/18730 processed
17000/18730 processed
18000/18730 processed


# Summary Statistics

- How many negative examples?
- How many duplicate relations for same pair?
- How many examples / relation?
- How many sentences / relation?
- How many sentences / example / relation?

In [14]:
# Reload the tagged sentences from disk so this cell does not depend on the
# tagging loop having been run in the current kernel.
with open(f"{data_dir}/tagged_sentences.json", "r") as f:
    # json.load takes the open file handle and returns the parsed object;
    # the result must be assigned for the reload to have any effect.
    tagged_sentences = json.load(f)

# relation name -> {(term at index 0, term at index 2): occurrences in the tagged sentences}
relation_counts = defaultdict(dict)
for tagged_sentence in tagged_sentences:
    for relation in tagged_sentence["relations"]:
        word_pair = (relation[0], relation[2])
        counts_for_relation = relation_counts[relation[1]]
        counts_for_relation[word_pair] = counts_for_relation.get(word_pair, 0) + 1

# One row per relation:
#   examples        = total tagged instances of the relation
#   triples         = number of distinct term pairs carrying the relation
#   examples/triple = average instances per distinct pair
# A relation only gets an entry once a pair is counted, so the division
# never sees an empty mapping.
df = pd.DataFrame(
    [
        {
            'relation': rel_name,
            'examples': sum(pair_to_count.values()),
            'triples': len(pair_to_count),
            'examples/triple': sum(pair_to_count.values()) / len(pair_to_count),
        }
        for rel_name, pair_to_count in relation_counts.items()
    ],
    columns=['relation', 'examples', 'triples', 'examples/triple'],
)
df

Unnamed: 0,relation,examples,triples,examples/triple
0,no-relation,70288,34153,2.058033
1,agent,187,53,3.528302
2,object,367,82,4.47561
3,has-part,2319,356,6.514045
4,raw-material,232,53,4.377358
5,is-between,171,48,3.5625
6,is-at,98,20,4.9
7,has-region,385,92,4.184783
8,result,304,84,3.619048
9,base,306,75,4.08


In [15]:
import numpy as np

# Invert relation_counts: for every term pair, collect the relation names it
# was tagged with.
pair_counts = defaultdict(list)
for rel_name, counts_by_pair in relation_counts.items():
    for pair in counts_by_pair:
        pair_counts[pair].append(rel_name)

# Keep only the pairs that carry more than one relation, as (pair, relations).
pair_counts = [
    (pair, rel_names)
    for pair, rel_names in pair_counts.items()
    if len(rel_names) > 1
]
print(f"{len(pair_counts)} term pairs have multiple relations")
for pc in pair_counts:
    print(pc)

96 term pairs have multiple relations
(('attract', 'water molecule'), ['agent', 'object'])
(('cohesion', 'water molecule'), ['agent', 'object'])
(('attract', 'molecule'), ['agent', 'object'])
(('gene expression', 'gene'), ['agent', 'object', 'base'])
(('reproduce', 'cell'), ['agent', 'result', 'base'])
(('diffusion', 'concentration gradient'), ['agent', 'base'])
(('signal transduction', 'cell'), ['agent', 'base'])
(('gene expression', 'transcription factor'), ['agent', 'result'])
(('cellular response', 'cell'), ['agent', 'result', 'base'])
(('code for', 'gene'), ['agent', 'donor'])
(('reproduce', 'cancer cell'), ['agent', 'result', 'base'])
(('metabolism', 'cell'), ['agent', 'object'])
(('cell communication', 'cell'), ['agent', 'object'])
(('endocytosis', 'phagocyte'), ['agent', 'base'])
(('phagocytosis', 'macrophage'), ['agent', 'base'])
(('cell signal', 'cell'), ['agent', 'base'])
(('neural signal', 'cell'), ['agent', 'object'])
(('nutrition', 'animal'), ['agent', 'raw-material', 'be

In [3]:
# Reload the tagged sentences saved by the tagging step.
with open(f"{data_dir}/tagged_sentences.json", "r") as f:
    tagged_sentences = json.load(f)

# relation name -> {(term at index 0, term at index 2): occurrences}
relation_counts = defaultdict(lambda: {})
for tagged_sentence in tagged_sentences:
    for relation in tagged_sentence["relations"]:
        word_pair = (relation[0], relation[2])
        per_relation = relation_counts[relation[1]]
        per_relation[word_pair] = per_relation.get(word_pair, 0) + 1

# Per-relation summary: total instances, distinct pairs, and their ratio.
relation_names = list(relation_counts)
df = pd.DataFrame({
    'relation': relation_names,
    'examples': [sum(relation_counts[r].values()) for r in relation_names],
    'triples': [len(relation_counts[r]) for r in relation_names],
    'examples/triple': [
        sum(relation_counts[r].values()) / len(relation_counts[r])
        for r in relation_names
    ],
})
df

Unnamed: 0,relation,examples,triples,examples/triple
0,no-relation,70288,34153,2.058033
1,agent,187,53,3.528302
2,object,367,82,4.47561
3,has-part,2319,356,6.514045
4,raw-material,232,53,4.377358
5,is-between,171,48,3.5625
6,is-at,98,20,4.9
7,has-region,385,92,4.184783
8,result,304,84,3.619048
9,base,306,75,4.08
