In [9]:
import warnings
import spacy
import stanfordnlp
import json
import os
import gensim
import importlib  

from gensim.models import KeyedVectors
from spacy_stanfordnlp import StanfordNLPLanguage
from model.data_loaders import RelationDataset
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn import svm

snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

warnings.filterwarnings("ignore")


Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/hanlin/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/hanlin/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/hanlin/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/hanlin/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/hanlin/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/hanlin/stanfordnlp_resources/en_ewt_mode

In [5]:
doc = nlp("Microsoft is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_)

for ent in doc.ents:
    print(ent.label_)

Microsoft Microsoft PROPN NNP nsubj
is be AUX VBZ aux
looking look VERB VBG root
at at SCONJ IN mark
buying buy VERB VBG advcl
U.K. U.K. PROPN NNP compound
startup startup NOUN NN obj
for for ADP IN case
$ $ SYM $ obl
1 1 NUM CD compound
billion billion NUM CD nummod


In [None]:
import nltk
import json 
import numpy as np

ID2WORD = json.load(open(os.path.join("../../data/relation_extraction", 'id2word.json')))
all_tokens = set()
SPACY_FEATURES = ['LS',
 'X',
 'PROPN',
 'SYM',
 'INTJ',
 'UH',
 'NOUN',
 'AFX',
 'NNP',
 'PUNCT',
 'NFP',
 'PRP',
 'NN',
 'PRON',
 'UNK']

def bag_of_words_featurizer(bag_of_words, bag_of_words_actual):
    """
    sentence is in id 
    """
    feat_len = len(ID2WORD)
    feat_vec = np.zeros(feat_len)
    for sent_index, sentence in enumerate(bag_of_words):
        words = bag_of_words_actual[sent_index]
        for word_id in sentence:
            feat_vec[word_id] += 1
    feat_vec/= len(bag_of_words)
    return feat_vec


def word_vec_bag_of_words_featurizer(model, bag_of_words, bag_of_words_actual):
    """
    sentence is in id 
    """
    feat_len = 300
    feat_vec = np.zeros(feat_len)
    feat_vecs = []
    for sent_index, sentence in enumerate(bag_of_words):
        words = bag_of_words_actual[sent_index]
        for word in words:
            word_vec = feat_vec
            if word in model.wv.vocab:
                word_vec = model[word]
            feat_vecs.append(word_vec)
    if feat_vecs == []:
        feat_vec = model['UNK']
    else:
        feat_vec = np.mean(np.array(feat_vecs), axis=0)
    return feat_vec


def loadPretrainedWordVectors(pretrained_path):
    return KeyedVectors.load_word2vec_format(pretrained_path, binary=True)

def loadFastTextWordVectors(pretrained_path):
    return KeyedVectors.load_word2vec_format(pretrained_path, binary=False)

def spacy_features(word_pair):
    spacy_feat = [0 for i in range(len(SPACY_FEATURES))]
    for i in range(2):
        try:
            featurized_phrase = nlp(word_pair[i])
            for token in featurized_phrase:
                if token.pos_ in SPACY_FEATURES:
                    spacy_feat[SPACY_FEATURES.index(token.pos_)] += 1
                else:
                    spacy_feat[-1] += 1
                if token.tag_ in SPACY_FEATURES:
                    spacy_feat[SPACY_FEATURES.index(token.tag_)] += 1
                else:
                    spacy_feat[-1] += 1
        except:
            print(word_pair)
            return spacy_feat
    return spacy_feat
    

def edit_distance(word_pair):
    """
    sentence is in id 
    """ 
    return nltk.edit_distance(word_pair[0], word_pair[1])

def is_sub_word(word_pair):
    return 1 if word_pair[0] in word_pair[1] else 0


def construct_minz_dataset_word_vec(word_vec_model, dataset):
    X = []
    y = []
    index = 0
    for bag_of_words, y_label, word_pair, pad_mask, e1_mask, e2_mask in dataset:
        words = []
        bag_of_words = bag_of_words.numpy()
        if len(bag_of_words) != 0:
            index += 1
            for bag_of_word in bag_of_words:
                words.append([ID2WORD[str(id)] for id in bag_of_word])
            feat_vec = word_vec_bag_of_words_featurizer(word_vec_model, bag_of_words, words)
            X.append(feat_vec)
            y.append(y_label.numpy()[0])
    return np.array(X), np.array(y)

def construct_minz_dataset(dataset):
    X = []
    y = []
    index = 0
    for bag_of_words, y_label, word_pair, pad_mask, e1_mask, e2_mask in dataset:
        words = []
        bag_of_words = bag_of_words.numpy()
        if len(bag_of_words) != 0:
            index += 1
            for bag_of_word in bag_of_words:
                words.append([ID2WORD[str(id)] for id in bag_of_word])
            feat_vec = bag_of_words_featurizer(bag_of_words, words)
#             feat_vec = np.concatenate([feat_vec, spacy_features(word_pair)])
            np.append(feat_vec, edit_distance(word_pair))
            np.append(feat_vec, is_sub_word(word_pair))
            X.append(feat_vec)
            y.append(y_label.numpy()[0])
    return np.array(X), np.array(y)


In [None]:
def get_relation_name(relations, y_label):
    return relations[y_label]

def dump_tp_fp(y_pred, y_true, relations, rel_dataset):
    if "no-relation" not in relations:
        relations.insert(0, "no-relation")
    i = 0
    rel_dict = {}
    for relation in relations:
        rel_dict[relation] = {"TP" : [], "FP" : []}
    for bag_of_words, y_label, word_pair, pad_mask, e1_mask, e2_mask in rel_dataset:
        relation_name = get_relation_name(relations, y_pred[i])
        if len(bag_of_words) != 0:
            if y_true[i]==y_pred[i]:
                rel_dict[relation_name]["TP"].append(word_pair)
            if y_true[i] != y_pred[i]:
                rel_dict[relation_name]["FP"].append(word_pair)
            i += 1
    json.dump(rel_dict, open("relations_tp_fp.json", "w"), indent=4)
    return rel_dict
            

In [None]:
data_dir = "../../data/relation_extraction"
rel_dataset_train = RelationDataset(data_dir=data_dir, split="train", relations=["taxonomy", "meronym", "spatial", "event_structure"], embedding_type="custom", max_sent_length=256)
rel_dataset_test = RelationDataset(data_dir=data_dir, split="test",  relations=["taxonomy", "meronym", "spatial", "event_structure"], embedding_type="custom", max_sent_length=256)    

In [None]:
id2vocab = json.load(open(os.path.join("../../data/relation_extraction", 'id2word.json')))
X_train, y_train = construct_minz_dataset(rel_dataset_train)
X_test, y_test = construct_minz_dataset(rel_dataset_test)
np.unique(y_train, return_counts=True)

In [None]:
#Includes spacy features
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(clf.coef_) 

In [27]:
all_tokens
spacy_feat = list(all_tokens)
spacy_feat

['LS',
 'X',
 'PROPN',
 'SYM',
 'INTJ',
 'UH',
 'NOUN',
 'root',
 'AFX',
 'NNP',
 'PUNCT',
 'NFP',
 'PRP',
 'NN',
 'PRON']

In [31]:
spacy_feat

['LS',
 'X',
 'PROPN',
 'SYM',
 'INTJ',
 'UH',
 'NOUN',
 'AFX',
 'NNP',
 'PUNCT',
 'NFP',
 'PRP',
 'NN',
 'PRON',
 'UNK']

In [25]:
# Non-spacy feat
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(clf.coef_) 

Accuracy 0.5860215053763441
              precision    recall  f1-score   support

           0       0.60      0.52      0.55       185
           1       0.58      0.65      0.61       187

    accuracy                           0.59       372
   macro avg       0.59      0.59      0.58       372
weighted avg       0.59      0.59      0.58       372

[[ 0.00158959  0.         -0.07175936 ...  0.          0.
   0.        ]]


Word vector experiments (Fast text and word2vec)

In [10]:
model = loadFastTextWordVectors('wiki-news-300d-1M-subword.vec')

In [20]:
model_wordVec = loadPretrainedWordVectors('GoogleNews-vectors-negative300.bin')

In [18]:
X_train, y_train = construct_minz_dataset_word_vec(model, rel_dataset_train)
X_test, y_test = construct_minz_dataset_word_vec(model, rel_dataset_test)

In [19]:
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.5725806451612904
              precision    recall  f1-score   support

           0       0.75      0.21      0.33       185
           1       0.54      0.93      0.69       187

    accuracy                           0.57       372
   macro avg       0.65      0.57      0.51       372
weighted avg       0.65      0.57      0.51       372



In [21]:
X_train, y_train = construct_minz_dataset_word_vec(model_wordVec, rel_dataset_train)
X_test, y_test = construct_minz_dataset_word_vec(model_wordVec, rel_dataset_test)

In [23]:
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.5456989247311828
              precision    recall  f1-score   support

           0       0.60      0.26      0.37       185
           1       0.53      0.82      0.65       187

    accuracy                           0.55       372
   macro avg       0.56      0.54      0.51       372
weighted avg       0.56      0.55      0.51       372



In [28]:
rel_dict = dump_tp_fp(y_pred, y_test, ["subclass-of"], rel_dataset_test)
print(rel_dict)


{'no-relation': {'TP': ['phage -> lytic cycle', 'species -> oviparous', 'tissue -> physician', 'nerve -> calcium atom', 'carbon dioxide -> heart', 'seedling -> fluorescent substance', 'vascular plant -> blood vessel', 'energy -> chemoheterotroph', 'bacteriophage -> infection', 'excretory organ -> digestive system', 'condition -> theory', 'earth -> plant', 'cell adhesion molecule -> close', 'zygote -> centriole', 'pigment -> kidney', 'nerve impulse -> water balance', 'hydrogen atom -> nucleus', 'parasite -> compound', 'activate -> determination', 'carrier -> free - energy', 'colon -> molecule', 'water -> create', 'mitochondrial membrane -> energy', 'antidiuretic hormone -> norepinephrine', 'vein -> oxygen atom', 'cytoplasm -> influence', 'lake -> bird', 'touch -> marine mammal', 'make -> ferment', 'free - energy -> FAD', 'natural selection -> offspring', 'increase -> consumer', 'fungus -> reproduce', 'riboflavin -> whole', 'mutation -> host', 'lead atom -> body cavity', 'C4 plant -> soy

In [14]:
clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.5295698924731183
              precision    recall  f1-score   support

           0       0.73      0.09      0.15       185
           1       0.52      0.97      0.67       187

    accuracy                           0.53       372
   macro avg       0.62      0.53      0.41       372
weighted avg       0.62      0.53      0.42       372



In [15]:
clf = RandomForestClassifier(n_estimators=100, max_depth=20,random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.5698924731182796
              precision    recall  f1-score   support

           0       0.58      0.48      0.53       185
           1       0.56      0.66      0.61       187

    accuracy                           0.57       372
   macro avg       0.57      0.57      0.57       372
weighted avg       0.57      0.57      0.57       372



In [35]:
data_dir = "../../data/relation_extraction"
rel_dataset_train = RelationDataset(data_dir=data_dir, split="train", relations=["subclass-of", 'has-part', 'possesses', 'has-region', 'is-inside', 'is-at', 'element', 'abuts', 'is-outside'], embedding_type="custom", max_sent_length=256)
rel_dataset_test = RelationDataset(data_dir=data_dir, split="test", relations=["subclass-of", 'has-part', 'possesses', 'has-region', 'is-inside', 'is-at', 'element', 'abuts', 'is-outside'], embedding_type="custom", max_sent_length=256)    

In [36]:
id2vocab = json.load(open(os.path.join("../../data/relation_extraction", 'id2word.json')))
X_train, y_train = construct_minz_dataset(rel_dataset_train)
X_test, y_test = construct_minz_dataset(rel_dataset_test)
np.unique(y_train, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([1912,  868,  602,   71,  162,  109,   32,   64,   18,   15]))

In [37]:
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print(clf.coef_) 

Accuracy 0.5448028673835126
              precision    recall  f1-score   support

           0       0.59      0.79      0.68       415
           1       0.39      0.30      0.34       187
           2       0.56      0.41      0.47       130
           3       0.50      0.31      0.38        16
           4       0.50      0.22      0.31        36
           5       0.44      0.17      0.24        24
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00        14
           8       0.50      0.25      0.33         4
           9       0.00      0.00      0.00         4

    accuracy                           0.54       837
   macro avg       0.35      0.25      0.28       837
weighted avg       0.51      0.54      0.51       837

[[-8.86601350e-05  0.00000000e+00  5.78052339e-02 ...  0.00000000e+00
   0.00000000e+00 -9.49881261e-03]
 [-2.69760862e-03  0.00000000e+00 -4.22132824e-02 ...  0.00000000e+00
   0.00000000e+00 -8.04476174e-03]
 [-

In [38]:
clf = RandomForestClassifier(n_estimators=100, max_depth=20,random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))


Accuracy 0.5137395459976105
              precision    recall  f1-score   support

           0       0.52      0.98      0.68       415
           1       0.40      0.01      0.02       187
           2       0.48      0.15      0.23       130
           3       0.00      0.00      0.00        16
           4       0.00      0.00      0.00        36
           5       0.25      0.04      0.07        24
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00        14
           8       0.00      0.00      0.00         4
           9       0.00      0.00      0.00         4

    accuracy                           0.51       837
   macro avg       0.16      0.12      0.10       837
weighted avg       0.43      0.51      0.38       837



In [None]:
clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))


In [None]:
np.unique(y_test, return_counts=True)

In [None]:
rel_dict = dump_tp_fp(y_pred, y_test,["subclass-of", 'has-part', 'possesses', 'has-region', 'is-inside', 'is-at', 'element', 'abuts', 'is-outside'], rel_dataset_test)
