In [1]:
from anntools import Collection, Keyphrase, Relation, Sentence
from pathlib import Path
from typing import List

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed, Bidirectional, Input, Embedding, Lambda
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder

In [4]:
import numpy as np

In [5]:
from keras_contrib.layers import CRF
import matplotlib.pyplot as plt

In [6]:
from preprocessing import find_keyphrase_by_span

In [7]:
import spacy
import networkx as nx

In [8]:
nlp = spacy.load('es_core_news_sm')

In [9]:
def find_relation(relations:List[Relation], entity1, entity2):
    for rel in relations:
        if rel.origin == entity1 and rel.destination == entity2 or \
           rel.origin == entity2 and rel.destination == entity1:
            return rel.label
    return 'empty'

In [10]:
def find_keyphrase_tokens(sentence:Sentence, doc:List):
    "Returns the spacy tokens of every keyphrase"
    text = sentence.text
    keyphrases = {}
    i = 0
    for token in doc:
        idx = text.index(token.text, i)
        n = len(token.text)
        i = idx + n
        keyphrase_id, _ = find_keyphrase_by_span(idx, idx+n, sentence.keyphrases, text)
        if keyphrase_id is None:
            continue
#         print(keyphrase_id, token)
        try:
            keyphrases[keyphrase_id].append(token) 
        except:
            keyphrases[keyphrase_id] = [token]
    return keyphrases    

In [23]:
def get_features(sentence:Sentence, doc:List):
    features = []
    keyphrases = find_keyphrase_tokens(sentence, doc)
    for i, keyphrase1 in enumerate(sentence.keyphrases):
        for keyphrase2 in sentence.keyphrases[i+1:]:
            try:
                tokens1 = keyphrases[keyphrase1.id]            
                tokens2 = keyphrases[keyphrase2.id]
            except:
                # This doesn't work properly because of the multitokens are not recognize
                pass
            features.append({
                'origin': keyphrase1.text,
                'destination': keyphrase2.text,
                'origin_tag': keyphrase1.label,
                'destination_tag': keyphrase2.label
                # Ideas:
                # el tamaño del camino del dependency graph entre los 2 tokens
                # quizá la secuencia entera a seguir codificada entre los 2 tokens principales
            })
    return features

In [12]:
def get_labels(sentence:Sentence, doc):
    labels = []
    for i, keyprhase1 in enumerate(sentence.keyphrases):
        for keyprhase2 in sentence.keyphrases[i+1:]:
#             print(keyprhase1.text, keyprhase2.text)
            labels.append(find_relation(sentence.relations, keyprhase1.id, keyprhase2.id)) 
    return labels

In [13]:
def get_instances(sentence):
    """
    Makes all the analysis of the sentence according to spacy
    """
    doc = nlp(sentence.text)
    features = get_features(sentence, doc)
    labels = get_labels(sentence, doc)
    return features, labels

In [14]:
    class REClassifier:
        "Classifier for the relation extraction task"
        def __init__(self):
            self.n_timesteps = 15
            self.model = None

        def train(self, collection:Collection):
            features, labels = self.get_sentences(collection)
            X, y = self.preprocessing(features, labels)
            self.get_bi_lstm_model('concat')
            return self.fit_model(X, y, True)


        def get_bi_lstm_model(self, mode:str):
            inputs = Input(shape=(self.X_shape[1], self.X_shape[2]))
    #         outputs = Embedding(input_dim=35179, output_dim=20,
    #                           input_length=self.X_shape[1], mask_zero=True)(inputs)  # 20-dim embedding
            outputs = Bidirectional(LSTM(units=512, return_sequences=True,
                                       recurrent_dropout=0.1))(inputs)  # variational biLSTM
            outputs = Bidirectional(LSTM(units=512, return_sequences=True,
                               recurrent_dropout=0.2, dropout=0.2))(outputs)
            outputs = TimeDistributed(Dense(self.y_shape[2], activation="softmax"))(outputs)  # a dense layer as suggested by neuralNer
    #         crf = CRF(8)  # CRF layer
    #         out = crf(outputs)  # output

            model = Model(inputs=inputs, outputs=outputs)
            model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    #         model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
            model.summary()
            # model.compile(loss='binary_crossentropy', optimizer='adam')
            self.model = model

        def preprocessing(self, features, labels):
            self.max_len = 50

            vectorizer = DictVectorizer()
            X = self._padding_dicts(features)
            [vectorizer.fit(sent) for sent in X]
            X = np.array([vectorizer.transform(sent).todense() for sent in X])
    #         X = X.reshape(1921, 15, X.shape[1])
            self.X_shape = X.shape

            encoder = LabelEncoder()
    #         print(labels)
            y = [encoder.fit_transform(label) for label in labels]
            y = pad_sequences(maxlen=50, sequences=y, padding="post", value=encoder.transform(['empty'])[0])
    #         y = y.reshape(1921, 15, y.shape[1])    
            y = to_categorical(y)
            self.y_shape = y.shape
            return X, y

        def _padding_dicts(self, X):
            new_X = []
            for seq in X:
                new_seq = []
                for i in range(self.max_len):
                    try:
                        new_seq.append(seq[i])
                    except:
                        new_seq.append({'dep': 0, 'pos': 0})
                new_X.append(new_seq)
            return new_X

        def fit_model(self, X, y, plot=False):
            hist = self.model.fit(X, y, batch_size=32, epochs=5,
                        validation_split=0.2, verbose=1)
            if plot:
                plt.style.use("ggplot")
                plt.figure(figsize=(12, 12))
                plt.plot(hist["acc"])
                plt.plot(hist["val_acc"])
                plt.show() 

        def get_sentences(self, collection:Collection):
            features = []
            labels = []
    #         self.max_len = 0
            for sentence in collection:
                feat, label = get_instances(sentence)
    #             self.max_len = max(self.max_len, len(feat))
                features.append(feat)
                labels.append(label)
            return features, labels

        def run(self, collection: Collection):
            collection = collection.clone()
            # returns a collection with everything annotated
            return collection


In [15]:
collection = Collection().load_dir(Path('2021/ref/training'))

In [16]:
sentence = collection.sentences[0]

In [17]:
re_clf = REClassifier()

In [24]:
features, relations = re_clf.get_sentences(collection)

In [26]:
re_clf.train(collection)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50, 22)]          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 1024)          2191360   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 1024)          6295552   
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 11)            11275     
Total params: 8,498,187
Trainable params: 8,498,187
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [1]:
from re_clsf import REClassifier

In [2]:
re_clf = REClassifier()

In [3]:
re_clf.load_model('re')

In [4]:
from anntools import Collection
from pathlib import Path

In [5]:
dev_set = Collection().load_dir(Path('2021/eval/develop/scenario1-main'))

In [7]:
re_clf.encoder.classes_

array(['arg', 'causes', 'domain', 'empty', 'entails', 'has-property',
       'in-context', 'in-place', 'in-time', 'is-a', 'part-of', 'same-as',
       'subject', 'target'], dtype='<U12')

In [6]:
re_clf.test_model(dev_set)



[['empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty'],
 ['empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty',
  'empty'