# Link Prediction

## TODO

- [ ] Convert corpus dataframe into a file based corpus. (Advantages: Easy use for tf.data.Dataframe)
- [ ] Remove unnecesary tags in relation and proposition vectorizers
- [ ] Do Ensemble learning
- [ ] Show information about the corpus. Percentage about clases and other

In [4]:
import sys
sys.path.append("../../")

from corpus_parser.unified_parser import UnifiedParser

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from pathlib import Path
from keras import layers
import keras.backend as K

import numpy as np
import json
import matplotlib.pyplot as plt

from collections import OrderedDict, Counter

import pandas
from pandas.core.common import SettingWithCopyWarning

# TODO Fix this issues.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)



In [5]:
# Define model initial params

INFO_TAG = "persuasive_essays_paragraph"

DATA_PATH = Path("../../data/")
EXPORT_PATH = Path(DATA_PATH, 'link_prediction', INFO_TAG)
GLOVE_PATH = Path(DATA_PATH, 'glove.840B.300d.txt')
DIM = 300

params = {
    # Model Training Hyperparameters
    'epochs': 70,
    'batch_size': 20,
    'metrics': ['acc'],
    
    # Ensemble Hyperparameters
    'ensemble_amount': 10,
    
    # Model Hyperparameters
    'dim': DIM,
    'dropout': 0.1,
    'lstm_size': 200,
    'max_distance_encoded': 5,
    'linear_embedders_dims': [50, 50, 50, DIM],
    'regularizer_weight': 0.001,
    'encoder_dense_units': 50,
    'encoder_pool_size': 1, # If 1 no tranformation is made to the input.
    'lstm_units': 25,
    'final_size': 20,
    'residual_size': 50,
    'with_attention': True, # If the attention block is used
    'loss_weights': {
        'relation': 10,
        'source': 1,
        'target': 1,
    },
    
    # Adam Optimizer Hyperparameters
    'lr_alpha': 0.003,
    'lr_kappa': 0.001,
    'beta_1': 0.9,
    'beta_2': 0.999,
    
    # Early Stopping Hyperparameters
    'min_delta': 0,
    'patience': 5,
    
    # Corpus Info
#     'corpus_path': str(Path(DATA_PATH, 'projection', INFO_TAG)),
#     'corpus_path': str(Path(DATA_PATH, 'corpus', 'ArgumentAnnotatedEssays-2.0', 'train-test-split')),
    'corpus_path': str(Path(DATA_PATH, 'parsed_to_conll', INFO_TAG)),
    'glove_path': str(Path(EXPORT_PATH, 'glove.npz')),
    'export_path': str(EXPORT_PATH),
    'model_path': str(EXPORT_PATH / "model"),
    'glove_raw_path': str(Path(GLOVE_PATH)),
    
    # Vectorizer Hyperparameters
    'sequence_standardize': None,
    'sequence_split': 'whitespace',
    
    # Corpus Hyperparameters
    'max_proposition_distance': 10, # TODO How to set this value
    
}

params['model_name'] = "model" + ("_attention" if params['with_attention'] else "")

In [6]:
# Load Dataset 

def find_duplicates(dataframe, first, second, group):

    for g, dataframe in dataframe.groupby(by=group):
        rows = [row for _, row in dataframe.iterrows()]
        for i, row in enumerate(rows):
            for row2 in rows[i+1:]:
                if row[first] == row2[second] and row[second] == row2[first]:
                    print("DUPLICATED ROW")
                    print(g)
                    print(row)
                    print(row2)

                
def extract_propositions(params: dict):
    corpus_path = Path(params['corpus_path'])
    
    parser = UnifiedParser()
    
    names = [
        "dev", 
        "test",
        "train",
    ]
    
    relation_tags = set()
    proposition_tags = set()
    
    source_vocabulary = set()
    target_vocabulary = set()
    
    # Max amount of propositions in a document
    max_amount_source_in_doc = 0
    max_amount_target_in_doc = 0
    
    # Max amount of tokens in a proposition
    max_size_in_source_prop = 0
    max_size_in_target_prop = 0
    
    for name in names:
        
        proposition_dict = parser.parse_dir(corpus_path / name)
        
        current_source_arg_units = pandas.DataFrame(columns=['prop_id', 'prop_type', 'prop_text', 'file_key'])
        current_target_arg_units = pandas.DataFrame(columns=['prop_id', 'prop_type', 'prop_text', 'file_key'])
        current_relations = pandas.DataFrame(columns=['prop_id_source', 'prop_id_target', 'relation_type', 'distance', 'file_key'])

        for key, (args_unit, relations, _) in proposition_dict.items():
            args_unit = args_unit[['prop_id', 'prop_type', 'prop_text']]
            args_unit['file_key'] = [key for _ in range(len(args_unit))]
            
            relations = relations[['prop_id_source', 'prop_id_target', 'relation_type']]
            relations['distance'] = relations.aggregate(lambda x: x['prop_id_target']-x['prop_id_source'], axis=1)
            relations['file_key'] = relations.aggregate(lambda x: key, axis=1)
            
            source_prop = args_unit[args_unit['prop_id'].isin(relations['prop_id_source'])]
            target_prop = args_unit[args_unit['prop_id'].isin(relations['prop_id_target'])]
            
            source_vocabulary.update([t for s in source_prop['prop_text'] for t in s.split()])
            target_vocabulary.update([t for s in target_prop['prop_text'] for t in s.split()])
            
            max_size_in_source_prop = max(max_size_in_source_prop, source_prop.aggregate(lambda x: len(x['prop_text'].split()), axis=1).max())
            max_size_in_target_prop = max(max_size_in_target_prop, target_prop.aggregate(lambda x: len(x['prop_text'].split()), axis=1).max())
            
            
            max_amount_source_in_doc = max(max_amount_source_in_doc, len(relations['prop_id_source'].drop_duplicates()))
            max_amount_target_in_doc = max(max_amount_target_in_doc, len(relations['prop_id_target'].drop_duplicates()))
            
            current_source_arg_units = pandas.concat([current_source_arg_units, source_prop], ignore_index=True)
            current_target_arg_units = pandas.concat([current_target_arg_units, target_prop], ignore_index=True)
            current_relations = pandas.concat([current_relations, relations], ignore_index=True)
            
            
        # Add Inverse Relations
        inverse_relations = {
            'prop_id_source': [],
            'prop_id_target': [],
            'relation_type': [],
            'distance': [],
            'file_key': [],
        }
        for _, row in current_relations.iterrows():
            inverse_relations['prop_id_source'].append(row['prop_id_target'])
            inverse_relations['prop_id_target'].append(row['prop_id_source'])
            inverse_relations['relation_type'].append(row['relation_type'] + "_Inverse")
            inverse_relations['distance'].append(-row['distance'])
            inverse_relations['file_key'].append(row['file_key'])
            
        inverse_relations = pandas.DataFrame(inverse_relations)
        
        # Sanity checks
#         print("NEGATIVE PROP IDs")
#         print("TARGET < 0", current_target_arg_units[current_target_arg_units['prop_id'] < 0])
#         print("SOURCE < 0", current_source_arg_units[current_source_arg_units['prop_id'] < 0])
#         print("RELATION TARGET < 0:", list(current_relations[current_relations['prop_id_target'] < 0]['file_key']))
#         print("RELATION SOURCE < 0:", list(current_relations[current_relations['prop_id_source'] < 0]['file_key']))
#         def check_max(s_t_data, data, max_column, compare_to, title):
#             for file, df in data.groupby(by='file_key'):
#                 maxim = s_t_data[s_t_data['file_key'] == file][max_column].max()
#                 print(title, maxim, file)
#                 print(df[df[compare_to] > maxim])
#         check_max(current_target_arg_units, current_relations, 'prop_id', 'prop_id_target', "RELATION TARGET > max")
#         check_max(current_source_arg_units, current_relations, 'prop_id', 'prop_id_source', "RELATION SOURCE > max")
#         print("BEFORE")
#         find_duplicates(current_relations, 'prop_id_source', 'prop_id_target', 'file_key')

        current_relations = pandas.concat([current_relations, inverse_relations], ignore_index=True)

        params[f'{name}_source_propositions'] = current_source_arg_units
        params[f'{name}_target_propositions'] = current_target_arg_units
        params[f'{name}_relations'] = current_relations

        print(name, "relations", len(current_relations))
        print(name, "source argumentative units", len(current_source_arg_units))
        print(name, "target argumentative units", len(current_target_arg_units))

        relation_tags.update(current_relations['relation_type'])
        proposition_tags.update(current_source_arg_units['prop_type'])
        proposition_tags.update(current_target_arg_units['prop_type'])
    

    vocabulary = source_vocabulary.union(target_vocabulary)
    params['vocabulary'] = vocabulary
    print("Vocab size", len(vocabulary))
    
    relation_tags = list(relation_tags)
    proposition_tags = list(proposition_tags)
    print("Relation tags", relation_tags)
    print("Proposition tags", proposition_tags)
    params['relation_tags'] = relation_tags
    params['proposition_tags'] = proposition_tags
    
    max_size_prop = max(max_size_in_source_prop, max_size_in_target_prop)
    max_amount_doc = max(max_amount_source_in_doc, max_amount_target_in_doc)
    params['max_size_prop'] = max_size_prop
    params['max_amount_doc'] = max_amount_doc
    
    print('max_size_prop', max_size_prop)
    print('max_amount_doc', max_amount_doc)

    # Vectorizers
    sequence_vectorizer = layers.TextVectorization(
        output_mode = "int",
        max_tokens = len(vocabulary) + 2, # Plus PAD and UNK
        output_sequence_length = int(max_size_prop),
        standardize = params['sequence_standardize'],
        split = params['sequence_split']
    )
    sequence_vectorizer.adapt(pandas.concat([
        params['train_source_propositions'],
        params['train_target_propositions'],
    ], ignore_index=True)['prop_text'])
    params['sequence_vectorizer'] = sequence_vectorizer
    
    relation_tag_vectorizer = layers.TextVectorization(
        output_mode = "int",
        max_tokens = len(relation_tags) + 2, # Plus PAD and UNK
        output_sequence_length = 1,
        standardize = None,
        split = None
    )
    relation_tag_vectorizer.adapt(relation_tags)
    params['relation_tag_vectorizer'] = relation_tag_vectorizer
    
    proposition_tag_vectorizer = layers.TextVectorization(
        output_mode = "int",
        max_tokens = len(proposition_tags) + 2, # Plus PAD and UNK
        output_sequence_length = 1,
        standardize = None,
        split = None
    )
    proposition_tag_vectorizer.adapt(proposition_tags)
    params['proposition_tag_vectorizer'] = proposition_tag_vectorizer
    
    # One-Hot Encoders
    relation_encoder = layers.CategoryEncoding(
        num_tokens=len(relation_tag_vectorizer.get_vocabulary()), # Plus PAD and UNK
        output_mode="one_hot",
    )
    params['relation_encoder'] = relation_encoder
    
    proposition_encoder = layers.CategoryEncoding(
        num_tokens=len(proposition_tag_vectorizer.get_vocabulary()), # Plus PAD and UNK
        output_mode="one_hot",
    )
    params['proposition_encoder'] = proposition_encoder
    
    
extract_propositions(params)

NEGATIVE PROP IDs
TARGET < 0 Empty DataFrame
Columns: [prop_id, prop_type, prop_text, file_key]
Index: []
SOURCE < 0 Empty DataFrame
Columns: [prop_id, prop_type, prop_text, file_key]
Index: []
RELATION TARGET < 0: []
RELATION SOURCE < 0: []
RELATION TARGET > max 8 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay012.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay024.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay043.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 9 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay045.ann.conll
Empty DataFrame
Columns: [prop_id_sourc

Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay346.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 8 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay351.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 11 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay362.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 7 ../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay372.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 10 ../../data/parsed_to_conll/persuasive_essays_parag

Index: []
RELATION TARGET > max 7 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay335.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay341.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 15 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay348.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 11 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay352.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay355.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, r

Index: []
RELATION SOURCE > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay287.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 23 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay289.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 14 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay301.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 14 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay306.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 20 ../../data/parsed_to_conll/persuasive_essays_paragraph/test/essay310.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, 

Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 14 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay089.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 7 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay090.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay092.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 8 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay093.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 15 ../../data/parsed_to_conll/persuasive_essa

Index: []
RELATION TARGET > max 15 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay179.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay181.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 11 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay183.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay184.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 14 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay185.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_tar

RELATION TARGET > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay275.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 18 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay276.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay279.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 14 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay280.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay282.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relat

RELATION TARGET > max 6 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay379.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 8 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay380.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 4 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay383.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 7 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay384.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION TARGET > max 9 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay385.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_t

RELATION SOURCE > max 16 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay060.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 18 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay062.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 17 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay063.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay064.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 13 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay065.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relat

RELATION SOURCE > max 13 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay147.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay148.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 18 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay150.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 13 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay151.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 13 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay152.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relat

RELATION SOURCE > max 22 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay242.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 15 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay244.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 10 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay246.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 21 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay247.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 21 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay248.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relat

RELATION SOURCE > max 12 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay330.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 14 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay332.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 13 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay333.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 16 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay334.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relation_type, distance, file_key]
Index: []
RELATION SOURCE > max 8 ../../data/parsed_to_conll/persuasive_essays_paragraph/train/essay336.ann.conll
Empty DataFrame
Columns: [prop_id_source, prop_id_target, relati

Counter({'supports': 2542, 'supports_Inverse': 2542, 'attacks': 155, 'attacks_Inverse': 155})
train relations 5394
train source argumentative units 2697
train target argumentative units 1198
Vocab size 7331
Relation tags ['attacks_Inverse', 'supports_Inverse', 'attacks', 'supports']
Proposition tags ['Premise', 'Claim']
max_size_prop 72
max_amount_doc 20


In [11]:

def creating_glove_embeddings(params: dict):
    
    if Path(params["glove_path"]).exists():
        print("Glove Embedding Matrix Found")
        embedding_matrix = np.load(params["glove_path"])["embeddings"]
        params['embedding_matrix'] = embedding_matrix
        return
    
    # Loading Glove
    hits = 0
    embedding_dim = params['dim']
    word_to_index = dict(map(lambda x: (x[1], x[0]), enumerate(params['sequence_vectorizer'].get_vocabulary())))
    num_tokens = len(word_to_index) # Plus padding and unknown 

    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    with Path(params["glove_raw_path"]).open() as f:
        for line_idx, line in enumerate(f):
            if line_idx % 100000 == 0:
                print('- At line {}'.format(line_idx))
            line = line.strip().split()
            if len(line) != 300 + 1:
                continue
            word = line[0]
            embedding = line[1:]
            if word in word_to_index:
                hits += 1
                word_idx = word_to_index[word]
                embedding_matrix[word_idx] = embedding
                
    print('- Done. Found {} vectors for {} words'.format(hits, num_tokens - 2))
    
    params['embedding_matrix'] = embedding_matrix
    Path(params["glove_path"], "..").resolve().mkdir(exist_ok=True, parents=True)
    np.savez_compressed(params["glove_path"], embeddings=embedding_matrix)

creating_glove_embeddings(params)

- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
- Done. Found 6197 vectors for 6269 words


## Building Datasets

In [31]:
# Encode Dataset

def encode_distance(distance, encode_size):
    """
    return: Tensor with the encoded distance
    """
    middle = encode_size // 2

    abs_distance = tf.abs(distance)
    zeros = tf.zeros((tf.maximum(1, abs_distance), middle))

    to_sum = tf.concat([zeros, tf.eye(abs_distance, num_columns=middle)], axis=0)
    distance_vec = tf.foldl(lambda x, y: tf.add(x, y), to_sum)

    if distance < 0:
        first_vec = tf.reverse(distance_vec, axis=[0])
        second_vec = zeros[0]
    else:
        first_vec = zeros[0]
        second_vec = distance_vec

    return tf.concat([first_vec, second_vec], axis=0)


def encode_datasets(params: dict):
    sequence_vectorizer = params['sequence_vectorizer']
    proposition_tag_vectorizer = params['proposition_tag_vectorizer']
    relation_tag_vectorizer = params['relation_tag_vectorizer']
    proposition_encoder = params['proposition_encoder']
    relation_encoder = params['relation_encoder']
    max_amount_doc = params['max_amount_doc']
    distance_encoding_bits = params['max_distance_encoded'] * 2
    max_proposition_distance = params['max_proposition_distance']

    if 'raw_data_dataframe' in params and False:
        data_dataframe = params['raw_data_dataframe']
    else:
        data_dataframe = pandas.DataFrame(
            columns = [
                'file_key', 
                'source_prop_id', 
                'target_prop_id', 
                'source_prop_text',
                'target_prop_text',
                'source_prop_type',
                'target_prop_type',
                'relation_type', 
                'distance',
                'split',
            ])

        for split in ['dev', 'test', 'train']:

            source_arg_units = params[f'{split}_source_propositions']
            target_arg_units = params[f'{split}_target_propositions']
            relations = params[f'{split}_relations']
            
            source_keys_relation = set(zip(relations['prop_id_source'], relations['file_key']))
            target_keys_relation = set(zip(relations['prop_id_target'], relations['file_key']))
            source_keys_args = set(zip(source_arg_units['prop_id'], source_arg_units['file_key']))
            target_keys_args = set(zip(target_arg_units['prop_id'], target_arg_units['file_key']))
            
            dif = source_keys_args.union(source_keys_relation).difference(source_keys_args.intersection(source_keys_relation))
            assert len(dif) == 0, f"All source relation keys aren't in the source units or viceversa\n{dif}"
            assert target_keys_args == target_keys_relation, "All target relation keys aren't in the target units or viceversa"
            
            counter = Counter(relations['relation_type'])
            print("Initial", split, counter)
            

            for file_key, file_source_df in source_arg_units.groupby(by='file_key'):
                file_target_df = target_arg_units[target_arg_units['file_key'] == file_key]
                file_relations = relations[relations['file_key'] == file_key]
                
                counter = Counter(file_relations['relation_type'])
                print("Initial File", split, counter)
                
                current_file_info = {
                    'file_key': [], 
                    'source_prop_id': [],
                    'target_prop_id': [],
                    'source_prop_text': [],
                    'target_prop_text': [],
                    'source_prop_type': [],
                    'target_prop_type': [],
                    'relation_type': [],
                    'distance': [],
                    'split': [],
                }
                
                for _, source_row in file_source_df.iterrows():
                    source_id = source_row['prop_id']
                    for _, target_row in file_target_df.iterrows():
                        target_id = target_row['prop_id']

                        # Same relations not allowed
                        if source_id == target_id:
                            continue

                        distance = target_id - source_id
                        # Distance is greater than the max alowed distance between propositions
                        if abs(distance) > max_proposition_distance:
                            continue


                        source_target_relation = file_relations[(file_relations['prop_id_target'] == target_id) & (file_relations['prop_id_source'] == source_id)]

                        if len(source_target_relation) == 0:
                            # No related propositions
                            relation_type = '' # No Relation
                            distance = 0 # Mock Distance
                            source_target_relation = pandas.concat([source_target_relation, pandas.DataFrame({
                                'prop_id_source': [source_id],
                                'prop_id_target': [target_id],
                                'relation_type': [relation_type],
                                'distance': [distance],
                                'file_key': [file_key]
                            })])
                            
                        if len(source_target_relation) > 1:
                            print("WARNING: Multiple relation with single source-target pair")
                            print(source_target_relation)

                        for _, relation_row in source_target_relation.iterrows():

                            assert relation_row['distance'] == distance, f"{relation_row['distance']} != {distance}"

                            # Adding data
                            current_file_info['file_key'].append(file_key)
                            current_file_info['source_prop_id'].append(source_id)
                            current_file_info['target_prop_id'].append(target_id)
                            current_file_info['source_prop_text'].append(source_row['prop_text'])
                            current_file_info['target_prop_text'].append(target_row['prop_text'])
                            current_file_info['source_prop_type'].append(source_row['prop_type'])
                            current_file_info['target_prop_type'].append(target_row['prop_type'])
                            current_file_info['relation_type'].append(relation_row['relation_type'])
                            current_file_info['distance'].append(distance)
                            current_file_info['split'].append(split)
                    
                current_file_info = pandas.DataFrame(current_file_info)
                counter = Counter(current_file_info['relation_type'])
                print("Final File", split, counter)
                break

                data_dataframe = pandas.concat([data_dataframe, current_file_info], ignore_index=True)
                    
            
        
    params['raw_data_dataframe'] = data_dataframe

    for split, data_dataframe in data_dataframe.groupby(by="split"):
        source_ds = tf.data.Dataset.from_tensor_slices(tf.constant(data_dataframe['source_prop_text'])).map(lambda x: sequence_vectorizer(x))
        target_ds = tf.data.Dataset.from_tensor_slices(tf.constant(data_dataframe['target_prop_text'])).map(lambda x: sequence_vectorizer(x))
        source_type_ds = tf.data.Dataset.from_tensor_slices(tf.constant(data_dataframe['source_prop_type'])).map(lambda x: proposition_encoder(proposition_tag_vectorizer([x])))
        target_type_ds = tf.data.Dataset.from_tensor_slices(tf.constant(data_dataframe['target_prop_type'])).map(lambda x: proposition_encoder(proposition_tag_vectorizer([x])))
        relation_type_ds = tf.data.Dataset.from_tensor_slices(tf.constant(data_dataframe['relation_type'])).map(lambda x: relation_encoder(relation_tag_vectorizer([x])))
        distance_ds = tf.data.Dataset.from_tensor_slices(list(data_dataframe['distance'].to_numpy(dtype=int))).map(lambda x: encode_distance(x, distance_encoding_bits))
        
        relation_counter = Counter(data_dataframe['relation_type'])
        print(relation_counter)
        
        # Order matters
        input_ds = tf.data.Dataset.zip((source_ds, target_ds, distance_ds))
        output_ds = tf.data.Dataset.zip((relation_type_ds, source_type_ds, target_type_ds))
        
        ds = tf.data.Dataset.zip((input_ds, output_ds))
        
        params[f"{split}_ds"] = ds

        
encode_datasets(params)

AssertionError: All source relation keys aren't in the source units or viceversa
{(5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay053.ann.conll'), (10, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay388.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay377.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay045.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay309.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay362.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay381.ann.conll'), (8, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay230.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay118.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay283.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay326.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay106.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay372.ann.conll'), (3, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay171.ann.conll'), (8, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay024.ann.conll'), (10, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay054.ann.conll'), (10, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay326.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay012.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay388.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay053.ann.conll'), (13, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay283.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay400.ann.conll'), (4, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay210.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay362.ann.conll'), (8, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay186.ann.conll'), (4, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay400.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay209.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay080.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay377.ann.conll'), (22, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay284.ann.conll'), (20, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay281.ann.conll'), (10, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay043.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay210.ann.conll'), (3, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay288.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay381.ann.conll'), (10, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay329.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay284.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay012.ann.conll'), (16, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay288.ann.conll'), (12, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay225.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay054.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay209.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay362.ann.conll'), (15, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay297.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay080.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay297.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay177.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay024.ann.conll'), (19, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay326.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay337.ann.conll'), (8, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay297.ann.conll'), (11, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay346.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay024.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay329.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay225.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay372.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay171.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay045.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay388.ann.conll'), (13, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay118.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay186.ann.conll'), (3, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay230.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay190.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay309.ann.conll'), (15, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay326.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay281.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay346.ann.conll'), (13, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay186.ann.conll'), (15, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay177.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay209.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay351.ann.conll'), (14, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay102.ann.conll'), (12, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay284.ann.conll'), (11, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay225.ann.conll'), (19, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay288.ann.conll'), (6, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay106.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay190.ann.conll'), (19, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay283.ann.conll'), (8, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay012.ann.conll'), (3, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay054.ann.conll'), (12, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay337.ann.conll'), (1, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay377.ann.conll'), (15, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay309.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay264.ann.conll'), (15, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay281.ann.conll'), (5, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay043.ann.conll'), (3, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay351.ann.conll'), (10, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay102.ann.conll'), (9, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay177.ann.conll'), (13, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay297.ann.conll'), (13, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay177.ann.conll'), (13, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay264.ann.conll'), (3, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay102.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay377.ann.conll'), (7, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay045.ann.conll'), (2, '../../data/parsed_to_conll/persuasive_essays_paragraph/dev/essay337.ann.conll')}

## Building the model

Two versions of the model can be buit. The difference is the presence or not of an attention layer.

In [13]:
from attention import apply_attention

# Build Model

def build_model(params: dict):
    linear_embedders_dims = params['linear_embedders_dims'] # [50, 50, 50, 300]
    max_sequence_size = params['max_size_prop']
    words_amount = len(params['sequence_vectorizer'].get_vocabulary()) # Plus UNK and Pad
    embedding_dim = params['dim']
    embedding_matrix = params['embedding_matrix']
    regularizer_weight = params['regularizer_weight']
    dropout = params['dropout']
    final_embedding_dimension = params['encoder_dense_units']
    final_layer_size = params['final_size']
    pool_size = params['encoder_pool_size']
    distance_encoding_bits = params['max_distance_encoded'] * 2
    lstm_units = params['lstm_units']
    res_size = params['residual_size']
    relation_amount = len(params['relation_tag_vectorizer'].get_vocabulary()) # Plus UNK and Pad
    proposition_tag_amount = len(params['proposition_tag_vectorizer'].get_vocabulary()) # Plus UNK and Pad
    with_attention = params['with_attention']
    
    def build_embedder(max_sequence_size, words_amount, embedding_dim, embedding_matrix, linear_layers_dims, regularizer_weight, dropout):
        """
        Builds a proposition embedder
        """
        
        # Input layer
        int_sequence_input = keras.Input(
            shape=(max_sequence_size,), 
            dtype="int64"
        )

        # Embedding layer, convert an index vector into a embedding vector, by accessing embedding_matrix
        embedding_layer = layers.Embedding(
            words_amount,
            embedding_dim,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable=False,
            input_length=max_sequence_size,
        )

        initial_layer = model_layers = embedding_layer(int_sequence_input)

        def get_linear_layer(dense_dim, linear_layer=None):
            """
            Creates a single dense layer for the embedder
            """
            
            if linear_layer is None:
                input_vec = keras.Input(shape=(embedding_dim,))
            else:
                input_vec = linear_layer
            
            linear_layer = layers.Dense(
                units=dense_dim,
                activation=None,
                kernel_initializer='he_normal',
                kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                bias_regularizer=keras.regularizers.l2(regularizer_weight)
            )(input_vec)
            
            linear_layer = layers.BatchNormalization()(linear_layer)
            linear_layer = layers.Dropout(dropout)(linear_layer)
            linear_layer = layers.Activation('relu')(linear_layer)
            return input_vec, linear_layer
        
        # Linear transformation
        linear_input, linear_layer = get_linear_layer(linear_layers_dims[0])
        for dim in linear_layers_dims[1:]:
            _, linear_layer = get_linear_layer(dim, linear_layer)
        linear_layer = keras.Model(inputs=linear_input, outputs=linear_layer)
        
        # Apply linear_layer to each word embedding
        model_layers = layers.TimeDistributed(linear_layer)(model_layers)
        
        # Residual connection
        model_layers = layers.Add()([initial_layer, model_layers])
        
        return int_sequence_input, model_layers
    
    def build_dense_encoder(max_sequence_size, embedding_dim, final_dimension, pool_size, regularizer_weight):
        
        # Input layer
        embedding_inputs = keras.Input(
            shape=(max_sequence_size, embedding_dim)
        )
        
        encoder_layer = embedding_inputs
        
        linear_layer = layers.Dense(
            units=final_dimension,
            activation='relu',
            kernel_regularizer=keras.regularizers.l2(regularizer_weight),
            bias_regularizer=keras.regularizers.l2(regularizer_weight)
        )
        
        # Apply linear_layer to each word embedding
        encoder_layer = layers.TimeDistributed(linear_layer)(encoder_layer)
        
        # Average the words embeddings
        encoder_layer = layers.AveragePooling1D(
            pool_size=pool_size,
        )(encoder_layer)
    
        encoder_layer = layers.BatchNormalization()(encoder_layer)
    
        return keras.Model(inputs=embedding_inputs, outputs=encoder_layer)
    
    def build_bilstm_encoder(sequence_size, encoded_dim, lstm_units, dropout, regularizer_weight, return_sequences):

        # Input layer
        embedding_inputs = keras.Input(
            shape=(sequence_size, encoded_dim)
        )
        
        bilstm_layer = layers.Bidirectional(
            layers.LSTM(
                units=lstm_units,
                dropout=dropout,
                recurrent_dropout=dropout,
                kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                recurrent_regularizer=keras.regularizers.l2(regularizer_weight),
                bias_regularizer=keras.regularizers.l2(regularizer_weight),
                return_sequences=return_sequences,
            ),
            merge_mode='mul'
        )(embedding_inputs)
        
        return keras.Model(inputs=embedding_inputs, outputs=bilstm_layer)
    
    def apply_resnet(input_layer, regularizer_weight, res_size, dropout):
        prev_layer = input_layer
        prev_block = prev_layer
        
        layers_dims = (2, 2)
        blocks = layers_dims[0]
        res_layers = layers_dims[1]

        shape = int(np.shape(input_layer)[1])

        for i in range(1, blocks + 1):
            for j in range(1, res_layers):
                prev_layer = layers.BatchNormalization()(prev_layer)

                prev_layer = layers.Dropout(dropout)(prev_layer)

                prev_layer = layers.Activation('relu')(prev_layer)

                prev_layer = layers.Dense(
                    units=res_size,
                    activation=None,
                    kernel_initializer='he_normal',
                    kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                    bias_regularizer=keras.regularizers.l2(regularizer_weight),
                )(prev_layer)
            
            prev_layer = layers.BatchNormalization()(prev_layer)

            prev_layer = layers.Dropout(dropout)(prev_layer)

            prev_layer = layers.Activation('relu')(prev_layer)

            prev_layer = layers.Dense(units=shape,
                               activation=None,
                               kernel_initializer='he_normal',
                               kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                               bias_regularizer=keras.regularizers.l2(regularizer_weight),
                               )(prev_layer)

            prev_layer = layers.Add()([prev_block, prev_layer])
            prev_block = prev_layer

        return prev_block
    
    def create_single_model(index):
        """
        Create a single model for the ensemble learning
        """
        
        input_distance = keras.Input(
            shape=(distance_encoding_bits, )
        )

        input_source_embedder, source_embedder = build_embedder(
            max_sequence_size, 
            words_amount, 
            embedding_dim, 
            embedding_matrix, 
            linear_embedders_dims, 
            regularizer_weight, 
            dropout
        )

        input_target_embedder, target_embedder = build_embedder(
            max_sequence_size, 
            words_amount, 
            embedding_dim, 
            embedding_matrix, 
            linear_embedders_dims, 
            regularizer_weight, 
            dropout
        )

        dense_encoder = build_dense_encoder(
            max_sequence_size, 
            embedding_dim, 
            final_embedding_dimension, 
            pool_size, 
            regularizer_weight
        )

        bilstm_encoder = build_bilstm_encoder(
            max_sequence_size, 
            final_embedding_dimension, 
            lstm_units, 
            dropout, 
            regularizer_weight,
            with_attention
        )

        # Apply dense encoder to source and target sequence features
        prev_source_layers = source_layers = dense_encoder(source_embedder)
        prev_target_layers = target_layers = dense_encoder(target_embedder)

        # Apply bilstm encoder to source and target sequence features
        source_layers = bilstm_encoder(source_layers)
        target_layers = bilstm_encoder(target_layers)

        if with_attention:
            source_layers, target_layers = apply_attention(
                input_source_embedder, 
                input_target_embedder,
                prev_source_layers,
                prev_target_layers,
                source_layers,
                target_layers,
                final_layer_size,
            )

        # Concatenate source and target sequence features with other features 
        model_layers = layers.Concatenate()([source_layers, target_layers, input_distance])
        model_layers = layers.BatchNormalization()(model_layers)
        model_layers = layers.Dropout(dropout)(model_layers)

        # Middle dense layer
        model_layers = layers.Dense(
            units=final_layer_size,
            activation='relu',
            kernel_initializer='he_normal',
            kernel_regularizer=keras.regularizers.l2(regularizer_weight),
            bias_regularizer=keras.regularizers.l2(regularizer_weight),
        )(model_layers)

        # Apply a residual network
        model_layers = apply_resnet(
            model_layers,
            regularizer_weight,
            res_size,
            dropout
        )


        model_layers = layers.BatchNormalization()(model_layers)
        model_layers = layers.Dropout(dropout)(model_layers)

        # Classifiers
        relation_classifier = layers.Dense(
            units=relation_amount,
            activation='softmax',
            name="relation",
        )(model_layers)

        source_classifier = layers.Dense(
            units=proposition_tag_amount,
            activation='softmax',
            name="source",
        )(model_layers)

        target_classifier = layers.Dense(
            units=proposition_tag_amount,
            activation='softmax',
            name="target",
        )(model_layers)

        # Creating final model
        model = keras.Model(
            inputs=(input_source_embedder, input_target_embedder, input_distance),
            outputs=(relation_classifier, source_classifier, target_classifier)
        )
    
        model.summary()
        
        return model
    
    model = create_single_model(0)
    
    params[params['model_name']] = model

build_model(params)

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 72)]         0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 72)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 72, 300)      1881300     ['input_6[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 72, 300)      1881300     ['input_4[0][0]']                
                                                                                            

                                                                                                  
 att_weights_source (Activation  (None, 72)          0           ['att_masked_addition_source[0][0
 )                                                               ]']                              
                                                                                                  
 att_weights_target (Activation  (None, 72)          0           ['att_masked_addition_target[0][0
 )                                                               ]']                              
                                                                                                  
 att_weights_reshape_source (Re  (None, 72, 1)       0           ['att_weights_source[0][0]']     
 shape)                                                                                           
                                                                                                  
 att_weigh

 dropout_13 (Dropout)           (None, 20)           0           ['batch_normalization_14[0][0]'] 
                                                                                                  
 relation (Dense)               (None, 6)            126         ['dropout_13[0][0]']             
                                                                                                  
 source (Dense)                 (None, 4)            84          ['dropout_13[0][0]']             
                                                                                                  
 target (Dense)                 (None, 4)            84          ['dropout_13[0][0]']             
                                                                                                  
Total params: 3,876,366
Trainable params: 111,326
Non-trainable params: 3,765,040
__________________________________________________________________________________________________


In [14]:
from utils import create_lr_annealing_function

def train_and_save_model(params: dict):
    model_name = params['model_name']
    epochs = 2 # params['epochs'] # TODO
    batch_size = params['batch_size']
    model = params[model_name]
    loss_weights = params['loss_weights']
    lr_alpha = params['lr_alpha']
    lr_kappa = params['lr_kappa']
    relation_amount = len(params['relation_tag_vectorizer'].get_vocabulary())
    proposition_amount = len(params['proposition_tag_vectorizer'].get_vocabulary())
    metrics = params['metrics']
    beta_1 = params['beta_1']
    beta_2 = params['beta_2']
    min_delta = params['min_delta']
    patience = params['patience']
    train_ds = params['train_ds'].batch(batch_size)
    val_ds = params['dev_ds'].batch(batch_size)
    
    
    # Optimizer
    lr_function = create_lr_annealing_function(initial_lr=lr_alpha, k=lr_kappa)
    lr_scheduler = keras.callbacks.LearningRateScheduler(lr_function)
    optimizer = tf.optimizers.Adam(
        learning_rate=lr_function(0),
        beta_1=beta_1,
        beta_2=beta_2,
    )
    
    # EarlyStopping
    early_stopping = keras.callbacks.EarlyStopping(
        min_delta=min_delta,
        patience=patience,
        verbose=1,
    )
    
    # Metrics
    metrics = {
        'relation': metrics.copy(),
        'source': metrics.copy(),
        'target': metrics.copy(),
    }
    for name, num_classes in [
            ('relation', relation_amount), 
            ('source', proposition_amount), 
            ('target', proposition_amount)
        ]:
        
        f1 = tfa.metrics.F1Score(
            num_classes=num_classes,
            average=None,
            name=f'{name}F1',
        )
        f1_macro = tfa.metrics.F1Score(
            num_classes=num_classes,
            average='macro',
            name=f'{name}F1Macro',
        )
        f1_micro = tfa.metrics.F1Score(
            num_classes=num_classes,
            average='micro',
            name=f'{name}F1Micro',
        )
        metrics[name].extend([
            f1,
            f1_macro,
            f1_micro
        ])
    
    model.compile(
        loss='categorical_crossentropy', # Apply this loss function to all outputs
        loss_weights=loss_weights, # Weights for the sum of the loss functions
        optimizer=optimizer,
        metrics=metrics
    )
    
    # Train
    history = model.fit(train_ds,
                  batch_size=batch_size, 
                  epochs=epochs, 
                  validation_data=val_ds,
                  callbacks=[
                      lr_scheduler,
                      early_stopping,
                  ])
    
    model.save(params["model_path"], save_format='tf')
    
    history = history.history
    for key in history:
        values = np.array(history[key]).tolist()
        history[key] = values
    params['history'] = history
    with Path(params['export_path'], f"{model_name}_history.json").open('w') as f:
        json.dump(history, f)

train_and_save_model(params)

	NEW LR: 0.003
	NEW LR: 0.003
Epoch 1/2
	NEW LR: 0.0029970029970029974
Epoch 2/2
INFO:tensorflow:Assets written to: ../../data/link_prediction/persuasive_essays_paragraph/model/assets


In [None]:
# Load Model
def load_saved_model(params: dict, path_key: str, param_save_key: str):
    title = model_name = params["model_name"]
    path = Path(params['export_path'], f"{model_name}_history.json")
    history = json.load(path.open())
    params['history'] = history
    
    model = keras.models.load_model(params[path_key])
    params[param_save_key] = model

    
load_saved_model(params, 'model_path', params['model_name'])

In [15]:
# Evaluate model
def evaluate_model(params: dict):
    model = params[params['model_name']]
    batch_size = params['batch_size']
    test_ds = params['test_ds'].batch(batch_size)
    
    results = model.evaluate(test_ds, batch_size=batch_size)


evaluate_model(params)



In [None]:
def plot_history(params: dict):
    history = params['history']
    relation_labels = [x if x else "None" for x in params['relation_tag_vectorizer'].get_vocabulary()]
    proposition_labels = [x if x else "None" for x in params['proposition_tag_vectorizer'].get_vocabulary()]
        
    def plot_list(values, label):
        X = [i for i in range(len(values))]
        plt.plot(X, values, label=label)
    
    def show_plot(title, x_label="Epoch", y_label=""):
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()
        plt.show()
        
    def plot_categorical_list(values, labels, bar_plot=False):
        values = np.array(values).T
        plt.xticks(rotation = 45)
        if bar_plot:
            plt.bar(labels, [x[-1] for x in values])
        else:
            for label, label_values in zip(labels, values):
                X = [i for i in range(len(label_values))]
                plt.plot(X, label_values, label=label)
    
    for key, value in history.items():
        value = np.array(value)
        if len(value.shape) == 1: # List values
            plot_list(value, key)
            show_plot(key, y_label="values")
        else: # Categorical values
            labels = relation_labels if 'relation' in key else proposition_labels
            plot_categorical_list(value, labels, bar_plot=True)
            show_plot(key + " bar", x_label='categories', y_label="values")
            plot_categorical_list(value, labels, bar_plot=False)
            show_plot(key, y_label="values")
        
plot_history(params)

In [None]:
def show_info_tables(params: dict):
    history = params['history']
    relation_labels = [x if x else "None" for x in params['relation_tag_vectorizer'].get_vocabulary()]
    proposition_labels = [x if x else "None" for x in params['proposition_tag_vectorizer'].get_vocabulary()]




In [16]:
class LinkPredictionModel(keras.Model):
    
    def __init__(self,
                 model,
                 sequence_vectorizer, 
                 proposition_tag_vectorizer, 
                 relation_tag_vectorizer, 
                 distance_encoding_bits,
                 batch_size=32
                ):
        super().__init__()
        self.model = model
        self.sequence_vectorizer = sequence_vectorizer
        self.proposition_tag_vectorizer = proposition_tag_vectorizer
        self.relation_tag_vectorizer = relation_tag_vectorizer
        self.distance_encoding_bits = distance_encoding_bits
        self.batch_size = batch_size
    
    def decode_outputs(self, outputs):
        propositions = self.proposition_tag_vectorizer.get_vocabulary()
        relations = self.relation_tag_vectorizer.get_vocabulary()
        
        result = []
        relation_outputs, source_outputs, target_outputs = outputs 
        
        for relation_output, source_output, target_output in zip(relation_outputs, source_outputs, target_outputs):
            relation_tag = relations[tf.argmax(relation_output)]
            source_tag = propositions[tf.argmax(source_output)]
            target_tag = propositions[tf.argmax(target_output)]
            result.append((relation_tag, source_tag, target_tag))
            
        return result
    
    def call(self, source_inputs, target_inputs, distance_inputs):
        
        source_ds = tf.data.Dataset.from_tensor_slices(tf.constant(source_inputs)).map(lambda x: self.sequence_vectorizer(x))
        target_ds = tf.data.Dataset.from_tensor_slices(tf.constant(target_inputs)).map(lambda x: self.sequence_vectorizer(x))
        distance_ds = tf.data.Dataset.from_tensor_slices(tf.constant(distance_inputs)).map(lambda x: encode_distance(x, self.distance_encoding_bits))
    
        inputs_ds = tf.data.Dataset.zip((source_ds, target_ds, distance_ds)).batch(self.batch_size)
        
        outputs = self.model(list(inputs_ds))
        
        return self.decode_outputs(outputs)

def build_link_prediction_model(params: dict):
    model = params[params['model_name']]
    sequence_vectorizer = params['sequence_vectorizer']
    proposition_tag_vectorizer = params['proposition_tag_vectorizer']
    relation_tag_vectorizer = params['relation_tag_vectorizer']
    distance_encoding_bits = params['max_distance_encoded'] * 2
    
    
    model = LinkPredictionModel(
        model=model,
        sequence_vectorizer=sequence_vectorizer,
        proposition_tag_vectorizer=proposition_tag_vectorizer,
        relation_tag_vectorizer=relation_tag_vectorizer,
        distance_encoding_bits=distance_encoding_bits
    )
    
    source = "muchos años , la gente tenía que pagar una gran cantidad de dinero prar enviar sus cartas , y sus pagos estaban relacionados con el peso de sus cartas o cajas , y muchos accidentes pueden causar el problema de que el correo no se pueda entregar"
    target = "electrónico puede contarse como uno de los resultados más beneficiosos de la tecnología moderna"
    distance = -1
    
    result = model([source], [target], [distance])
    
    print(result)

    params[params['model_name'] + "_final"] = model
    
build_link_prediction_model(params)

[('', 'Premise', 'Claim')]


In [17]:
def compute_statistic(params: dict):
    data_dataframe = params['raw_data_dataframe']
    data_dataframe = data_dataframe[data_dataframe['split'] == 'test']
    
    model = params[params['model_name'] + "_final"]
    
    statistic = {
        'source_prop_text': [],
        'target_prop_text': [],
        'source_prop_type': [],
        'target_prop_type': [],
        'relation_type': [],
        'infered_source_prop_type': [],
        'infered_target_prop_type': [],
        'infered_relation_type': [], 
        'distance': [],
    }
    
    source_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['source_prop_text'])
    target_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['target_prop_text'])
    distance_ds = tf.data.Dataset.from_tensor_slices(list(data_dataframe['distance'].to_numpy(dtype=int)))
    source_tag_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['source_prop_type'])
    target_tag_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['target_prop_type'])
    relation_tag_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['relation_type'])
    
    batch_num = 1
    for sources, targets, distances, source_tags, target_tags, relation_tags in tf.data.Dataset.zip((source_ds, target_ds, distance_ds, source_tag_ds, target_tag_ds, relation_tag_ds)).batch(32):
        print("batch:", batch_num)
        batch_num += 1
        
        inference = model(sources, targets, distances)
        
        statistic['source_prop_text'].extend(sources)
        statistic['target_prop_text'].extend(targets)
        statistic['source_prop_type'].extend(source_tags)
        statistic['target_prop_type'].extend(target_tags)
        statistic['relation_type'].extend(relation_tags)
        statistic['distance'].extend(distances)
        
        for relation_tag, source_tag, target_tag in inference:
            statistic['infered_source_prop_type'].append(source_tag)
            statistic['infered_target_prop_type'].append(target_tag)
            statistic['infered_relation_type'].append(relation_tag)
    
    statistic = pandas.DataFrame(statistic)
    print(statistic.describe())
    params['statistic'] = statistic
    
compute_statistic(params)

batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
batch: 46
batch: 47
batch: 48
batch: 49
batch: 50
batch: 51
batch: 52
batch: 53
batch: 54
batch: 55
batch: 56
batch: 57
batch: 58
batch: 59
batch: 60
batch: 61
batch: 62
batch: 63
batch: 64
batch: 65
batch: 66
batch: 67
batch: 68
batch: 69
batch: 70
batch: 71
batch: 72
batch: 73
batch: 74
batch: 75
batch: 76
batch: 77
batch: 78
batch: 79
batch: 80
batch: 81
batch: 82
batch: 83
batch: 84
batch: 85
batch: 86
batch: 87
batch: 88
batch: 89
batch: 90
batch: 91
batch: 92
batch: 93
batch: 94
batch: 95
batch: 96
batch: 97
batch: 98
batch: 99
batch: 100
batch: 1

In [18]:

def show_statistic(params: dict):
    statistic = params['statistic']
#     statistic = {
#         'source_prop_text': [],
#         'target_prop_text': [],
#         'source_prop_type': [],
#         'target_prop_type': [],
#         'relation_type': [],
#         'infered_source_prop_type': [],
#         'infered_target_prop_type': [],
#         'infered_relation_type': [], 
#         'distance': [],
#     }
    true_positive_relation = statistic[statistic['relation_type'] == statistic['infered_relation_type']]
    true_positive_source = statistic[statistic['source_prop_type'] == statistic['infered_source_prop_type']]
    true_positive_target = statistic[statistic['target_prop_type'] == statistic['infered_target_prop_type']]
    
    print("Accuracy:")
    relation_accuracy = len(true_positive_relation) / len(statistic)
    print("Relation Accuracy:", relation_accuracy)
    source_accuracy = len(true_positive_source) / len(statistic)
    print("Source Accuracy:", source_accuracy)
    target_accuracy = len(true_positive_target) / len(statistic)
    print("Target Accuracy:", target_accuracy)
    
    source_counter = Counter(statistic['source_prop_type'].map(lambda x: x.numpy().decode()))
    target_counter = Counter(statistic['target_prop_type'].map(lambda x: x.numpy().decode()))
    relation_counter = Counter(statistic['relation_type'].map(lambda x: x.numpy().decode()))
    print(source_counter)
    print(target_counter)
    print(relation_counter)
    
show_statistic(params)

Accuracy:
Relation Accuracy: 0.9833867677062081
Source Accuracy: 1.0
Target Accuracy: 0.6132322937918974
Counter({'Premise': 3431})
Counter({'Claim': 2104, 'Premise': 1327})
Counter({'': 2607, 'supports': 767, 'attacks': 42, 'supports_Inverse': 12, 'attacks_Inverse': 3})


In [None]:
def use_model(params: dict):
    # TODO 
    data_path = params['']