# Link Prediction

## TODO

- [ ] Remove unnecessary tags in relation and proposition vectorizers

In [1]:
from pathlib import Path
if __name__ == "__main__":
    # Make the repository root importable so the project packages imported
    # below (e.g. `corpus_parser`) resolve.
    import sys
    try:
        # Run as a script: walk up from this file's location.
        _path = str(Path(__file__, "..", "..", "..").resolve())
    except NameError:
        # `__file__` is undefined (e.g. inside a notebook kernel): fall back
        # to a path relative to the current working directory.
        _path = str(Path("../../").resolve())
    if _path not in sys.path:
        sys.path.insert(0, _path)
        

from corpus_parser.unified_parser import UnifiedParser

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from keras import layers
import keras.backend as K

import numpy as np
import json
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.metrics import ConfusionMatrixDisplay

import pandas
from pandas.core.common import SettingWithCopyWarning

# TODO: Fix these issues instead of silencing the warning.
import warnings
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)



In [2]:
# Initial model parameters and filesystem layout

INFO_TAG = "persuasive_essays_paragraph"

# All locations are derived from DATA_PATH (relative to the working directory).
DATA_PATH = Path("../../data/")
EXPORT_PATH = DATA_PATH / 'link_prediction' / INFO_TAG
GLOVE_PATH = DATA_PATH / 'glove.840B.300d.txt'
TO_PROCESS_DATADIR = Path(DATA_PATH)
PROCESSED_DATADIR = Path(DATA_PATH)
SEGMENTER_MODEL_DIR = Path(DATA_PATH)
DIM = 300

params = {
    'in_production': False,

    # --- Training ---
    'epochs': 70,
    'batch_size': 20,
    'metrics': ['acc'],

    # --- Ensemble ---
    'ensemble_amount': 10,

    # --- Architecture ---
    'dim': DIM,
    'dropout': 0.1,
    'lstm_size': 200,
    'max_distance_encoded': 5,
    'linear_embedders_dims': [50, 50, 50, DIM],
    'regularizer_weight': 0.001,
    'encoder_dense_units': 50,
    'encoder_pool_size': 1,  # With 1 the input is left untransformed.
    'lstm_units': 25,
    'final_size': 20,
    'residual_size': 50,
    'with_attention': True,  # Whether the attention block is used
    'loss_weights': {
        'relation': 10,
        'source': 1,
        'target': 1,
    },

    # --- Adam optimizer ---
    'lr_alpha': 0.003,
    'lr_kappa': 0.001,
    'beta_1': 0.9,
    'beta_2': 0.999,

    # --- Early stopping ---
    'min_delta': 0,
    'patience': 5,

    # --- Corpus locations ---
    # Alternative corpora previously used for 'corpus_path':
    #     str(Path(DATA_PATH, 'projection', INFO_TAG))
    #     str(Path(DATA_PATH, 'corpus', 'ArgumentAnnotatedEssays-2.0', 'train-test-split'))
    'corpus_path': str(DATA_PATH / 'parsed_to_conll' / INFO_TAG),
    'glove_path': str(EXPORT_PATH / 'glove.npz'),
    'export_path': str(EXPORT_PATH),
    'model_path': str(Path(EXPORT_PATH, "model")),
    'glove_raw_path': str(GLOVE_PATH),

    # --- Text vectorizer ---
    'sequence_standardize': None,
    'sequence_split': 'whitespace',

    # --- Corpus ---
    'max_proposition_distance': 10,  # TODO How to set this value

    # --- Data directories ---
    'to_process_data_path': str(TO_PROCESS_DATADIR),  # Directory with text to be processed
    'processed_data_path': str(PROCESSED_DATADIR),  # Directory to save the processed data
}

# The model name records whether the attention variant is being built.
params['model_name'] = "model_attention" if params['with_attention'] else "model"

In [3]:
# Load Dataset 

def find_duplicates(dataframe, first, second, group):
    """
    Print every pair of rows that mirror each other inside a group.

    Two rows mirror each other when the `first` value of one equals the
    `second` value of the other and vice versa. Rows are only compared with
    rows of the same `group` value.

    dataframe: pandas DataFrame to inspect
    first: name of the first column of the pair
    second: name of the second column of the pair
    group: column (or columns) used to group the rows

    return: None, the findings are printed
    """
    for group_key, group_frame in dataframe.groupby(by=group):
        records = [record for _, record in group_frame.iterrows()]
        for position, left in enumerate(records):
            # Only look forward so each unordered pair is checked once.
            for right in records[position + 1:]:
                mirrored = left[first] == right[second] and left[second] == right[first]
                if not mirrored:
                    continue
                print("DUPLICATED ROW")
                print(group_key)
                print(left)
                print(right)

                
def extract_propositions(params: dict):
    """
    Parse the corpus splits and store the extracted data in `params`.

    For each of the "dev", "test" and "train" directories under
    `params['corpus_path']` the parsed documents are flattened into three
    DataFrames (source propositions, target propositions and relations).
    Every relation is also added in the opposite direction with the suffix
    "_Inverse" and a negated distance.

    Keys written to `params`:
        '{split}_source_propositions', '{split}_target_propositions',
        '{split}_relations', 'vocabulary', 'relation_tags',
        'proposition_tags', 'max_size_prop', 'max_amount_doc',
        'sequence_vectorizer', 'relation_tag_vectorizer',
        'proposition_tag_vectorizer', 'relation_encoder',
        'proposition_encoder'.

    params: configuration dictionary; reads 'corpus_path',
        'sequence_standardize' and 'sequence_split'.

    return: None, `params` is updated in place
    """
    corpus_path = Path(params['corpus_path'])

    parser = UnifiedParser()

    names = [
        "dev",
        "test",
        "train",
    ]

    proposition_columns = ['prop_id', 'prop_type', 'prop_text', 'file_key']
    relation_columns = ['prop_id_source', 'prop_id_target', 'relation_type', 'distance', 'file_key']

    relation_tags = set()
    proposition_tags = set()

    source_vocabulary = set()
    target_vocabulary = set()

    # Max amount of propositions in a document
    max_amount_source_in_doc = 0
    max_amount_target_in_doc = 0

    # Max amount of tokens in a proposition
    max_size_in_source_prop = 0
    max_size_in_target_prop = 0

    def longest_proposition(propositions):
        """Token count of the longest `prop_text` in `propositions` (0 when empty)."""
        if propositions.empty:
            return 0
        return propositions['prop_text'].str.split().str.len().max()

    for name in names:

        proposition_dict = parser.parse_dir(corpus_path / name)

        # Each list starts with an empty, labelled frame so the split keeps
        # the expected columns even when no document is parsed. The frames
        # are concatenated once after the loop instead of once per document.
        source_frames = [pandas.DataFrame(columns=proposition_columns)]
        target_frames = [pandas.DataFrame(columns=proposition_columns)]
        relation_frames = [pandas.DataFrame(columns=relation_columns)]

        for key, (args_unit, relations, _) in proposition_dict.items():
            # Work on copies: adding columns to a column-selection slice is
            # what triggers pandas' SettingWithCopyWarning.
            args_unit = args_unit[['prop_id', 'prop_type', 'prop_text']].copy()
            args_unit['file_key'] = [key] * len(args_unit)

            relations = relations[['prop_id_source', 'prop_id_target', 'relation_type']].copy()
            # Vectorised; also valid for documents without any relation.
            relations['distance'] = relations['prop_id_target'] - relations['prop_id_source']
            relations['file_key'] = [key] * len(relations)

            source_prop = args_unit[args_unit['prop_id'].isin(relations['prop_id_source'])]
            target_prop = args_unit[args_unit['prop_id'].isin(relations['prop_id_target'])]

            source_vocabulary.update(t for s in source_prop['prop_text'] for t in s.split())
            target_vocabulary.update(t for s in target_prop['prop_text'] for t in s.split())

            max_size_in_source_prop = max(max_size_in_source_prop, longest_proposition(source_prop))
            max_size_in_target_prop = max(max_size_in_target_prop, longest_proposition(target_prop))

            max_amount_source_in_doc = max(max_amount_source_in_doc, len(relations['prop_id_source'].drop_duplicates()))
            max_amount_target_in_doc = max(max_amount_target_in_doc, len(relations['prop_id_target'].drop_duplicates()))

            source_frames.append(source_prop)
            target_frames.append(target_prop)
            relation_frames.append(relations)

        current_source_arg_units = pandas.concat(source_frames, ignore_index=True)
        current_target_arg_units = pandas.concat(target_frames, ignore_index=True)
        current_relations = pandas.concat(relation_frames, ignore_index=True)

        # Add Inverse Relations: swap source and target, tag the relation
        # type and negate the distance.
        inverse_relations = pandas.DataFrame({
            'prop_id_source': current_relations['prop_id_target'],
            'prop_id_target': current_relations['prop_id_source'],
            'relation_type': current_relations['relation_type'] + "_Inverse",
            'distance': -current_relations['distance'],
            'file_key': current_relations['file_key'],
        })

        # Debug aid: calling
        #     find_duplicates(current_relations, 'prop_id_source', 'prop_id_target', 'file_key')
        # here reports relations that already exist in both directions
        # before the inverse relations are appended.

        current_relations = pandas.concat([current_relations, inverse_relations], ignore_index=True)

        params[f'{name}_source_propositions'] = current_source_arg_units
        params[f'{name}_target_propositions'] = current_target_arg_units
        params[f'{name}_relations'] = current_relations

        print(name, "relations", len(current_relations))
        print(name, "source argumentative units", len(current_source_arg_units))
        print(name, "target argumentative units", len(current_target_arg_units))

        relation_tags.update(current_relations['relation_type'])
        proposition_tags.update(current_source_arg_units['prop_type'])
        proposition_tags.update(current_target_arg_units['prop_type'])

    vocabulary = source_vocabulary.union(target_vocabulary)
    params['vocabulary'] = vocabulary
    print("Vocab size", len(vocabulary))

    # Sorted so the tag order does not depend on set iteration order, which
    # varies between interpreter runs.
    relation_tags = sorted(relation_tags)
    proposition_tags = sorted(proposition_tags)
    print("Relation tags", relation_tags)
    print("Proposition tags", proposition_tags)
    params['relation_tags'] = relation_tags
    params['proposition_tags'] = proposition_tags

    max_size_prop = max(max_size_in_source_prop, max_size_in_target_prop)
    max_amount_doc = max(max_amount_source_in_doc, max_amount_target_in_doc)
    params['max_size_prop'] = max_size_prop
    params['max_amount_doc'] = max_amount_doc

    print('max_size_prop', max_size_prop)
    print('max_amount_doc', max_amount_doc)

    # Vectorizers
    # The token budget is sized from the vocabulary of all splits, but the
    # layer is adapted on the train propositions only.
    sequence_vectorizer = layers.TextVectorization(
        output_mode="int",
        max_tokens=len(vocabulary) + 2,  # Plus PAD and UNK
        output_sequence_length=int(max_size_prop),
        standardize=params['sequence_standardize'],
        split=params['sequence_split']
    )
    sequence_vectorizer.adapt(pandas.concat([
        params['train_source_propositions'],
        params['train_target_propositions'],
    ], ignore_index=True)['prop_text'])
    params['sequence_vectorizer'] = sequence_vectorizer

    relation_tag_vectorizer = layers.TextVectorization(
        output_mode="int",
        max_tokens=len(relation_tags) + 2,  # Plus PAD and UNK
        output_sequence_length=1,
        standardize=None,
        split=None
    )
    relation_tag_vectorizer.adapt(relation_tags)
    params['relation_tag_vectorizer'] = relation_tag_vectorizer

    proposition_tag_vectorizer = layers.TextVectorization(
        output_mode="int",
        max_tokens=len(proposition_tags) + 2,  # Plus PAD and UNK
        output_sequence_length=1,
        standardize=None,
        split=None
    )
    proposition_tag_vectorizer.adapt(proposition_tags)
    params['proposition_tag_vectorizer'] = proposition_tag_vectorizer

    # One-Hot Encoders, one slot per entry of the matching vectorizer vocabulary
    relation_encoder = layers.CategoryEncoding(
        num_tokens=len(relation_tag_vectorizer.get_vocabulary()),
        output_mode="one_hot",
    )
    params['relation_encoder'] = relation_encoder

    proposition_encoder = layers.CategoryEncoding(
        num_tokens=len(proposition_tag_vectorizer.get_vocabulary()),
        output_mode="one_hot",
    )
    params['proposition_encoder'] = proposition_encoder
    
    
# Parse the corpus; the split tables, tag lists and vectorizers are stored in `params`.
extract_propositions(params)

dev relations 652
dev source argumentative units 326
dev target argumentative units 144
test relations 1618
test source argumentative units 809
test target argumentative units 365
train relations 5394
train source argumentative units 2697
train target argumentative units 1198
Vocab size 7331
Relation tags ['supports_Inverse', 'attacks_Inverse', 'attacks', 'supports']
Proposition tags ['Premise', 'Claim']
max_size_prop 72
max_amount_doc 20


In [4]:

def creating_glove_embeddings(params: dict):
    """
    Load or build the embedding matrix and store it in `params['embedding_matrix']`.

    If the compressed cache at `params['glove_path']` exists it is loaded.
    When `params['sequence_vectorizer']` is available, the cached matrix is
    only accepted if its shape matches (vocabulary size, `params['dim']`);
    otherwise it is rebuilt. A vocabulary of the same size but different
    content is not detected.

    When building, row `i` of the matrix holds the vector of the i-th entry
    of the vectorizer vocabulary, read from the raw text file at
    `params['glove_raw_path']`. Words without a vector keep a zero row.
    Lines whose whitespace-separated field count is not `dim + 1` are
    skipped. The result is saved to `params['glove_path']`.

    params: configuration dictionary; reads 'glove_path', 'glove_raw_path',
        'dim' and 'sequence_vectorizer'.

    return: None, `params` is updated in place
    raises: KeyError if the matrix has to be built and
        'sequence_vectorizer' is missing from `params`.
    """
    vectorizer = params.get('sequence_vectorizer')
    word_to_index = None
    if vectorizer is not None:
        word_to_index = {word: index for index, word in enumerate(vectorizer.get_vocabulary())}

    if Path(params["glove_path"]).exists():
        # Use a context manager so the archive file handle is released.
        with np.load(params["glove_path"]) as cached:
            embedding_matrix = cached["embeddings"]

        cache_is_stale = (
            word_to_index is not None
            and embedding_matrix.shape != (len(word_to_index), params['dim'])
        )
        if not cache_is_stale:
            print("Glove Embedding Matrix Found")
            params['embedding_matrix'] = embedding_matrix
            return
        print('Glove Embedding Matrix Found with shape {}, expected {}. Rebuilding'.format(
            embedding_matrix.shape, (len(word_to_index), params['dim'])
        ))

    if word_to_index is None:
        raise KeyError('sequence_vectorizer')

    # Loading Glove
    hits = 0
    embedding_dim = params['dim']
    num_tokens = len(word_to_index)  # Includes padding and unknown
    expected_fields = embedding_dim + 1  # The word followed by its vector

    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    with Path(params["glove_raw_path"]).open(encoding="utf-8") as glove_file:
        for line_idx, line in enumerate(glove_file):
            if line_idx % 100000 == 0:
                print('- At line {}'.format(line_idx))
            fields = line.strip().split()
            # Skips malformed lines and words that themselves contain whitespace.
            if len(fields) != expected_fields:
                continue
            word_idx = word_to_index.get(fields[0])
            if word_idx is None:
                continue
            hits += 1
            embedding_matrix[word_idx] = np.asarray(fields[1:], dtype=embedding_matrix.dtype)

    print('- Done. Found {} vectors for {} words'.format(hits, num_tokens - 2))

    params['embedding_matrix'] = embedding_matrix
    Path(params["glove_path"]).resolve().parent.mkdir(exist_ok=True, parents=True)
    np.savez_compressed(params["glove_path"], embeddings=embedding_matrix)

# Load (or build and cache) the embedding matrix into `params['embedding_matrix']`.
creating_glove_embeddings(params)

Glove Embedding Matrix Found


## Building Datasets

In [5]:
# Encode Dataset

def encode_distance(distance, encode_size):
    """
    Encode a signed distance as a two-sided unary ("thermometer") vector.

    The result has `2 * (encode_size // 2)` float entries. The magnitude is
    written as `min(|distance|, encode_size // 2)` ones: for a negative
    distance they fill the end of the first half, otherwise the start of
    the second half. All other entries are zero.

    distance: scalar integer (tensor or Python int)
    encode_size: total amount of bits of the encoding

    return: Tensor with the encoded distance
    """
    half_width = encode_size // 2

    magnitude = tf.abs(distance)
    # Position i is set while i < |distance|; magnitudes beyond the
    # half width saturate to all ones.
    positions = tf.range(half_width, dtype=magnitude.dtype)
    magnitude_bits = tf.cast(positions < magnitude, tf.float32)
    empty_half = tf.zeros((half_width,))

    backward = tf.concat([tf.reverse(magnitude_bits, axis=[0]), empty_half], axis=0)
    forward = tf.concat([empty_half, magnitude_bits], axis=0)

    return tf.where(distance < 0, backward, forward)


def _build_pair_dataframe(params: dict):
    """
    Build the table with one row per candidate (source, target) proposition pair.

    For every split and every document, all ordered pairs of distinct
    propositions whose id difference is at most
    `params['max_proposition_distance']` are listed. Pairs found in the
    split's relation table get the relation type(s) stored there; every
    other pair gets the empty relation type ''.

    return: pandas DataFrame with the columns listed in `columns`
    """
    max_proposition_distance = params['max_proposition_distance']

    columns = [
        'file_key',
        'source_prop_id',
        'target_prop_id',
        'source_prop_text',
        'target_prop_text',
        'source_prop_type',
        'target_prop_type',
        'relation_type',
        'distance',
        'split',
    ]

    # Starts with an empty, labelled frame; the per-file frames are
    # concatenated once at the end instead of once per file.
    frames = [pandas.DataFrame(columns=columns)]

    for split in ['dev', 'test', 'train']:

        source_arg_units = params[f'{split}_source_propositions']
        target_arg_units = params[f'{split}_target_propositions']
        relations = params[f'{split}_relations']

        all_arg_units = pandas.concat([source_arg_units, target_arg_units], ignore_index=True)
        all_arg_units = all_arg_units.drop_duplicates()

        for file_key, file_props in all_arg_units.groupby(by='file_key'):
            file_relations = relations[relations['file_key'] == file_key]
            current_file_info = {column: [] for column in columns}

            # Materialised once; the same rows act as sources and targets.
            prop_rows = [row for _, row in file_props.iterrows()]

            for source_row in prop_rows:
                source_id = source_row['prop_id']
                for target_row in prop_rows:
                    target_id = target_row['prop_id']

                    # Same relations not allowed
                    if source_id == target_id:
                        continue

                    distance = target_id - source_id
                    # Distance is greater than the max allowed distance between propositions
                    if abs(distance) > max_proposition_distance:
                        continue

                    source_target_relation = file_relations[
                        (file_relations['prop_id_target'] == target_id)
                        & (file_relations['prop_id_source'] == source_id)
                    ]

                    if len(source_target_relation) == 0:
                        # No related propositions
                        relation_types = ['']  # No Relation
                        # NOTE(review): unrelated pairs are stored with a mock
                        # distance of 0 while related pairs keep their real,
                        # non-zero distance. The distance is a model input, so
                        # this looks like it leaks the label. Confirm whether
                        # the real distance should be kept here.
                        distance = 0  # Mock Distance
                    else:
                        if len(source_target_relation) > 1:
                            print("WARNING: Multiple relation with single source-target pair")
                            print(source_target_relation)

                        for stored_distance in source_target_relation['distance']:
                            assert stored_distance == distance, f"{stored_distance} != {distance}"

                        relation_types = list(source_target_relation['relation_type'])

                    for relation_type in relation_types:
                        # Adding data
                        current_file_info['file_key'].append(file_key)
                        current_file_info['source_prop_id'].append(source_id)
                        current_file_info['target_prop_id'].append(target_id)
                        current_file_info['source_prop_text'].append(source_row['prop_text'])
                        current_file_info['target_prop_text'].append(target_row['prop_text'])
                        current_file_info['source_prop_type'].append(source_row['prop_type'])
                        current_file_info['target_prop_type'].append(target_row['prop_type'])
                        current_file_info['relation_type'].append(relation_type)
                        current_file_info['distance'].append(distance)
                        current_file_info['split'].append(split)

            frames.append(pandas.DataFrame(current_file_info))

    return pandas.concat(frames, ignore_index=True)


def encode_datasets(params: dict):
    """
    Build one `tf.data.Dataset` per split and store it in `params['{split}_ds']`.

    The pair table is read from `<export_path>/data_df.pkl` when that file
    exists; otherwise it is built from the proposition and relation tables
    in `params` and pickled there. The table is also stored in
    `params['raw_data_dataframe']`.

    Each dataset element is
    `((source_tokens, target_tokens, distance_encoding),
      (relation_one_hot, source_type_one_hot, target_type_one_hot))`.

    NOTE(review): the pickle cache is not keyed by the corpus or by
    'max_proposition_distance'; delete the file after changing either.

    params: configuration dictionary; reads the vectorizers and encoders,
        'max_distance_encoded' and 'export_path' (plus
        'max_proposition_distance' and the split tables when the cache is
        missing).

    return: None, `params` is updated in place
    """
    sequence_vectorizer = params['sequence_vectorizer']
    proposition_tag_vectorizer = params['proposition_tag_vectorizer']
    relation_tag_vectorizer = params['relation_tag_vectorizer']
    proposition_encoder = params['proposition_encoder']
    relation_encoder = params['relation_encoder']
    distance_encoding_bits = params['max_distance_encoded'] * 2

    df_path = Path(params['export_path'], 'data_df.pkl')
    if df_path.exists():
        data_dataframe = pandas.read_pickle(df_path)
    else:
        data_dataframe = _build_pair_dataframe(params)
        df_path.parent.mkdir(parents=True, exist_ok=True)
        data_dataframe.to_pickle(df_path)

    params['raw_data_dataframe'] = data_dataframe

    for split, split_dataframe in data_dataframe.groupby(by="split"):

        def column_dataset(column):
            """Dataset with one string element per row of `column` in this split."""
            return tf.data.Dataset.from_tensor_slices(tf.constant(split_dataframe[column]))

        source_ds = column_dataset('source_prop_text').map(lambda x: sequence_vectorizer(x))
        target_ds = column_dataset('target_prop_text').map(lambda x: sequence_vectorizer(x))
        source_type_ds = column_dataset('source_prop_type').map(lambda x: proposition_encoder(proposition_tag_vectorizer([x])))
        target_type_ds = column_dataset('target_prop_type').map(lambda x: proposition_encoder(proposition_tag_vectorizer([x])))
        relation_type_ds = column_dataset('relation_type').map(lambda x: relation_encoder(relation_tag_vectorizer([x])))
        distance_ds = tf.data.Dataset.from_tensor_slices(
            list(split_dataframe['distance'].to_numpy(dtype=int))
        ).map(lambda x: encode_distance(x, distance_encoding_bits))

        relation_counter = Counter(split_dataframe['relation_type'])
        print(split, relation_counter)

        # Order matters
        input_ds = tf.data.Dataset.zip((source_ds, target_ds, distance_ds))
        output_ds = tf.data.Dataset.zip((relation_type_ds, source_type_ds, target_type_ds))

        ds = tf.data.Dataset.zip((input_ds, output_ds))

        params[f"{split}_ds"] = ds

        
# Build the per-split datasets; they are stored in `params` under '<split>_ds'.
encode_datasets(params)

dev Counter({'': 3906, 'supports': 304, 'supports_Inverse': 304, 'attacks': 22, 'attacks_Inverse': 22})
test Counter({'': 10406, 'supports': 767, 'supports_Inverse': 767, 'attacks_Inverse': 42, 'attacks': 42})
train Counter({'': 33582, 'supports': 2541, 'supports_Inverse': 2541, 'attacks_Inverse': 155, 'attacks': 155})


## Building the model

Two versions of the model can be built. The difference is the presence or absence of an attention layer.

In [6]:
from link_prediction.models.attention import apply_attention

# Build Model

def build_model(params: dict):
    """
    Build the ensemble of link-prediction models and store it in `params`.

    `params['ensemble_amount']` models are created with `create_single_model`
    and saved as a list under `params[params['model_name']]`. Each model
    takes (source tokens, target tokens, encoded distance) and outputs three
    softmax heads: relation, source proposition type and target proposition
    type.

    params: configuration dictionary; reads the hyperparameters unpacked at
        the top of the function, the vectorizers and 'embedding_matrix'.

    return: None, `params` is updated in place
    """
    linear_embedders_dims = params['linear_embedders_dims'] # [50, 50, 50, 300]
    max_sequence_size = params['max_size_prop']
    words_amount = len(params['sequence_vectorizer'].get_vocabulary()) # Vocabulary size of the text vectorizer
    embedding_dim = params['dim']
    embedding_matrix = params['embedding_matrix']
    regularizer_weight = params['regularizer_weight']
    dropout = params['dropout']
    final_embedding_dimension = params['encoder_dense_units']
    final_layer_size = params['final_size']
    pool_size = params['encoder_pool_size']
    distance_encoding_bits = params['max_distance_encoded'] * 2
    lstm_units = params['lstm_units']
    res_size = params['residual_size']
    relation_amount = len(params['relation_tag_vectorizer'].get_vocabulary()) # Vocabulary size of the relation vectorizer
    proposition_tag_amount = len(params['proposition_tag_vectorizer'].get_vocabulary()) # Vocabulary size of the proposition tag vectorizer
    with_attention = params['with_attention']
    ensemble_amount = params['ensemble_amount']
    
    def build_embedder(max_sequence_size, words_amount, embedding_dim, embedding_matrix, linear_layers_dims, regularizer_weight, dropout):
        """
        Builds a proposition embedder

        Token ids are looked up in a frozen embedding layer initialised from
        `embedding_matrix`. A stack of dense layers is applied to every token
        embedding and the result is added to the raw embeddings, so the last
        entry of `linear_layers_dims` has to equal `embedding_dim`.

        return: tuple (input tensor, output tensor)
        """
        
        # Input layer
        int_sequence_input = keras.Input(
            shape=(max_sequence_size,), 
            dtype="int64"
        )

        # Embedding layer, convert an index vector into a embedding vector, by accessing embedding_matrix
        embedding_layer = layers.Embedding(
            words_amount,
            embedding_dim,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable=False,
            input_length=max_sequence_size,
        )

        initial_layer = model_layers = embedding_layer(int_sequence_input)

        def get_linear_layer(dense_dim, linear_layer=None):
            """
            Creates a single dense layer for the embedder

            The block is Dense -> BatchNormalization -> Dropout -> ReLU. When
            `linear_layer` is None a new input of size `embedding_dim` is
            created, otherwise the block is stacked on `linear_layer`.

            return: tuple (input of the block, output of the block)
            """
            
            if linear_layer is None:
                input_vec = keras.Input(shape=(embedding_dim,))
            else:
                input_vec = linear_layer
            
            linear_layer = layers.Dense(
                units=dense_dim,
                activation=None,
                kernel_initializer='he_normal',
                kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                bias_regularizer=keras.regularizers.l2(regularizer_weight)
            )(input_vec)
            
            linear_layer = layers.BatchNormalization()(linear_layer)
            linear_layer = layers.Dropout(dropout)(linear_layer)
            linear_layer = layers.Activation('relu')(linear_layer)
            return input_vec, linear_layer
        
        # Linear transformation: chain one block per entry of linear_layers_dims
        # and wrap the chain in a model working on a single token embedding
        linear_input, linear_layer = get_linear_layer(linear_layers_dims[0])
        for dim in linear_layers_dims[1:]:
            _, linear_layer = get_linear_layer(dim, linear_layer)
        linear_layer = keras.Model(inputs=linear_input, outputs=linear_layer)
        
        # Apply linear_layer to each word embedding
        model_layers = layers.TimeDistributed(linear_layer)(model_layers)
        
        # Residual connection
        model_layers = layers.Add()([initial_layer, model_layers])
        
        return int_sequence_input, model_layers
    
    def build_dense_encoder(max_sequence_size, embedding_dim, final_dimension, pool_size, regularizer_weight):
        """
        Builds the dense encoder applied to the embedded sequences

        A ReLU dense layer of `final_dimension` units is applied to every
        token, followed by average pooling over the sequence with window
        `pool_size` and batch normalization.

        return: keras.Model taking inputs of shape (max_sequence_size, embedding_dim)
        """
        
        # Input layer
        embedding_inputs = keras.Input(
            shape=(max_sequence_size, embedding_dim)
        )
        
        encoder_layer = embedding_inputs
        
        linear_layer = layers.Dense(
            units=final_dimension,
            activation='relu',
            kernel_regularizer=keras.regularizers.l2(regularizer_weight),
            bias_regularizer=keras.regularizers.l2(regularizer_weight)
        )
        
        # Apply linear_layer to each word embedding
        encoder_layer = layers.TimeDistributed(linear_layer)(encoder_layer)
        
        # Average the words embeddings
        encoder_layer = layers.AveragePooling1D(
            pool_size=pool_size,
        )(encoder_layer)
    
        encoder_layer = layers.BatchNormalization()(encoder_layer)
    
        return keras.Model(inputs=embedding_inputs, outputs=encoder_layer)
    
    def build_bilstm_encoder(sequence_size, encoded_dim, lstm_units, dropout, regularizer_weight, return_sequences):
        """
        Builds the bidirectional LSTM encoder

        The forward and backward outputs are merged by multiplication
        (merge_mode='mul'). `return_sequences` is forwarded to the LSTM.

        return: keras.Model taking inputs of shape (sequence_size, encoded_dim)
        """

        # Input layer
        embedding_inputs = keras.Input(
            shape=(sequence_size, encoded_dim)
        )
        
        bilstm_layer = layers.Bidirectional(
            layers.LSTM(
                units=lstm_units,
                dropout=dropout,
                recurrent_dropout=dropout,
                kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                recurrent_regularizer=keras.regularizers.l2(regularizer_weight),
                bias_regularizer=keras.regularizers.l2(regularizer_weight),
                return_sequences=return_sequences,
            ),
            merge_mode='mul'
        )(embedding_inputs)
        
        return keras.Model(inputs=embedding_inputs, outputs=bilstm_layer)
    
    def apply_resnet(input_layer, regularizer_weight, res_size, dropout):
        """
        Applies residual blocks on top of `input_layer`

        With layers_dims = (2, 2) there are two blocks. Each block runs
        BatchNormalization -> Dropout -> ReLU -> Dense(res_size) once
        (res_layers - 1 times), then the same sequence ending in a dense
        layer with the width of `input_layer`, and adds the block input.

        return: output tensor of the last block
        """
        prev_layer = input_layer
        prev_block = prev_layer
        
        layers_dims = (2, 2)
        blocks = layers_dims[0]
        res_layers = layers_dims[1]

        # Width of the input, used so each block output can be added to its input
        shape = int(np.shape(input_layer)[1])

        # i and j are only loop counters
        for i in range(1, blocks + 1):
            for j in range(1, res_layers):
                prev_layer = layers.BatchNormalization()(prev_layer)

                prev_layer = layers.Dropout(dropout)(prev_layer)

                prev_layer = layers.Activation('relu')(prev_layer)

                prev_layer = layers.Dense(
                    units=res_size,
                    activation=None,
                    kernel_initializer='he_normal',
                    kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                    bias_regularizer=keras.regularizers.l2(regularizer_weight),
                )(prev_layer)
            
            prev_layer = layers.BatchNormalization()(prev_layer)

            prev_layer = layers.Dropout(dropout)(prev_layer)

            prev_layer = layers.Activation('relu')(prev_layer)

            prev_layer = layers.Dense(units=shape,
                               activation=None,
                               kernel_initializer='he_normal',
                               kernel_regularizer=keras.regularizers.l2(regularizer_weight),
                               bias_regularizer=keras.regularizers.l2(regularizer_weight),
                               )(prev_layer)

            # Skip connection around the block
            prev_layer = layers.Add()([prev_block, prev_layer])
            prev_block = prev_layer

        return prev_block
    
    def create_single_model(index):
        """
        Create a single model for the ensemble learning

        Source and target get their own embedder, while the dense encoder
        and the BiLSTM encoder are shared between both sides. `index` is
        used to give the output layers and the model unique names.

        return: keras.Model with inputs (source, target, distance) and
            outputs (relation, source type, target type)
        """
        
        input_distance = keras.Input(
            shape=(distance_encoding_bits, )
        )

        input_source_embedder, source_embedder = build_embedder(
            max_sequence_size, 
            words_amount, 
            embedding_dim, 
            embedding_matrix, 
            linear_embedders_dims, 
            regularizer_weight, 
            dropout
        )

        input_target_embedder, target_embedder = build_embedder(
            max_sequence_size, 
            words_amount, 
            embedding_dim, 
            embedding_matrix, 
            linear_embedders_dims, 
            regularizer_weight, 
            dropout
        )

        dense_encoder = build_dense_encoder(
            max_sequence_size, 
            embedding_dim, 
            final_embedding_dimension, 
            pool_size, 
            regularizer_weight
        )

        # NOTE(review): the BiLSTM input length is max_sequence_size, which
        # presumably only matches the dense encoder output when pool_size is 1.
        # The LSTM returns full sequences only when attention is enabled.
        bilstm_encoder = build_bilstm_encoder(
            max_sequence_size, 
            final_embedding_dimension, 
            lstm_units, 
            dropout, 
            regularizer_weight,
            with_attention
        )

        # Apply dense encoder to source and target sequence features
        prev_source_layers = source_layers = dense_encoder(source_embedder)
        prev_target_layers = target_layers = dense_encoder(target_embedder)

        # Apply bilstm encoder to source and target sequence features
        source_layers = bilstm_encoder(source_layers)
        target_layers = bilstm_encoder(target_layers)

        if with_attention:
            # apply_attention comes from link_prediction.models.attention; it
            # receives the raw inputs plus the encoder outputs before and
            # after the BiLSTM and returns the new source and target features
            source_layers, target_layers = apply_attention(
                input_source_embedder, 
                input_target_embedder,
                prev_source_layers,
                prev_target_layers,
                source_layers,
                target_layers,
                final_layer_size,
                index,
            )

        # Concatenate source and target sequence features with other features 
        model_layers = layers.Concatenate()([source_layers, target_layers, input_distance])
        model_layers = layers.BatchNormalization()(model_layers)
        model_layers = layers.Dropout(dropout)(model_layers)

        # Middle dense layer
        model_layers = layers.Dense(
            units=final_layer_size,
            activation='relu',
            kernel_initializer='he_normal',
            kernel_regularizer=keras.regularizers.l2(regularizer_weight),
            bias_regularizer=keras.regularizers.l2(regularizer_weight),
        )(model_layers)

        # Apply a residual network
        model_layers = apply_resnet(
            model_layers,
            regularizer_weight,
            res_size,
            dropout
        )


        model_layers = layers.BatchNormalization()(model_layers)
        model_layers = layers.Dropout(dropout)(model_layers)

        # Classifiers: three softmax heads on the same features
        relation_classifier = layers.Dense(
            units=relation_amount,
            activation='softmax',
            name=f"relation_{index}",
        )(model_layers)

        source_classifier = layers.Dense(
            units=proposition_tag_amount,
            activation='softmax',
            name=f"source_{index}",
        )(model_layers)

        target_classifier = layers.Dense(
            units=proposition_tag_amount,
            activation='softmax',
            name=f"target_{index}",
        )(model_layers)

        # Creating final model
        model = keras.Model(
            inputs=(input_source_embedder, input_target_embedder, input_distance),
            outputs=(relation_classifier, source_classifier, target_classifier),
            name=f"{params['model_name']}_{index}"
        )
    
        # Prints one summary per ensemble member
        model.summary()
        
        return model
    
    models = []
    
    for i in range(ensemble_amount):
        model = create_single_model(i)
        models.append(model)
    
    # The list of models is stored under the model name itself
    params[params['model_name']] = models

# Build the ensemble; the created models are stored in params[params['model_name']]
# and each model's summary is printed as it is built.
build_model(params)

Model: "model_attention_0"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 72)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 72)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 72, 300)      1881300     ['input_4[0][0]']                
                                                                                                  
 embedding (Embedding)          (None, 72, 300)      1881300     ['input_2[0][0]']                
                                                                                  

 att_masked_addition_source_0 (  (None, 72)          0           ['negative_mul[0][0]',           
 Add)                                                             'att_scores_flat_source_0[0][0]'
                                                                 ]                                
                                                                                                  
 att_masked_addition_target_0 (  (None, 72)          0           ['negative_mul[1][0]',           
 Add)                                                             'att_scores_flat_target_0[0][0]'
                                                                 ]                                
                                                                                                  
 att_weights_source_0 (Activati  (None, 72)          0           ['att_masked_addition_source_0[0]
 on)                                                             [0]']                            
          

                                                                                                  
 add_3 (Add)                    (None, 20)           0           ['add_2[0][0]',                  
                                                                  'dense_13[0][0]']               
                                                                                                  
 batch_normalization_14 (BatchN  (None, 20)          80          ['add_3[0][0]']                  
 ormalization)                                                                                    
                                                                                                  
 dropout_13 (Dropout)           (None, 20)           0           ['batch_normalization_14[0][0]'] 
                                                                                                  
 relation_0 (Dense)             (None, 6)            126         ['dropout_13[0][0]']             
          

 masking (Lambda)               (None, 72)           0           ['input_9[0][0]',                
                                                                  'input_11[0][0]']               
                                                                                                  
 att_scores_source_1 (TimeDistr  (None, 72, 1)       21          ['att_activation_source_1[0][0]']
 ibuted)                                                                                          
                                                                                                  
 att_scores_target_1 (TimeDistr  (None, 72, 1)       21          ['att_activation_target_1[0][0]']
 ibuted)                                                                                          
                                                                                                  
 negative_mul (Lambda)          (None, 72)           0           ['masking[0][0]',                
          

                                                                                                  
 batch_normalization_27 (BatchN  (None, 20)          80          ['add_6[0][0]']                  
 ormalization)                                                                                    
                                                                                                  
 dropout_25 (Dropout)           (None, 20)           0           ['batch_normalization_27[0][0]'] 
                                                                                                  
 activation_22 (Activation)     (None, 20)           0           ['dropout_25[0][0]']             
                                                                                                  
 dense_26 (Dense)               (None, 50)           1050        ['activation_22[0][0]']          
                                                                                                  
 batch_nor

 repeat_query_target_2 (RepeatV  (None, 72, 20)      0           ['att_linearity_query_target_2[0]
 ector)                                                          [0]']                            
                                                                                                  
 att_K_target_2 (TimeDistribute  (None, 72, 20)      520         ['model_11[1][0]']               
 d)                                                                                               
                                                                                                  
 att_addition_source_2 (Add)    (None, 72, 20)       0           ['repeat_query_source_2[0][0]',  
                                                                  'att_K_source_2[0][0]']         
                                                                                                  
 att_addition_target_2 (Add)    (None, 72, 20)       0           ['repeat_query_target_2[0][0]',  
          

                                                                                                  
 dropout_37 (Dropout)           (None, 20)           0           ['batch_normalization_40[0][0]'] 
                                                                                                  
 activation_32 (Activation)     (None, 20)           0           ['dropout_37[0][0]']             
                                                                                                  
 dense_38 (Dense)               (None, 50)           1050        ['activation_32[0][0]']          
                                                                                                  
 batch_normalization_41 (BatchN  (None, 50)          200         ['dense_38[0][0]']               
 ormalization)                                                                                    
                                                                                                  
 dropout_3

 avg_query_target_3 (GlobalAver  (None, 25)          0           ['model_15[1][0]']               
 agePooling1D)                                                                                    
                                                                                                  
 avg_query_source_3 (GlobalAver  (None, 25)          0           ['model_15[0][0]']               
 agePooling1D)                                                                                    
                                                                                                  
 att_linearity_query_source_3 (  (None, 20)          520         ['avg_query_target_3[0][0]']     
 Dense)                                                                                           
                                                                                                  
 att_linearity_query_target_3 (  (None, 20)          520         ['avg_query_source_3[0][0]']     
 Dense)   

                                                                                                  
 att_cv_target_3 (Lambda)       (None, 50)           0           ['att_multiply_target_3[0][0]']  
                                                                                                  
 input_22 (InputLayer)          [(None, 10)]         0           []                               
                                                                                                  
 concatenate_3 (Concatenate)    (None, 110)          0           ['att_cv_source_3[0][0]',        
                                                                  'att_cv_target_3[0][0]',        
                                                                  'input_22[0][0]']               
                                                                                                  
 batch_normalization_54 (BatchN  (None, 110)         440         ['concatenate_3[0][0]']          
 ormalizat

 time_distributed_13 (TimeDistr  (None, 72, 300)     37250       ['embedding_9[0][0]']            
 ibuted)                                                                                          
                                                                                                  
 time_distributed_12 (TimeDistr  (None, 72, 300)     37250       ['embedding_8[0][0]']            
 ibuted)                                                                                          
                                                                                                  
 add_17 (Add)                   (None, 72, 300)      0           ['embedding_9[0][0]',            
                                                                  'time_distributed_13[0][0]']    
                                                                                                  
 add_16 (Add)                   (None, 72, 300)      0           ['embedding_8[0][0]',            
          

 att_weights_target_4 (Activati  (None, 72)          0           ['att_masked_addition_target_4[0]
 on)                                                             [0]']                            
                                                                                                  
 att_weights_reshape_source_4 (  (None, 72, 1)       0           ['att_weights_source_4[0][0]']   
 Reshape)                                                                                         
                                                                                                  
 att_weights_reshape_target_4 (  (None, 72, 1)       0           ['att_weights_target_4[0][0]']   
 Reshape)                                                                                         
                                                                                                  
 att_multiply_source_4 (Multipl  (None, 72, 50)      0           ['att_weights_reshape_source_4[0]
 y)       

 source_4 (Dense)               (None, 4)            84          ['dropout_69[0][0]']             
                                                                                                  
 target_4 (Dense)               (None, 4)            84          ['dropout_69[0][0]']             
                                                                                                  
Total params: 3,877,406
Trainable params: 112,366
Non-trainable params: 3,765,040
__________________________________________________________________________________________________
Model: "model_attention_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_37 (InputLayer)          [(None, 72)]         0           []                               
                                                                                                  


                                                                                                  
 att_scores_flat_source_5 (Flat  (None, 72)          0           ['att_scores_source_5[0][0]']    
 ten)                                                                                             
                                                                                                  
 att_scores_flat_target_5 (Flat  (None, 72)          0           ['att_scores_target_5[0][0]']    
 ten)                                                                                             
                                                                                                  
 att_masked_addition_source_5 (  (None, 72)          0           ['negative_mul[0][0]',           
 Add)                                                             'att_scores_flat_source_5[0][0]'
                                                                 ]                                
          

 ormalization)                                                                                    
                                                                                                  
 dropout_82 (Dropout)           (None, 50)           0           ['batch_normalization_88[0][0]'] 
                                                                                                  
 activation_71 (Activation)     (None, 50)           0           ['dropout_82[0][0]']             
                                                                                                  
 dense_83 (Dense)               (None, 20)           1020        ['activation_71[0][0]']          
                                                                                                  
 add_23 (Add)                   (None, 20)           0           ['add_22[0][0]',                 
                                                                  'dense_83[0][0]']               
          

                                                                                                  
 att_activation_source_6 (Activ  (None, 72, 20)      0           ['att_addition_source_6[0][0]']  
 ation)                                                                                           
                                                                                                  
 att_activation_target_6 (Activ  (None, 72, 20)      0           ['att_addition_target_6[0][0]']  
 ation)                                                                                           
                                                                                                  
 masking (Lambda)               (None, 72)           0           ['input_44[0][0]',               
                                                                  'input_46[0][0]']               
                                                                                                  
 att_score

                                                                                                  
 activation_81 (Activation)     (None, 50)           0           ['dropout_94[0][0]']             
                                                                                                  
 dense_95 (Dense)               (None, 20)           1020        ['activation_81[0][0]']          
                                                                                                  
 add_26 (Add)                   (None, 20)           0           ['dense_93[0][0]',               
                                                                  'dense_95[0][0]']               
                                                                                                  
 batch_normalization_102 (Batch  (None, 20)          80          ['add_26[0][0]']                 
 Normalization)                                                                                   
          

                                                                                                  
 repeat_query_source_7 (RepeatV  (None, 72, 20)      0           ['att_linearity_query_source_7[0]
 ector)                                                          [0]']                            
                                                                                                  
 att_K_source_7 (TimeDistribute  (None, 72, 20)      520         ['model_31[0][0]']               
 d)                                                                                               
                                                                                                  
 repeat_query_target_7 (RepeatV  (None, 72, 20)      0           ['att_linearity_query_target_7[0]
 ector)                                                          [0]']                            
                                                                                                  
 att_K_tar

                                                                                                  
 dropout_106 (Dropout)          (None, 110)          0           ['batch_normalization_114[0][0]']
                                                                                                  
 dense_107 (Dense)              (None, 20)           2220        ['dropout_106[0][0]']            
                                                                                                  
 batch_normalization_115 (Batch  (None, 20)          80          ['dense_107[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 dropout_107 (Dropout)          (None, 20)           0           ['batch_normalization_115[0][0]']
                                                                                                  
 activatio

                                                                                                  
 model_34 (Functional)          (None, 72, 50)       15250       ['add_32[0][0]',                 
                                                                  'add_33[0][0]']                 
                                                                                                  
 model_35 (Functional)          (None, 72, 25)       15200       ['model_34[0][0]',               
                                                                  'model_34[1][0]']               
                                                                                                  
 avg_query_target_8 (GlobalAver  (None, 25)          0           ['model_35[1][0]']               
 agePooling1D)                                                                                    
                                                                                                  
 avg_query

                                                                  'model_34[0][0]']               
                                                                                                  
 att_multiply_target_8 (Multipl  (None, 72, 50)      0           ['att_weights_reshape_target_8[0]
 y)                                                              [0]',                            
                                                                  'model_34[1][0]']               
                                                                                                  
 att_cv_source_8 (Lambda)       (None, 50)           0           ['att_multiply_source_8[0][0]']  
                                                                                                  
 att_cv_target_8 (Lambda)       (None, 50)           0           ['att_multiply_target_8[0][0]']  
                                                                                                  
 input_57 

                                                                                                  
 input_67 (InputLayer)          [(None, 72)]         0           []                               
                                                                                                  
 embedding_19 (Embedding)       (None, 72, 300)      1881300     ['input_67[0][0]']               
                                                                                                  
 embedding_18 (Embedding)       (None, 72, 300)      1881300     ['input_65[0][0]']               
                                                                                                  
 time_distributed_28 (TimeDistr  (None, 72, 300)     37250       ['embedding_19[0][0]']           
 ibuted)                                                                                          
                                                                                                  
 time_dist

 att_masked_addition_target_9 (  (None, 72)          0           ['negative_mul[1][0]',           
 Add)                                                             'att_scores_flat_target_9[0][0]'
                                                                 ]                                
                                                                                                  
 att_weights_source_9 (Activati  (None, 72)          0           ['att_masked_addition_source_9[0]
 on)                                                             [0]']                            
                                                                                                  
 att_weights_target_9 (Activati  (None, 72)          0           ['att_masked_addition_target_9[0]
 on)                                                             [0]']                            
                                                                                                  
 att_weigh

 batch_normalization_149 (Batch  (None, 20)          80          ['add_39[0][0]']                 
 Normalization)                                                                                   
                                                                                                  
 dropout_139 (Dropout)          (None, 20)           0           ['batch_normalization_149[0][0]']
                                                                                                  
 relation_9 (Dense)             (None, 6)            126         ['dropout_139[0][0]']            
                                                                                                  
 source_9 (Dense)               (None, 4)            84          ['dropout_139[0][0]']            
                                                                                                  
 target_9 (Dense)               (None, 4)            84          ['dropout_139[0][0]']            
          

In [7]:
from link_prediction.models.link_utils import create_lr_annealing_function

def train_and_save_model(params: dict):
    """Compile, train and persist the models of the ensemble.

    Everything is read from ``params``: the model name and the list of models
    stored under it, the train/dev datasets, the batch size, the optimizer,
    learning-rate and early-stopping settings, the loss weights, the tag
    vectorizers and the output paths.

    For every trained model this function:

    * saves the model under ``params["model_path"]``,
    * stores its training history in ``params[f"history_{index}"]``,
    * dumps that history as JSON under ``params["export_path"]``.

    When ``params["in_production"]`` is falsy a reduced run is performed:
    2 epochs, 30 training batches, 10 validation batches and only the first
    two models of the ensemble.
    """
    model_name = params['model_name']
    # Must be read before the datasets are batched right below. It used to be
    # assigned after its first use, which raised UnboundLocalError on every call.
    batch_size = params['batch_size']
    if params['in_production']:
        epochs = params['epochs']
        train_ds = params['train_ds'].batch(batch_size)
        val_ds = params['dev_ds'].batch(batch_size)
        models = params[model_name]
    else:
        # Reduced run: few epochs, few batches, first two models only.
        epochs = 2
        train_ds = params['train_ds'].batch(batch_size).take(30)
        val_ds = params['dev_ds'].batch(batch_size).take(10)
        models = params[model_name][:2]
    loss_weights = params['loss_weights']
    lr_alpha = params['lr_alpha']
    lr_kappa = params['lr_kappa']
    relation_amount = len(params['relation_tag_vectorizer'].get_vocabulary())
    proposition_amount = len(params['proposition_tag_vectorizer'].get_vocabulary())
    global_metrics = params['metrics']
    beta_1 = params['beta_1']
    beta_2 = params['beta_2']
    min_delta = params['min_delta']
    patience = params['patience']

    def single_train(index, model):
        """Compile, fit and save the ``index``-th model of the ensemble."""
        # Optimizer: Adam starts at the schedule's value for epoch 0; the
        # scheduler callback then updates the learning rate during training.
        lr_function = create_lr_annealing_function(initial_lr=lr_alpha, k=lr_kappa)
        lr_scheduler = keras.callbacks.LearningRateScheduler(lr_function)
        optimizer = tf.optimizers.Adam(
            learning_rate=lr_function(0),
            beta_1=beta_1,
            beta_2=beta_2,
        )

        # EarlyStopping
        early_stopping = keras.callbacks.EarlyStopping(
            min_delta=min_delta,
            patience=patience,
            verbose=1,
        )

        # Metrics: every output head gets its own copy of the global metrics
        # plus a macro-averaged F1 score. The keys are the per-model output
        # names (`relation_{index}`, `source_{index}`, `target_{index}`).
        head_sizes = {
            f'relation_{index}': relation_amount,
            f'source_{index}': proposition_amount,
            f'target_{index}': proposition_amount,
        }
        metrics = {}
        for name, num_classes in head_sizes.items():
            head_metrics = global_metrics.copy()
            head_metrics.append(
                tfa.metrics.F1Score(
                    num_classes=num_classes,
                    average='macro',
                    name=f'{name}F1Macro',
                )
            )
            metrics[name] = head_metrics

        # Loss weights are configured per head name; suffix them with the
        # model index so they match this model's output names.
        current_loss_weights = {f'{name}_{index}': value for name, value in loss_weights.items()}

        model.compile(
            loss='categorical_crossentropy', # Apply this loss function to all outputs
            loss_weights=current_loss_weights, # Weights for the sum of the loss functions
            optimizer=optimizer,
            metrics=metrics
        )

        # Train. The datasets are already batched above, so `batch_size` is
        # not passed to `fit` (Keras documents that it must not be specified
        # for dataset inputs).
        fit_result = model.fit(
            train_ds,
            epochs=epochs,
            validation_data=val_ds,
            callbacks=[
                lr_scheduler,
                early_stopping,
            ],
        )

        model.save(str(Path(params["model_path"], f"{model_name}_{index}")), save_format='tf')

        # Convert the recorded values to plain Python lists so the history
        # can be serialized as JSON.
        history = {
            key: np.array(values).tolist()
            for key, values in fit_result.history.items()
        }
        params[f'history_{index}'] = history
        with Path(params['export_path'], f"{model_name}_{index}_history.json").open('w') as f:
            json.dump(history, f)

    for i, model in enumerate(models):
        single_train(i, model)

# Train the ensemble; saves each model and writes its history JSON to disk.
train_and_save_model(params)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3397, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-adb8381be1e7>", line 1, in <cell line: 1>
    from link_prediction.models.link_utils import create_lr_annealing_function
  File "/tf/notebooks/link_prediction/models/link_prediction.py", line 542, in <module>
    from link_prediction.models.attention import apply_attention
ModuleNotFoundError: No module named 'link_prediction.models'; 'link_prediction' is not a package

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 1992, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 1118, in structured_traceback
    return FormattedTB.structured_t

In [None]:
# Load Model
def load_saved_model(params: dict):
    """Load the saved ensemble models and their training histories.

    For each index in ``range(params['ensemble_amount'])`` the model
    directory ``<model_path>/<model_name>_<i>`` is looked up. When it exists,
    the matching ``<export_path>/<model_name>_<i>_history.json`` is read into
    ``params[f'history_{i}']`` and the model is loaded; otherwise a message is
    printed and that index is skipped.

    The loaded models are stored in ``params[model_name]`` (the list may be
    shorter than ``ensemble_amount`` when some models are missing).
    """
    ensemble_amount = params['ensemble_amount']

    model_name = params["model_name"]
    models = []
    for i in range(ensemble_amount):
        model_path = Path(params["model_path"], f"{model_name}_{i}")
        if not model_path.exists():
            print(f"Model in {model_path} doesn't exist")
            continue

        history_path = Path(params['export_path'], f"{model_name}_{i}_history.json")
        # Use a context manager so the history file handle is always closed.
        with history_path.open() as history_file:
            params[f'history_{i}'] = json.load(history_file)
        models.append(keras.models.load_model(str(model_path)))
    params[model_name] = models

    
# Reload the saved models and histories from disk into params.
load_saved_model(params)

In [None]:
# Evaluate model
def evaluate_model(params: dict):
    """Evaluate every model of the ensemble on the test dataset.

    Reads the models from ``params[params['model_name']]`` and batches
    ``params['test_ds']`` with ``params['batch_size']``. For each model its
    index is printed and ``model.evaluate`` is called on the batched dataset.
    Nothing is returned.
    """
    models = params[params['model_name']]
    test_ds = params['test_ds'].batch(params['batch_size'])

    for i, model in enumerate(models):
        print("Model", i)
        # The dataset is already batched, so `batch_size` is not passed to
        # `evaluate` (Keras documents that it must not be specified for
        # dataset inputs).
        model.evaluate(test_ds)


# Evaluate each model currently stored in params on the test dataset.
evaluate_model(params)

In [None]:
def plot_history(params: dict):
    """Plot every entry of the training history stored in ``params['history_0']``.

    One-dimensional entries are drawn as a single curve over the entry index
    (x axis labelled "Epoch"). Any other entry is treated as categorical: it
    is transposed and drawn twice, first as a bar chart of the last value of
    each category, then as one curve per category. Category names come from
    the relation tag vocabulary when the history key contains ``'relation'``
    and from the proposition tag vocabulary otherwise; empty vocabulary
    entries are shown as "None".
    """
    first_history = params['history_0']

    def _display_names(vectorizer_key):
        # Empty vocabulary entries are displayed as the string "None".
        return [tag or "None" for tag in params[vectorizer_key].get_vocabulary()]

    relation_labels = _display_names('relation_tag_vectorizer')
    proposition_labels = _display_names('proposition_tag_vectorizer')

    def _finish_figure(title, x_label="Epoch", y_label=""):
        # Decorate the current figure and display it.
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.legend()
        plt.show()

    def _draw_curve(series, label):
        # Plot the values against their positions 0..len-1.
        positions = list(range(len(series)))
        plt.plot(positions, series, label=label)

    def _draw_per_category(matrix, category_names, bar_plot=False):
        # After transposing, each row holds the values of one category.
        per_category = np.array(matrix).T
        plt.xticks(rotation = 45)
        if not bar_plot:
            for category_name, series in zip(category_names, per_category):
                _draw_curve(series, category_name)
            return
        # Bar chart of the last recorded value of every category.
        plt.bar(category_names, [series[-1] for series in per_category])

    for metric_name, raw_values in first_history.items():
        metric_values = np.array(raw_values)

        if metric_values.ndim == 1: # List values
            _draw_curve(metric_values, metric_name)
            _finish_figure(metric_name, y_label="values")
            continue

        # Categorical values
        if 'relation' in metric_name:
            category_names = relation_labels
        else:
            category_names = proposition_labels
        _draw_per_category(metric_values, category_names, bar_plot=True)
        _finish_figure(metric_name + " bar", x_label='categories', y_label="values")
        _draw_per_category(metric_values, category_names, bar_plot=False)
        _finish_figure(metric_name, y_label="values")
        
# Plot the training history recorded for the first model (params['history_0']).
plot_history(params)

In [None]:
class LinkPredictionModel(keras.Model):
    """Ensemble wrapper that maps raw proposition pairs to voted tags.

    ``call`` vectorizes the source/target texts, encodes the distances, feeds
    the resulting batches to every model of the ensemble and lets the models
    vote on the relation tag and on the source/target proposition tags.
    """

    def __init__(self,
                 models,
                 sequence_vectorizer, 
                 proposition_tag_vectorizer, 
                 relation_tag_vectorizer, 
                 distance_encoding_bits,
                 batch_size=32
                ):
        """Store the ensemble and the preprocessing helpers.

        Args:
            models: iterable of models; each one is called on a batch and its
                output is unpacked as a ``(relation, source, target)`` triple.
            sequence_vectorizer: callable applied to every source and target
                text before it reaches the models.
            proposition_tag_vectorizer: object whose ``get_vocabulary()``
                lists the proposition tags.
            relation_tag_vectorizer: object whose ``get_vocabulary()`` lists
                the relation tags.
            distance_encoding_bits: second argument forwarded to
                ``encode_distance`` for every distance.
            batch_size: number of samples per batch fed to the models.
        """
        super().__init__()
        self.models = models
        self.sequence_vectorizer = sequence_vectorizer
        self.proposition_tag_vectorizer = proposition_tag_vectorizer
        self.relation_tag_vectorizer = relation_tag_vectorizer
        self.distance_encoding_bits = distance_encoding_bits
        self.batch_size = batch_size
    
    def decode_outputs(self, outputs):
        """Turn the raw outputs of all models into one voted tag triple per sample.

        Args:
            outputs: one entry per model, each a ``(relation, source, target)``
                triple whose elements are iterated sample by sample.

        Returns:
            A list with one ``(relation_tag, source_tag, target_tag)`` tuple
            per sample, where each tag is the vocabulary entry that received
            the most votes across the models.
        """
        propositions = self.proposition_tag_vectorizer.get_vocabulary()
        relations = self.relation_tag_vectorizer.get_vocabulary()
        
        # Regroup the outputs: one list per model holding one
        # (relation, source, target) triple per sample.
        per_model = [
            list(zip(relation_outputs, source_outputs, target_outputs))
            for relation_outputs, source_outputs, target_outputs in outputs
        ]
        
        # Row i of an identity matrix is the one-hot vote for class i.
        relation_eye = tf.eye(len(relations))
        proposition_eye = tf.eye(len(propositions))
        
        final_result = []
        # zip(*per_model) yields, for every sample, the triples of all models.
        for sample_votes in zip(*per_model):

            # Reset voting vectors
            relation_votes = tf.zeros(shape=(len(relations),))
            source_votes = tf.zeros(shape=(len(propositions),))
            target_votes = tf.zeros(shape=(len(propositions),))
            
            for relation_output, source_output, target_output in sample_votes:
                # Add the vote for each class
                relation_votes = tf.add(relation_votes, relation_eye[tf.argmax(relation_output)])
                source_votes = tf.add(source_votes, proposition_eye[tf.argmax(source_output)])
                target_votes = tf.add(target_votes, proposition_eye[tf.argmax(target_output)])
            
            # Get the most voted class
            relation_tag = relations[tf.argmax(relation_votes)]
            source_tag = propositions[tf.argmax(source_votes)]
            target_tag = propositions[tf.argmax(target_votes)]
            
            final_result.append((relation_tag, source_tag, target_tag))
            
        return final_result
    
    def call(self, source_inputs, target_inputs, distance_inputs):
        """Predict the tags for parallel sequences of sources, targets and distances.

        Args:
            source_inputs: source proposition texts.
            target_inputs: target proposition texts, aligned with the sources.
            distance_inputs: distances, aligned with the sources.

        Returns:
            The list produced by ``decode_outputs``: one
            ``(relation_tag, source_tag, target_tag)`` tuple per input row.
        """
        source_ds = tf.data.Dataset.from_tensor_slices(tf.constant(source_inputs)).map(lambda x: self.sequence_vectorizer(x))
        target_ds = tf.data.Dataset.from_tensor_slices(tf.constant(target_inputs)).map(lambda x: self.sequence_vectorizer(x))
        distance_ds = tf.data.Dataset.from_tensor_slices(tf.constant(distance_inputs)).map(lambda x: encode_distance(x, self.distance_encoding_bits))
    
        inputs_ds = tf.data.Dataset.zip((source_ds, target_ds, distance_ds)).batch(self.batch_size)
        batches = list(inputs_ds)
        
        # Nothing to predict: avoid calling the models on an empty batch list.
        if not batches:
            return []
        
        outputs = []
        
        for model in self.models:
            # Run the model batch by batch instead of handing it the list of
            # all batches at once, which only had a valid structure when the
            # inputs fit into a single batch. Each batch is still wrapped in a
            # one-element list so the models receive the same input structure
            # as in that single-batch case.
            batch_outputs = [model([batch]) for batch in batches]
            # Stitch the per-batch outputs back together, head by head.
            outputs.append(tuple(tf.concat(parts, axis=0) for parts in zip(*batch_outputs)))
        
        return self.decode_outputs(outputs)

def build_link_prediction_model(params: dict):
    """Wrap the ensemble in a ``LinkPredictionModel`` and store it in ``params``.

    The wrapper is built from the models stored under
    ``params[params['model_name']]`` and the vectorizers found in ``params``,
    using twice ``params['max_distance_encoded']`` as the number of distance
    encoding bits. It is then called once on a hard-coded sample pair, the
    result is printed, and the wrapper is saved under
    ``params[params['model_name'] + "_final"]``.
    """
    ensemble_key = params['model_name']

    model = LinkPredictionModel(
        models=params[ensemble_key],
        sequence_vectorizer=params['sequence_vectorizer'],
        proposition_tag_vectorizer=params['proposition_tag_vectorizer'],
        relation_tag_vectorizer=params['relation_tag_vectorizer'],
        distance_encoding_bits=params['max_distance_encoded'] * 2,
    )

    # Single hard-coded sample used to try the freshly built wrapper.
    sample_source = "muchos años , la gente tenía que pagar una gran cantidad de dinero prar enviar sus cartas , y sus pagos estaban relacionados con el peso de sus cartas o cajas , y muchos accidentes pueden causar el problema de que el correo no se pueda entregar"
    sample_target = "electrónico puede contarse como uno de los resultados más beneficiosos de la tecnología moderna"
    sample_distance = -1

    print(model([sample_source], [sample_target], [sample_distance]))

    params[ensemble_key + "_final"] = model
    
# Build the ensemble wrapper and store it under params[params['model_name'] + "_final"].
build_link_prediction_model(params)

In [None]:
def compute_statistic(params: dict):
    """Run the final model on the test split and collect truth vs. inference.

    The rows of ``params['raw_data_dataframe']`` whose ``split`` is ``'test'``
    are fed in batches to the model stored under
    ``params[params['model_name'] + "_final"]``. Texts, true tags, inferred
    tags and distances are gathered into a DataFrame, which is described on
    stdout and stored in ``params['statistic']``.

    When ``params['in_production']`` is falsy only the first 10 batches are
    processed.
    """
    data_dataframe = params['raw_data_dataframe']
    data_dataframe = data_dataframe[data_dataframe['split'] == 'test']
    
    model = params[params['model_name'] + "_final"]
    
    statistic = {
        'source_prop_text': [],
        'target_prop_text': [],
        'source_prop_type': [],
        'target_prop_type': [],
        'relation_type': [],
        'infered_source_prop_type': [],
        'infered_target_prop_type': [],
        'infered_relation_type': [], 
        'distance': [],
    }
    
    source_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['source_prop_text'])
    target_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['target_prop_text'])
    distance_ds = tf.data.Dataset.from_tensor_slices(list(data_dataframe['distance'].to_numpy(dtype=int)))
    source_tag_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['source_prop_type'])
    target_tag_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['target_prop_type'])
    relation_tag_ds = tf.data.Dataset.from_tensor_slices(data_dataframe['relation_type'])
    
    batches = tf.data.Dataset.zip(
        (source_ds, target_ds, distance_ds, source_tag_ds, target_tag_ds, relation_tag_ds)
    ).batch(32)
    
    def decode_strings(batch):
        # String elements come back from `.numpy()` as bytes.
        return [item.numpy().decode() for item in batch]
    
    for batch_num, (sources, targets, distances, source_tags, target_tags, relation_tags) in enumerate(batches, start=1):
        print("batch:", batch_num)
        
        inference = model(sources, targets, distances)
        
        statistic['source_prop_text'].extend(decode_strings(sources))
        statistic['target_prop_text'].extend(decode_strings(targets))
        statistic['source_prop_type'].extend(decode_strings(source_tags))
        statistic['target_prop_type'].extend(decode_strings(target_tags))
        statistic['relation_type'].extend(decode_strings(relation_tags))
        # Store plain Python ints, like every other column stores plain
        # Python values, instead of one scalar tensor per row.
        statistic['distance'].extend(distances.numpy().tolist())
        
        for relation_tag, source_tag, target_tag in inference:
            statistic['infered_source_prop_type'].append(source_tag)
            statistic['infered_target_prop_type'].append(target_tag)
            statistic['infered_relation_type'].append(relation_tag)
        
        # Outside production only a sample of 10 batches is evaluated.
        if not params['in_production'] and batch_num >= 10:
            break
    
    statistic = pandas.DataFrame(statistic)
    print(statistic.describe())
    params['statistic'] = statistic
    
# Run the final model on the test split and store the results in params['statistic'].
compute_statistic(params)

## Show statistic

- [ ] Calculate consistency (if a support or Inverse_support relation is present, its inverse should be present as well)

In [None]:

def show_statistic(params: dict):
    """Print accuracies and label counts, then plot the confusion matrices.

    Reads ``params['statistic']``, the table with the columns
    ``source_prop_text``, ``target_prop_text``, ``source_prop_type``,
    ``target_prop_type``, ``relation_type``, ``infered_source_prop_type``,
    ``infered_target_prop_type``, ``infered_relation_type`` and ``distance``.
    Only the six ``*_type`` columns are used here.

    For relation, source and target the accuracy is the share of rows whose
    inferred tag equals the true tag. The true tags are also counted, and one
    row-normalized confusion matrix is shown for each of the three.
    """
    statistic = params['statistic']

    def matching_rows(true_column, inferred_column):
        # Rows where the inferred tag equals the annotated one.
        return statistic[statistic[true_column] == statistic[inferred_column]]

    relation_hits = matching_rows('relation_type', 'infered_relation_type')
    source_hits = matching_rows('source_prop_type', 'infered_source_prop_type')
    target_hits = matching_rows('target_prop_type', 'infered_target_prop_type')

    print("Accuracy:")
    accuracy_report = (
        ("Relation Accuracy:", relation_hits),
        ("Source Accuracy:", source_hits),
        ("Target Accuracy:", target_hits),
    )
    for caption, hits in accuracy_report:
        print(caption, len(hits) / len(statistic))

    # All three counters are built before any of them is printed.
    counter_report = [
        ("Source Counter:", Counter(statistic['source_prop_type'])),
        ("Target Counter:", Counter(statistic['target_prop_type'])),
        ("Relation Counter:", Counter(statistic['relation_type'])),
    ]
    for caption, counter in counter_report:
        print(caption, counter)

    confusion_plots = (
        ('relation_type', 'infered_relation_type', "Relation", 45),
        ('source_prop_type', 'infered_source_prop_type', "Source", 0),
        ('target_prop_type', 'infered_target_prop_type', "Target", 0),
    )
    for true_column, inferred_column, title, rotation in confusion_plots:
        ConfusionMatrixDisplay.from_predictions(statistic[true_column], statistic[inferred_column], normalize="true")
        plt.xticks(rotation = rotation)
        plt.title(title)
        plt.show()
    
# Print the accuracies and label counts and plot the confusion matrices for params['statistic'].
show_statistic(params)

In [None]:
def use_model(params: dict):
    """Unfinished stub; presumably meant to run the model on new data (TODO confirm).

    NOTE(review): the only statement looks up the empty key ``''`` in
    ``params``, so calling this raises ``KeyError`` unless such a key exists.
    The value read is not used.
    """
    # TODO: pick the real params key for the data path and implement the rest.
    data_path = params['']

## Export jupyter as module

In [8]:
if __name__ == "__main__":
    # Export the notebook as a module only when this code runs from the
    # notebook itself. Both "notebook" situations end up in the
    # `except NameError` branch below.
    try:
        # Referencing an undefined `__file__` raises NameError on its own.
        # If `__file__` is defined and points at the .ipynb file, NameError is
        # raised explicitly so it is handled the same way.
        if Path(__file__).suffix == ".ipynb":
            raise NameError()
    except NameError:
        # In Jupyter Notebook
        from utils.notebook_utils import export_notebook_as_module
        export_notebook_as_module(Path("link_prediction.ipynb"))