In [4]:
import sys
from contextlib import redirect_stdout
!pip install tensorflow
from tensorflow.keras import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, concatenate, Lambda
from tensorflow.keras.backend import clear_session

from os import listdir
from xml.dom.minidom import parse
from nltk.tokenize import word_tokenize

import string
import re
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences




# Dataset

In [5]:
class Dataset:
    """
    Reads all XML files in a given directory and generates a list of sentences.
    Each sentence is a list of tuples (word, start, end, tag)
    """
    def __init__(self, datadir):
        self.data = {}
        # Process each file in the directory
        for f in listdir(datadir):

            # Parse the XML file, get a DOM tree
            tree = parse(datadir + "/" + f)

            # Process each sentence in the file
            sentences = tree.getElementsByTagName("sentence")
            for s in sentences:
                sid = s.attributes["id"].value  # Get the sentence ID
                stext = s.attributes["text"].value  # Get the sentence text
                entities = s.getElementsByTagName("entity")

                spans = []
                for e in entities:
                    # For discontinuous entities, we only get the first span.
                    # Obviously, the model will not work for this type of entities. Their encoding is complex.
                    # But it doesn't matter much, there are very few.
                    (start, end) = e.attributes["charOffset"].value.split(";")[0].split("-")
                    typ = e.attributes["type"].value
                    spans.append((int(start), int(end), typ))

                # Convert the sentence into a sequence of tokens
                tokens = self.__tokenize(stext)

                # Add the Gold Standard tag
                self.data[sid] = []
                for i in range(0, len(tokens)):
                    # Check if the token is part of the tag
                    tokens[i]['tag'] = self.__get_tag(tokens[i], spans)
                    self.data[sid].append(tokens[i])

    ## --------- Tokenize the sentence -----------
    ## Tokenizes the sentence
    def __tokenize(self, txt):
        offset = 0
        tks = []
        ## word_tokenize separates the words
        for t in word_tokenize(txt):
            ## Manage the position (using the offset) where each token should appear
            offset = txt.find(t, offset)
            tks.append({'lc_form': t.lower(), 'form': t, 'start': offset, 'end': offset + len(t) - 1})
            offset += len(t)

        ## List of tuples
        return tks

    ## --------- Get the Tag -----------
    def __get_tag(self, token, spans):
        for (spanS, spanE, spanT) in spans:
            if token['start'] == spanS and token['end'] <= spanE:
                return "B-" + spanT
            elif token['start'] >= spanS and token['end'] <= spanE:
                return "I-" + spanT
        return "O"

    ## Iterator over the sentences in the dataset
    def sentences(self):
        for sid in self.data:
            yield self.data[sid]

    ## Iterator over the sentence IDs
    def sentence_ids(self):
        for sid in self.data:
            yield sid

    ## Get a sentence by its ID
    def get_sentence(self, sid):
        return self.data[sid]

    ## Get sentences as lists
    def tokens(self):
        for sid in self.data:
            s = []
            for w in self.data[sid]:
                s.append((sid, w['form'], w['start'], w['end']))
            yield s

# Codemaps

In [6]:
class Codemaps:

    def __init__(self, data, maxlen=None, suflen=None):
        """
        Default constructor
        :param data:
        :param maxlen:
        :param suflen:
        """
        if isinstance(data, Dataset) and maxlen is not None and suflen is not None:
            self.__create_indexs(data, maxlen, suflen)

        elif type(data) == str and maxlen is None and suflen is None:
            self.__load(data)

        else:
            print('codemaps: Invalid or missing parameters in constructor')
            exit()

    def __create_indexs(self, data, maxlen, suflen):
        """
        Create indices from the training corpus
        Extracts all words and labels in the given sentences and creates indices to encode them as numbers
        :param data:
        :param maxlen:
        :param suflen:
        :return:
        """

        self.maxlen = maxlen
        self.suflen = suflen
        words = set([])
        lc_words = set([])
        sufs = set([])
        labels = set([])

        for s in data.sentences():
            for t in s:
                words.add(t['form'])
                sufs.add(t['lc_form'][-self.suflen:])
                labels.add(t['tag'])

        self.word_index = {w: i + 2 for i, w in enumerate(list(words))}
        self.word_index['PAD'] = 0  # Padding
        self.word_index['UNK'] = 1  # Unknown words

        self.suf_index = {s: i + 2 for i, s in enumerate(list(sufs))}
        self.suf_index['PAD'] = 0  # Padding
        self.suf_index['UNK'] = 1  # Unknown suffixes

        self.label_index = {t: i + 1 for i, t in enumerate(list(labels))}
        self.label_index['PAD'] = 0  # Padding

    def __load(self, name):
        """
        Load the indices
        :param name:
        :return:
        """
        self.maxlen = 0
        self.suflen = 0
        self.word_index = {}
        self.suf_index = {}
        self.label_index = {}

        with open(name + ".idx") as f:
            for line in f.readlines():
                (t, k, i) = line.split()
                if t == 'MAXLEN':
                    self.maxlen = int(k)
                elif t == 'SUFLEN':
                    self.suflen = int(k)
                elif t == 'WORD':
                    self.word_index[k] = int(i)
                elif t == 'SUF':
                    self.suf_index[k] = int(i)
                elif t == 'LABEL':
                    self.label_index[k] = int(i)

    def save(self, name):
        """
        Save the indices
        :param name:
        :return:
        """
        with open(name + ".idx", "w") as f:
            print('MAXLEN', self.maxlen, "-", file=f)
            print('SUFLEN', self.suflen, "-", file=f)
            for key in self.label_index: print('LABEL', key, self.label_index[key], file=f)
            for key in self.word_index: print('WORD', key, self.word_index[key], file=f)
            for key in self.suf_index: print('SUF', key, self.suf_index[key], file=f)

    def encode_words(self, data):
        """
        Encodes X (text to indices)
        :param data:
        :return:
        """
        # Encode the words and add padding
        Xw = [[self.word_index[w['form']] if w['form'] in self.word_index else self.word_index['UNK'] for w in s] for s
              in data.sentences()]
        Xw = pad_sequences(maxlen=self.maxlen, sequences=Xw, padding="post", value=self.word_index['PAD'])
        # Encode the suffixes and add padding
        Xs = [[self.suf_index[w['lc_form'][-self.suflen:]] if w['lc_form'][-self.suflen:] in self.suf_index else
               self.suf_index['UNK'] for w in s] for s in data.sentences()]
        Xs = pad_sequences(maxlen=self.maxlen, sequences=Xs, padding="post", value=self.suf_index['PAD'])
        # Return the sentences
        return [Xw, Xs]

    def encode_labels(self, data):
        """
        Encodes labels and adds padding
        :param data:
        :return:
        """
        Y = [[self.label_index[w['tag']] for w in s] for s in data.sentences()]
        Y = pad_sequences(maxlen=self.maxlen, sequences=Y, padding="post", value=self.label_index["PAD"])
        return np.array(Y)

    def get_n_words(self):
        """
        Returns the number of word indices
        :return:
        """
        return len(self.word_index)

    def get_n_sufs(self):
        """
        Returns the number of suffix indices
        :return:
        """
        return len(self.suf_index)

    def get_n_labels(self):
        """
        Returns the number of labels
        :return:
        """
        return len(self.label_index)

    def word2idx(self, w):
        """
        Returns the index of a word
        :param w:
        :return:
        """
        return self.word_index[w]

    def suff2idx(self, s):
        """
        Returns the index of a suffix
        :param s:
        :return:
        """
        return self.suf_index[s]

    def label2idx(self, l):
        """
        Returns the index of a label
        :param l:
        :return:
        """
        return self.label_index[l]

    def idx2label(self, i):
        """
        Returns the label given an index
        :param i:
        :return:
        """
        for l in self.label_index:
            if self.label_index[l] == i:
                return l
        raise KeyError

# Train

In [7]:
def build_network(codes):
    """
    Function that builds the neural network.
    :param codes:
    :return:
    """
    global lstm_units, embeddingW_dim, embeddingS_dim
    
    # sizes
    n_words = codes.get_n_words() # Number of words in the vocabulary
    n_sufs = codes.get_n_sufs() # Number of suffixes in the vocabulary
    n_labels = codes.get_n_labels() # Number of word classes
    max_len = codes.maxlen # Maximum length of each sentence, for padding

    inptW = Input(shape=(max_len,))  # Word embedding layer
    embW = Embedding(input_dim=n_words, output_dim=embeddingW_dim,
                     input_length=max_len, mask_zero=True)(inptW)

    inptS = Input(shape=(max_len,))  # Suffix embedding layer
    embS = Embedding(input_dim=n_sufs, output_dim=embeddingS_dim,
                     input_length=max_len, mask_zero=True)(inptS)

    dropW = Dropout(0.1)(embW)
    dropS = Dropout(0.1)(embS)
    drops = concatenate([dropW, dropS])

    # biLSTM
    bilstm = Bidirectional(LSTM(units=lstm_units, return_sequences=True,
                                recurrent_dropout=0.1))(drops)
    # Fully connected layer with softmax activation function to obtain the labels
    out = TimeDistributed(Dense(n_labels, activation="softmax"))(bilstm)

    # Build and compile the model
    model = Model([inptW, inptS], out)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    return model

# Main

In this case, we comment the code line by line and use it as a base case to analyze the effect of modifying different parameters. The Dataset function assigns a series of features to each word: the word in lowercase, the raw word, the position of the first and last character in the sentence, and the tag according to B-I-O schema.

Then, a vocabulary is created, and each word and suffix is assigned the corresponding index. This way, the dataset is transformed into vectors, which are then used to train the model and make predictions.

In [8]:
# Directory with the files
traindir = 'data/train'
validationdir = 'data/devel'
modelname = 'models/model_1'

# Load the training and validation corpora
# ******************************************************************
# The following features are assigned to each word:
# - lc_form: word in lowercase
# - form: raw word
# - start: position of the first character in the sentence
# - end: position of the last character in the sentence
# - tag: word classification, according to B-I-O
# ******************************************************************
traindata = Dataset(traindir)
valdata = Dataset(validationdir)

# Create the indices
# ******************************************************************
# When initializing the object of the Codemaps class, sets with index identifiers are created for each word, suffix, and class.
# It is used to encode sentences
# ******************************************************************
max_len = 150 # Maximum length for padding
suf_len = 5 # Last characters of each word to obtain suffix
codes = Codemaps(traindata, max_len, suf_len)

# Build the network
# ******************************************************************
# Bidirectional LSTM model, with Embedding and Dropout layer
# ******************************************************************
# These are some model parameters (they are imported in the build_network function as global variables)
lstm_units = 10
embeddingW_dim = 50
embeddingS_dim = 50

model = build_network(codes)
model.summary()

# Map the datasets
# ******************************************************************
# Encodes each sentence by assigning the corresponding index to each word or suffix, and applies padding
# Also encodes the classifications
# ******************************************************************
Xt = codes.encode_words(traindata)
Yt = codes.encode_labels(traindata)
Xv = codes.encode_words(valdata)
Yv = codes.encode_labels(valdata)

# Train the model
model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)

# Save the model and the indices
model.save(modelname)
codes.save(modelname)



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 150, 50)              483800    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 150, 50)              247850    ['input_2[0][0]']             
                                                                                            

INFO:tensorflow:Assets written to: models/model_1\assets


# Change dimensions for Embbeding layer

Results do not improve, lower dimensions seem to work better.

In [9]:
dimensions = [10, 300]
model_names = ['embedding_10_dim', 'embedding_300_dim']

for dimension, model_name in zip(dimensions, model_names):
    clear_session()
    
    embeddingW_dim, embeddingS_dim = dimension, dimension
    model = build_network(codes)
    
    model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)
    model.save(model_name)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: embedding_10_dim\assets


INFO:tensorflow:Assets written to: embedding_10_dim\assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: embedding_300_dim\assets


INFO:tensorflow:Assets written to: embedding_300_dim\assets


# Change max Padding

Result do not improve increasing sentence length. So we choose lower length to decrease computational cost:

In [10]:
max_lens = [50, 300]
model_names = ['max_len_50', 'max_len_300']

for modelname, max_len in zip(model_names, max_lens):
    clear_session()
    
    codes = Codemaps(traindata, max_len, 5)
    
    lstm_units = 10
    embeddingW_dim, embeddingS_dim = 10, 10
    model = build_network(codes)
    
    Xt = codes.encode_words(traindata)
    Yt = codes.encode_labels(traindata)
    Xv = codes.encode_words(valdata)
    Yv = codes.encode_labels(valdata)
    
    model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)
    model.save(modelname)
    codes.save(modelname)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: max_len_50\assets


INFO:tensorflow:Assets written to: max_len_50\assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: max_len_300\assets


INFO:tensorflow:Assets written to: max_len_300\assets


# Change sufix length

The suffix length that yields the best results for predicting the word category is 5 characters. Suffixes have variable lengths, and in each case, a different number of characters might be needed. However, the model cannot know this in advance, so a fixed length must be chosen. In this case, it is observed that on average, the length that works best is 5 characters, although the difference is small.

In [11]:
suf_lens = [3, 7]
model_names = ['suf_len_3', 'suf_len_7']


for modelname, suf_len in zip(model_names, suf_lens):
    clear_session()
    
    codes = Codemaps(traindata, 50, suf_len)
    lstm_units = 10
    embeddingW_dim, embeddingS_dim = 10, 10
    model = build_network(codes)
    
    Xt = codes.encode_words(traindata)
    Yt = codes.encode_labels(traindata)
    Xv = codes.encode_words(valdata)
    Yv = codes.encode_labels(valdata)
    
    model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)
    model.save(modelname)
    codes.save(modelname)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: suf_len_3\assets


INFO:tensorflow:Assets written to: suf_len_3\assets


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: suf_len_7\assets


INFO:tensorflow:Assets written to: suf_len_7\assets


# Increase LSTM units

When we use 100 LSTM units in the model, the accuracy with the training data improves significantly. However, the accuracy with the validation data does not improve as much, and the difference between the two increases. This is due to overfitting, as expected when increasing the complexity of the model. We see that with 20 LSTM units, the result is similar to that with 100 LSTM units, and the overfitting is lower. Additionally, the computational cost of training the model is significantly lower with fewer LSTM units.

When using 300 LSTM units, the accuracy with the validation data does not improve either, as expected.

In conclusion, we see that it is better to use a limited number (20 is enough) of LSTM units, as this avoids overfitting, achieves similar results with the validation data, and the computational cost is lower.

In [12]:
clear_session()

codes = Codemaps(traindata, 50, 5)

# 100 unidades
lstm_units = 100
embeddingW_dim, embeddingS_dim = 20, 20
model = build_network(codes)
    
Xt = codes.encode_words(traindata)
Yt = codes.encode_labels(traindata)
Xv = codes.encode_words(valdata)
Yv = codes.encode_labels(valdata)

modelname = 'more_lstm_units'
model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)
model.save(modelname)
codes.save(modelname)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: more_lstm_units\assets


INFO:tensorflow:Assets written to: more_lstm_units\assets


In [13]:
clear_session()

codes = Codemaps(traindata, 50, 5)

# 300 unidades
lstm_units = 100
embeddingW_dim, embeddingS_dim = 20, 20
model = build_network(codes)
    
Xt = codes.encode_words(traindata)
Yt = codes.encode_labels(traindata)
Xv = codes.encode_words(valdata)
Yv = codes.encode_labels(valdata)

modelname = '300_lstm_units'
model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)
model.save(modelname)
codes.save(modelname)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: 300_lstm_units\assets


INFO:tensorflow:Assets written to: 300_lstm_units\assets


# Repeat the exercise using prefix instead of sufix

In [14]:
class Codemaps_pref:

    def __init__(self, data, maxlen=None, preflen=None):
        """
        Default constructor
        :param data:
        :param maxlen:
        :param preflen:
        """
        if isinstance(data, Dataset) and maxlen is not None and preflen is not None:
            self.__create_indexes(data, maxlen, preflen)

        elif type(data) == str and maxlen is None and preflen is None:
            self.__load(data)

        else:
            print('codemaps: Invalid or missing parameters in constructor')
            exit()

    def __create_indexes(self, data, maxlen, preflen):
        """
        Create indexes from the learning corpus
        Extracts all the words and labels in the given sentences and creates indexes to encode them as numbers
        :param data:
        :param maxlen:
        :param preflen:
        :return:
        """

        self.maxlen = maxlen
        self.preflen = preflen
        words = set([])
        lc_words = set([])
        prefs = set([])
        labels = set([])

        for s in data.sentences():
            for t in s:
                words.add(t['form'])
                prefs.add(t['lc_form'][:self.preflen])
                labels.add(t['tag'])

        self.word_index = {w: i + 2 for i, w in enumerate(list(words))}
        self.word_index['PAD'] = 0  # Padding
        self.word_index['UNK'] = 1  # Unknown words

        self.pref_index = {s: i + 2 for i, s in enumerate(list(prefs))}
        self.pref_index['PAD'] = 0  # Padding
        self.pref_index['UNK'] = 1  # Unknown prefixes

        self.label_index = {t: i + 1 for i, t in enumerate(list(labels))}
        self.label_index['PAD'] = 0  # Padding

    def __load(self, name):
        """
        Load the indexes
        :param name:
        :return:
        """
        self.maxlen = 0
        self.preflen = 0
        self.word_index = {}
        self.pref_index = {}
        self.label_index = {}

        with open(name + ".idx") as f:
            for line in f.readlines():
                (t, k, i) = line.split()
                if t == 'MAXLEN':
                    self.maxlen = int(k)
                elif t == 'PREFLEN':
                    self.preflen = int(k)
                elif t == 'WORD':
                    self.word_index[k] = int(i)
                elif t == 'PREF':
                    self.pref_index[k] = int(i)
                elif t == 'LABEL':
                    self.label_index[k] = int(i)

    def save(self, name):
        """
        Save the indexes
        :param name:
        :return:
        """
        with open(name + ".idx", "w") as f:
            print('MAXLEN', self.maxlen, "-", file=f)
            print('PREFLEN', self.preflen, "-", file=f)
            for key in self.label_index: print('LABEL', key, self.label_index[key], file=f)
            for key in self.word_index: print('WORD', key, self.word_index[key], file=f)
            for key in self.pref_index: print('PREF', key, self.pref_index[key], file=f)

    def encode_words(self, data):
        """
        Encodes X (text to indexes)
        :param data:
        :return:
        """
        # Encodes the words and adds padding
        Xw = [[self.word_index[w['form']] if w['form'] in self.word_index else self.word_index['UNK'] for w in s] for s
              in data.sentences()]
        Xw = pad_sequences(maxlen=self.maxlen, sequences=Xw, padding="post", value=self.word_index['PAD'])
        # Encodes the prefixes and adds padding
        Xs = [[self.pref_index[w['lc_form'][:self.preflen]] if w['lc_form'][:self.preflen] in self.pref_index else
               self.pref_index['UNK'] for w in s] for s in data.sentences()]
        Xs = pad_sequences(maxlen=self.maxlen, sequences=Xs, padding="post", value=self.pref_index['PAD'])
        # Returns the sentences
        return [Xw, Xs]

    def encode_labels(self, data):
        """
        Encodes labels and adds padding
        :param data:
        :return:
        """
        Y = [[self.label_index[w['tag']] for w in s] for s in data.sentences()]
        Y = pad_sequences(maxlen=self.maxlen, sequences=Y, padding="post", value=self.label_index["PAD"])
        return np.array(Y)

    def get_n_words(self):
        """
        Returns the number of word indexes
        :return:
        """
        return len(self.word_index)

    def get_n_prefs(self):
        """
        Returns the number of prefix indexes
        :return:
        """
        return len(self.pref_index)

    def get_n_labels(self):
        """
        Returns the number of labels
        :return:
        """
        return len(self.label_index)

    def word2idx(self, w):
        """
        Returns the index of a word
        :param w:
        :return:
        """
        return self.word_index[w]

    def pref2idx(self, s):
        """
        Returns the index of a prefix
        :param s:
        :return:
        """
        return self.pref_index[s]

    def label2idx(self, l):
        """
        Returns the index of a label
        :param l:
        :return:
        """
        return self.label_index[l]

    def idx2label(self, i):
        """
        Returns the label given an index
        :param i:
        :return:
        """
        for l in self.label_index:
            if self.label_index[l] == i:
                return l
        raise KeyError


In [15]:
def build_network_pref(codes):
    """
    Function that builds the neural network.
    :param codes:
    :return:
    """
    global lstm_units, embeddingW_dim, embeddingP_dim
    
    # sizes
    n_words = codes.get_n_words() # Number of words in the vocabulary
    n_prefs = codes.get_n_prefs() # Number of prefixes in the vocabulary
    n_labels = codes.get_n_labels() # Number of word classes
    max_len = codes.maxlen # Maximum length of each sentence, for padding

    inptW = Input(shape=(max_len,))  # Word embedding layer
    embW = Embedding(input_dim=n_words, output_dim=embeddingW_dim,
                     input_length=max_len, mask_zero=True)(inptW)

    inptP = Input(shape=(max_len,))  # Prefix embedding layer
    embP = Embedding(input_dim=n_prefs, output_dim=embeddingP_dim,
                     input_length=max_len, mask_zero=True)(inptP)

    dropW = Dropout(0.1)(embW)
    dropP = Dropout(0.1)(embP)
    drops = concatenate([dropW, dropP])

    # biLSTM
    bilstm = Bidirectional(LSTM(units=lstm_units, return_sequences=True,
                                recurrent_dropout=0.1))(drops)
    # Fully connected layer with softmax activation function to obtain the labels
    out = TimeDistributed(Dense(n_labels, activation="softmax"))(bilstm)

    # Build and compile the model
    model = Model([inptW, inptP], out)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

    return model

The result is worse if we use prefixes. This makes sense, as the type or category of a word can be better predicted by looking at its ending (suffix). In fact, the accuracy with the training data improves, but not the accuracy with the validation data. This implies that there is greater overfitting.

In [16]:
clear_session()

# Directory with the files
traindir = 'data/train'
validationdir = 'data/devel'
modelname = 'models/model_1'

# Load the training and validation corpora
# ******************************************************************
# The following features are assigned to each word:
# - lc_form: word in lowercase
# - form: raw word
# - start: position of the first character in the sentence
# - end: position of the last character in the sentence
# - tag: word classification, according to B-I-O
# ******************************************************************
traindata = Dataset(traindir)
valdata = Dataset(validationdir)

# Create the indexes
# ******************************************************************
# When initializing the object of the Codemaps class, sets with index identifiers are created for each word, suffix, and class.
# It is used to encode sentences
# ******************************************************************
max_len = 150 # Maximum length for padding
suf_len = 5 # Last characters of each word to obtain suffix
codes = Codemaps_pref(traindata, max_len, suf_len)

# Build the network
# ******************************************************************
# Bidirectional LSTM model, with Embedding and Dropout layer
# ******************************************************************
# These are some model parameters (imported into the build_network function as global variables)
lstm_units = 20
embeddingW_dim = 50
embeddingP_dim = 50

model = build_network_pref(codes)
model.summary()

# Map the datasets
# ******************************************************************
# Encodes each sentence by assigning the corresponding index to each word or suffix, and applies padding
# Also encodes the classifications
# ******************************************************************
Xt = codes.encode_words(traindata)
Yt = codes.encode_labels(traindata)
Xv = codes.encode_words(valdata)
Yv = codes.encode_labels(valdata)

# Train the model
model.fit(Xt, Yt, batch_size=32, epochs=10, validation_data=(Xv, Yv), verbose=1)

# Save the model and the indexes
model.save(modelname)
codes.save(modelname)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 150)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 150, 50)              483800    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 150, 50)              258750    ['input_2[0][0]']             
                                                                                              

INFO:tensorflow:Assets written to: models/model_1\assets
