# Imports

In [None]:
from keras.models import Sequential
from keras.models import Model

from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import TimeDistributed
from keras.layers import Input
from keras.layers import Masking
from keras.layers import Dropout
from keras.layers import GRU

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from keras.callbacks import EarlyStopping

from keras.optimizers import Adam

In [None]:
from urllib import request
import os
import time
import zipfile
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import gensim
import gensim.downloader as gloader
import scipy.sparse 
import gc
import itertools

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support



# Dataset download and encoding

In [None]:
dataset_folder = os.path.join(os.getcwd(), "Datasets")

if not os.path.exists(dataset_folder):
  os.makedirs(dataset_folder)

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "dependency_treebank.zip")

def download_dataset(download_path, url):
    if not os.path.exists(download_path):
        print("Downloading dataset...")
        request.urlretrieve(url, download_path)
        print("Download complete!")

def extract_dataset(download_path, extract_path):
    print("Extracting dataset...")
    with zipfile.ZipFile(download_path, "r") as loaded_zip:
        loaded_zip.extractall(extract_path)
    print("Extraction completed!")


download_dataset(dataset_path, url)
extract_dataset(dataset_path, dataset_folder)

Downloading dataset...
Download complete!
Extracting dataset...
Extraction completed!


In [None]:
def encode_dataset(dataset_folder, range, test=False): 
    special_characters = string.punctuation
    dataframe_rows = []

    for filename in sorted(os.listdir(dataset_folder))[range[0]: range[1]]:
      
      file_path = os.path.join(dataset_folder, filename)
      
      with open(file_path, mode='r', encoding='utf-8') as text_file:
        sentence = []
        tags = []
        for line in text_file:
          
          try:  
            text, tag, _ = line.split()
            sentence.append(text)
            tags.append(tag)

          except ValueError:
            
            if not test:
              dataframe_row = {
                "Sentence": sentence,
                "Tags": tags
              }
            
              sentence = []
              tags = []
              dataframe_rows.append(dataframe_row)
      
        if test:
          dataframe_rows.append({"Document": sentence, "Tags": tags})
  
    print("Dataset encoded!")
    if test:
      return pd.DataFrame(dataframe_rows, columns=["Document", "Tags"])
    else:
      return pd.DataFrame(dataframe_rows, columns=["Sentence", "Tags"])

In [None]:
treebank_folder = os.path.join(dataset_folder, "dependency_treebank")

train_df = encode_dataset(treebank_folder, range=(0, 100))
print("Train set: {}".format(train_df.shape))

val_df = encode_dataset(treebank_folder, range=(100, 150))
print("Val set: {}".format(val_df.shape))

test_df = encode_dataset(treebank_folder, range=(150, 200), test=True)
print("Test set: {}".format(test_df.shape))

train_df.head()

Dataset encoded!
Train set: (1863, 2)
Dataset encoded!
Val set: (1249, 2)
Dataset encoded!
Test set: (49, 2)


Unnamed: 0,Sentence,Tags
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
2,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."
3,"[Lorillard, Inc., ,, the, unit, of, New, York-...","[NNP, NNP, ,, DT, NN, IN, JJ, JJ, NNP, NNP, WD..."
4,"[Although, preliminary, findings, were, report...","[IN, JJ, NNS, VBD, VBN, RBR, IN, DT, NN, IN, ,..."


# Vocabulary, co-occurrence and embedding matrix

In [None]:
embedding_dimension = 300
download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
try:
  embedding_model = gloader.load(download_path)
except ValueError as e:
  print("Invalid embedding model name! Check the embedding dimension:")
  print("Glove: 50, 100, 200, 300")
  raise e




In [None]:
def build_vocabulary(corpus):

    wordlist = []
    for x in corpus:
      wordlist.extend(x)
    words = set(wordlist)
    word_vocab = {}
    inverse_word_vocab = {}
    for i, word in enumerate(words):
      word_vocab[i] = word
      inverse_word_vocab[word] = i
    
    return word_vocab, inverse_word_vocab, words


train_idx_to_word, train_word_to_idx, train_word_listing = build_vocabulary(train_df['Sentence'])
val_idx_to_word, val_word_to_idx, val_word_listing = build_vocabulary(val_df['Sentence'])
test_idx_to_word, test_word_to_idx, test_word_listing = build_vocabulary(test_df['Document'])

print("{} words in training set".format(len(train_word_listing)))
print("{} words in validation set".format(len(val_word_listing)))
print("{} words in test set".format(len(test_word_listing)))

7841 words in training set
5768 words in validation set
3623 words in test set


In [None]:
def co_occurrence_count(corpus, idx_to_word, word_to_idx, window_size=1):

    data = []
    index_i = []
    index_j = []

    for _, words in corpus.iteritems():
      for j, word in enumerate(words[::]):
        start = max(0, j-window_size)
        end = min(len(words), j + window_size+1)
        sub_sentence = words[start:end]
        for w in sub_sentence:
          if word != w:
            data.append(1.)
            index_i.append(word_to_idx[word])
            index_j.append(word_to_idx[w])
            
    co_occurrence = scipy.sparse.csr_matrix((data, (index_i, index_j)))

    return co_occurrence

window_size = 1

# Clean RAM before re-running this code snippet to avoid session crash
if 'train_co_occurrence_matrix' in globals():
    del train_co_occurrence_matrix
    gc.collect()
    time.sleep(10.)
if 'val_co_occurrence_matrix' in globals():
    del val_co_occurrence_matrix
    gc.collect()
    time.sleep(10.)


print("Building co-occurrence count matrix... (it may take a while...)")
train_co_occurrence_matrix = co_occurrence_count(train_df['Sentence'], train_idx_to_word, train_word_to_idx, window_size)
val_co_occurrence_matrix = co_occurrence_count(val_df['Sentence'], val_idx_to_word, val_word_to_idx, window_size)
test_co_occurrence_matrix = co_occurrence_count(test_df['Document'], test_idx_to_word, test_word_to_idx, window_size)

print("Building completed!")

Building co-occurrence count matrix... (it may take a while...)
Building completed!


In [None]:
def check_OOV_terms(embedding_model, word_listing):

    OOV = [word for word in word_listing if word not in embedding_model.vocab]
    return OOV


train_oov_terms = check_OOV_terms(embedding_model, train_word_listing)
val_oov_terms = check_OOV_terms(embedding_model, val_word_listing)
test_oov_terms = check_OOV_terms(embedding_model, test_word_listing)

print("Total OOV terms in train: {0} ({1:.2f}%)".format(len(train_oov_terms), float(len(train_oov_terms)) / len(train_word_listing)*100))
print("Total OOV terms in val: {0} ({1:.2f}%)".format(len(val_oov_terms), float(len(val_oov_terms)) / len(val_word_listing)*100))
print("Total OOV terms in test: {0} ({1:.2f}%)".format(len(test_oov_terms), float(len(test_oov_terms)) / len(test_word_listing)*100))

Total OOV terms in train: 2281 (29.09%)
Total OOV terms in val: 1479 (25.64%)
Total OOV terms in test: 957 (26.41%)


In [None]:
def build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, idx_to_word, co_occurrence_matrix):
   
    embedding_matrix = np.ndarray((len(word_to_idx), embedding_dimension))

    for w in word_to_idx:
      if w in embedding_model.vocab:
        embedding_matrix[word_to_idx[w], :] = embedding_model.get_vector(w)
      else:
        occurrences = co_occurrence_matrix[word_to_idx[w]]

        close_words = []
        for i in occurrences.indices:
          if idx_to_word[i] in embedding_model.vocab:
            close_words.append(embedding_model.get_vector(idx_to_word[i]))
        if len(close_words) == 0:
          embedding_matrix[word_to_idx[w], :] = np.random.rand(1, embedding_dimension)
        else:
          embedding_matrix[word_to_idx[w], :] = np.average(close_words)

    return embedding_matrix 
  
train_embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, train_word_to_idx, train_idx_to_word, train_co_occurrence_matrix)
val_embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, val_word_to_idx, val_idx_to_word, val_co_occurrence_matrix)
test_embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, test_word_to_idx, test_idx_to_word, test_co_occurrence_matrix)

print("Train embedding matrix shape: {}".format(train_embedding_matrix.shape))
print("Val embedding matrix shape: {}".format(val_embedding_matrix.shape))
print("Test embedding matrix shape: {}".format(test_embedding_matrix.shape))

Train embedding matrix shape: (7841, 300)
Val embedding matrix shape: (5768, 300)
Test embedding matrix shape: (3623, 300)


# Embedding sentences and tags

In [None]:
def embed_sentence(embedding_matrix, sentence, word_to_idx):
  embedded = []
  for w in sentence:
    embedded.append(embedding_matrix[word_to_idx[w]])
  return embedded

def embed_tags(df, tags_dict):
  df['Embedded tags indexes'] = [list(map(tags_dict.get, tags)) for tags in df.Tags]
  df['Embedded tags one hot'] = [to_categorical(tags, num_classes=len(tags_dict)+1, dtype='int32') for tags in df['Embedded tags indexes']]
  

unique_tags = set(itertools.chain.from_iterable(pd.concat([train_df, val_df]).Tags))
tags_dict = {item:val+1 for val,item in enumerate(unique_tags)}

train_df['Embedded sentence'] = [embed_sentence(train_embedding_matrix, sentence, train_word_to_idx) for sentence in train_df['Sentence']]
val_df['Embedded sentence'] = [embed_sentence(val_embedding_matrix, sentence, val_word_to_idx) for sentence in val_df['Sentence']]
test_df['Embedded sentence'] = [embed_sentence(test_embedding_matrix, document, test_word_to_idx) for document in test_df['Document']]

embed_tags(train_df, tags_dict)
embed_tags(val_df, tags_dict)
embed_tags(test_df, tags_dict)

train_df.head()

Unnamed: 0,Sentence,Tags,Embedded sentence,Embedded tags indexes,Embedded tags one hot
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...","[[0.1550200958180733, 0.48581725412669174, 0.3...","[9, 9, 22, 24, 1, 16, 22, 7, 11, 40, 28, 33, 4...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,..."
1,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS...","[[0.000534959661308676, 0.000534959661308676, ...","[40, 28, 33, 28, 41, 45, 3, 11, 9, 28, 1, 17, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V...","[[-0.006698744371533394, -0.006698744371533394...","[40, 28, 28, 22, 28, 22, 17, 41, 16, 33, 5, 17...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"[Lorillard, Inc., ,, the, unit, of, New, York-...","[NNP, NNP, ,, DT, NN, IN, JJ, JJ, NNP, NNP, WD...","[[-0.0017415573820471764, -0.00174155738204717...","[9, 9, 22, 40, 28, 33, 16, 16, 9, 9, 42, 17, 9...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,..."
4,"[Although, preliminary, findings, were, report...","[IN, JJ, NNS, VBD, VBN, RBR, IN, DT, NN, IN, ,...","[[-0.002929060021415353, -0.002929060021415353...","[33, 16, 1, 39, 45, 37, 33, 40, 28, 33, 22, 40...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


# Preparing data

In [None]:
def valid_labels(tags_dict):

  labels = []
  target_names = []

  for key, value in tags_dict.items():
    if not key in string.punctuation and key != "``" and key != "''":
      labels.append(value)
      target_names.append(key)

  return [labels, target_names]

In [None]:
TRAIN_MAX_SENTENCE_LENGTH = train_df.Sentence.str.len().max()
VAL_MAX_SENTENCE_LENGTH = val_df.Sentence.str.len().max()
TEST_MAX_DOCUMENT_LENGTH = test_df.Document.str.len().max()

MAX_SENTENCE_LENGTH = max(TRAIN_MAX_SENTENCE_LENGTH, VAL_MAX_SENTENCE_LENGTH)

EMBEDDING_SIZE = embedding_dimension

NUM_CLASSES = len(tags_dict) + 1

[VALID_LABELS, VALID_LABELS_NAMES] = valid_labels(tags_dict)

input_shape = (None, EMBEDDING_SIZE)

In [None]:
X_train = train_df['Embedded sentence'].values
y_train = train_df['Embedded tags one hot'].values

X_val = val_df['Embedded sentence'].values
y_val = val_df['Embedded tags one hot'].values

X_train_padded = pad_sequences(X_train, maxlen=MAX_SENTENCE_LENGTH, padding="post", dtype='float32')
y_train_padded = pad_sequences(y_train, maxlen=MAX_SENTENCE_LENGTH, padding="post")

X_val_padded = pad_sequences(X_val, maxlen=MAX_SENTENCE_LENGTH, padding="post", dtype='float32')
y_val_padded = pad_sequences(y_val, maxlen=MAX_SENTENCE_LENGTH, padding="post")

callback = EarlyStopping(monitor='val_acc', mode='max', patience=5, restore_best_weights=True)




# BILSTM

In [None]:
def BiLSTM(NUM_CLASSES, input_shape):
  
  lstm_model = Sequential(name="BiLSTM")

  lstm_model.add(Masking(mask_value=0, input_shape=input_shape))
  lstm_model.add(Bidirectional(LSTM(512, return_sequences=True)))
  lstm_model.add(TimeDistributed(Dense(NUM_CLASSES, activation="softmax")))
  lstm_model.add(Dropout(0.2))
  
  adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999)
  lstm_model.compile(loss      =  'categorical_crossentropy',
                    optimizer =  adam,
                    metrics   =  ['acc'])
  
  lstm_model.summary()
  
  return lstm_model

In [None]:
biLSTM = BiLSTM(NUM_CLASSES, input_shape)
biLSTM_history = biLSTM.fit(X_train_padded, y_train_padded, batch_size=128, epochs=50, validation_data=(X_val_padded, y_val_padded), callbacks=[callback])

Model: "BiLSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (None, None, 300)         0         
_________________________________________________________________
bidirectional (Bidirectional (None, None, 1024)        3330048   
_________________________________________________________________
time_distributed (TimeDistri (None, None, 46)          47150     
_________________________________________________________________
dropout (Dropout)            (None, None, 46)          0         
Total params: 3,377,198
Trainable params: 3,377,198
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


# BIGRU

In [None]:
def BiGRU(NUM_CLASSES, input_shape):
  gru_model = Sequential(name="BiGRU")

  gru_model.add(Masking(mask_value=0, input_shape=input_shape))
  gru_model.add(Bidirectional(GRU(512, return_sequences=True)))
  gru_model.add(TimeDistributed(Dense(NUM_CLASSES, activation="softmax")))
  gru_model.add(Dropout(0.2))
  
  adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999)
  gru_model.compile(loss      =  'categorical_crossentropy',
                    optimizer =  adam,
                    metrics   =  ['acc'])
  
  gru_model.summary()
  
  return gru_model

In [None]:
biGRU = BiGRU(NUM_CLASSES, input_shape)
biGRU_history = biGRU.fit(X_train_padded, y_train_padded, batch_size=128, epochs=50, validation_data=(X_val_padded, y_val_padded), callbacks=[callback])

Model: "BiGRU"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (None, None, 300)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 1024)        2500608   
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 46)          47150     
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 46)          0         
Total params: 2,547,758
Trainable params: 2,547,758
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


# BIBILSTM

In [None]:
def BibiLSTM(NUM_CLASSES, input_shape):
  
  lstm_model = Sequential(name="BibiLSTM")

  lstm_model.add(Masking(mask_value=0, input_shape=input_shape))
  lstm_model.add(Bidirectional(LSTM(512, return_sequences=True)))
  lstm_model.add(Bidirectional(LSTM(512, return_sequences=True)))
  lstm_model.add(TimeDistributed(Dense(NUM_CLASSES, activation="softmax")))
  lstm_model.add(Dropout(0.2))
  
  adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999)
  lstm_model.compile(loss      =  'categorical_crossentropy',
                    optimizer =  adam,
                    metrics   =  ['acc'])
  
  lstm_model.summary()
  
  return lstm_model

In [None]:
bibiLSTM = BibiLSTM(NUM_CLASSES, input_shape)
bibiLSTM_history = bibiLSTM.fit(X_train_padded, y_train_padded, batch_size=128, epochs=50, validation_data=(X_val_padded, y_val_padded), callbacks=[callback])

Model: "BibiLSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_2 (Masking)          (None, None, 300)         0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 1024)        3330048   
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 1024)        6295552   
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 46)          47150     
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 46)          0         
Total params: 9,672,750
Trainable params: 9,672,750
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epo

# BILSTM + CRF

In [None]:
!pip install tf2crf -q
from tf2crf import CRF, ModelWithCRFLoss

In [None]:
def LSTMCRF(NUM_CLASSES, input_shape):
  
  input = Input(shape=(input_shape))
  mask = Masking(mask_value=0)(input)
  bilstm = Bidirectional(LSTM(512, return_sequences=True))(mask)
  dense = Dense(NUM_CLASSES, activation=None)(bilstm)
  drop = Dropout(0.2)(dense)
  crf = CRF()
  output = crf(drop)

  base_model = Model(input, output)
  
  lstmcrf_model = ModelWithCRFLoss(base_model)
  
  adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999)
  lstmcrf_model.compile(optimizer = adam, metrics=['acc'])
  
  return lstmcrf_model

In [None]:
y_train_indexes = train_df['Embedded tags indexes']
y_val_indexes = val_df['Embedded tags indexes']

y_train_indexes_padded = pad_sequences(y_train_indexes, maxlen=MAX_SENTENCE_LENGTH, padding="post", value=0)
y_val_indexes_padded = pad_sequences(y_val_indexes, maxlen=MAX_SENTENCE_LENGTH, padding="post", value=0)

callback = EarlyStopping(monitor='val_val_accuracy', mode='max', patience=5, restore_best_weights=True)

lstmcrf = LSTMCRF(NUM_CLASSES, input_shape=input_shape)
lstmcrf_history = lstmcrf.fit(X_train_padded, y_train_indexes_padded, batch_size=128, epochs=50, validation_data=(X_val_padded, y_val_indexes_padded), callbacks=callback)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


# BIBILSTM + CRF

In [None]:
def BibiLSTMCRF(NUM_CLASSES, input_shape):
  
  input = Input(shape=(input_shape))
  mask = Masking(mask_value=0)(input)
  bilstm = Bidirectional(LSTM(512, return_sequences=True))(mask)
  bilstm = Bidirectional(LSTM(512, return_sequences=True))(bilstm)
  dense = TimeDistributed(Dense(NUM_CLASSES, activation=None))(bilstm)
  drop = Dropout(0.2)(dense)
  crf = CRF()
  output = crf(drop)

  base_model = Model(input, output)
  
  lstmcrf_model = ModelWithCRFLoss(base_model)
  
  adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999)
  lstmcrf_model.compile(optimizer = adam, metrics=['acc'])
  
  return lstmcrf_model

In [None]:
bibilstmcrf = BibiLSTMCRF(NUM_CLASSES, input_shape=input_shape)
bibilstmcrf_history = bibilstmcrf.fit(X_train_padded, y_train_indexes_padded, batch_size=128, epochs=50, validation_data=(X_val_padded, y_val_indexes_padded), callbacks=callback)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50


# Evaluation of models

In [None]:
def evaluate_model(y_true, y_pred, crf=False):

  y_true_valid = []
  y_pred_valid = []

  for i in range(y_pred.shape[0]):
    for j in range(y_pred.shape[1]):
      if crf:
        if y_true[i][j] != 0:
          y_true_valid.append(y_true[i][j])
          y_pred_valid.append(y_pred[i][j])
      else:
        if not np.all((y_true[i][j] == 0)):
          y_true_valid.append(np.argmax(y_true[i][j]))
          y_pred_valid.append(np.argmax(y_pred[i][j]))
  

  print("\t\tEvaluating punctuation\t\tNot evaluating punctuation")
  
  averages = ['macro', 'micro', 'weighted']
  
  for avg in averages:
    print("F1 {}:\t\t {:.2f}  \t\t\t\t{:.2f}".format(avg, f1_score(y_true_valid, y_pred_valid, average=avg, zero_division=0),
                                              f1_score(y_true_valid, y_pred_valid, labels=VALID_LABELS, average=avg, zero_division=0)))

In [None]:
models = [biLSTM, biGRU, bibiLSTM]
models_histories = [biLSTM_history, biGRU_history, bibiLSTM_history]

models_crf = [lstmcrf, bibilstmcrf]
models_crf_histories =[lstmcrf_history, bibilstmcrf_history]

for i,h in enumerate(models_histories):
  m = models[i]
  print("-" * 80)
  print("\t\t\tModel evaluation: " + m.name +"\n")
  print("Train accuracy: {:.2f}".format(max(h.history['acc'])))
  print("Validation accuracy: {:.2f}\n".format(max(h.history['val_acc'])))
  pred = m.predict(X_val_padded)
  evaluate_model(y_val_padded, pred)

for i,h in enumerate(models_crf_histories):
  m = models_crf[i]
  print("-" * 80)
  print("\t\t\tModel evaluation: " + "Bi"*(i+1) +"LSTMCRF\n")
  print("Train accuracy: {:.2f}".format(max(h.history['accuracy'])))
  print("Validation accuracy: {:.2f}\n".format(max(h.history['val_val_accuracy'])))
  pred = m.predict(X_val_padded)
  evaluate_model(y_val_indexes_padded, pred[0], crf=True)

--------------------------------------------------------------------------------
			Model evaluation: BiLSTM

Train accuracy: 0.80
Validation accuracy: 0.90

		Evaluating punctuation		Not evaluating punctuation
F1 macro:		 0.75  				0.71
F1 micro:		 0.90  				0.89
F1 weighted:		 0.90  				0.88
--------------------------------------------------------------------------------
			Model evaluation: BiGRU

Train accuracy: 0.80
Validation accuracy: 0.90

		Evaluating punctuation		Not evaluating punctuation
F1 macro:		 0.74  				0.72
F1 micro:		 0.90  				0.89
F1 weighted:		 0.90  				0.89
--------------------------------------------------------------------------------
			Model evaluation: BibiLSTM

Train accuracy: 0.79
Validation accuracy: 0.90

		Evaluating punctuation		Not evaluating punctuation
F1 macro:		 0.71  				0.66
F1 micro:		 0.90  				0.88
F1 weighted:		 0.89  				0.88
--------------------------------------------------------------------------------
			Model evaluation: BiLSTMCRF

T

# Testing best model

In [None]:
X_test = test_df['Embedded sentence'].values
y_test = test_df['Embedded tags indexes'].values
X_test_padded = pad_sequences(X_test, maxlen=TEST_MAX_DOCUMENT_LENGTH, padding="post", value=0, dtype="float32")
y_test_indexes_padded = pad_sequences(y_test, maxlen=TEST_MAX_DOCUMENT_LENGTH, padding="post", value=0)

In [None]:
y_test_pred = lstmcrf.predict(X_test_padded)

In [None]:
evaluate_model(y_test_indexes_padded, y_test_pred[0], crf=True)

		Evaluating punctuation		Not evaluating punctuation
F1 macro:		 0.82  				0.70
F1 micro:		 0.91  				0.89
F1 weighted:		 0.91  				0.90


# Error analysis

In [None]:
y_true_valid = y_test_indexes_padded[y_test_indexes_padded != 0]
y_pred_valid = y_test_pred[0][y_test_indexes_padded != 0]

In [None]:
print(classification_report(y_true_valid, y_pred_valid, labels=VALID_LABELS, target_names=VALID_LABELS_NAMES, zero_division=0))
_, _, f1, sup = precision_recall_fscore_support(y_true_valid, y_pred_valid, labels=VALID_LABELS, zero_division=0)

worst = np.argsort(f1)[:6]
w_tags = []
for w in worst:
  w_tags.append(VALID_LABELS_NAMES[w])

print("Tags with worst f1 value")
print("\tTags:\t\t\t", w_tags)
print("\t F1:\t\t\t", f1[worst])
print("       Support:\t\t\t", sup[worst])
print("Support percentage on total(%):\t", np.around(sup[worst] / sum(sup) * 100, 3))

              precision    recall  f1-score   support

          MD       0.98      1.00      0.99       167
          CD       0.96      0.96      0.96       858
          JJ       0.72      0.78      0.75       918
         RBS       1.00      0.33      0.50         3
          UH       0.00      0.00      0.00         0
          FW       0.00      0.00      0.00         0
          NN       0.92      0.89      0.90      2383
         RBR       0.50      0.47      0.48        15
         PRP       0.97      0.82      0.89       192
         PDT       0.00      0.00      0.00         4
         VBP       0.91      0.88      0.89       134
          DT       0.99      0.87      0.93      1335
         VBZ       0.96      0.95      0.95       280
         WDT       0.96      0.96      0.96        84
       -RRB-       0.10      0.44      0.16        18
         JJR       0.87      0.68      0.76        59
         WP$       1.00      0.75      0.86         4
          RP       0.58    

If we try to check the supports of the tags with lowest f1 score we can see that they have very small or none support. This of course affects the evaluation of our model, and we can see this even looking at the micro average f1 and the weighted average f1 that, giving weights to the score according to support, show greater results.

Possible solutions:

*  Oversample the tags with smaller supports and undersample the ones with bigger ones
*   Change the split of train, val, test in order to get more balanced classes
*   Modify the embedding using one single vocabulary, in order to have the same embedding for OOV in train test and split






