In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')

from functions.split_dataset import split_dataset, prepare_to_eval
import matplotlib.pyplot as plt

from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input, Dropout
from tensorflow.keras.layers import TimeDistributed, Bidirectional

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from seqeval.metrics import f1_score, classification_report, accuracy_score
from keras import backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow_addons.layers import CRF
from tensorflow_addons.losses import SigmoidFocalCrossEntropy

In [None]:
# Reproducibility setup: fix the seed of every random number generator used
# by the notebook and force TensorFlow to run its ops single-threaded.
seed_value = 42

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it here presumably only affects child processes, not this kernel - confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)
# TF1-compat (`tf.compat.v1`) spelling of the same call, kept for reference:
# tf.compat.v1.set_random_seed(seed_value)

# 5. Configure a new global `tensorflow` session
# Both intra-op and inter-op parallelism are limited to one thread, and the
# resulting session is registered with the Keras backend.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

In [None]:
# Load every per-document IOB CSV found in `dataset_path` into one long frame
# with one row per token and the columns `Text`, `Tag` and `id` (the file name
# without its extension).
dataset_path = '../../../data/input/IOB/Layout3'

# Sort the listing so the row order does not depend on the file system, and
# skip anything that is not a CSV (e.g. `.ipynb_checkpoints`, `.DS_Store`),
# which `pd.read_csv` would otherwise fail on or misread.
csv_filenames = sorted(
    name for name in os.listdir(dataset_path) if name.endswith('.csv')
)

document_frames = []
for filename in csv_filenames:
    file_path = os.path.join(dataset_path, filename)
    # `splitext` removes only the trailing extension; `str.replace` would also
    # remove a '.csv' occurring in the middle of the file name.
    document_id = os.path.splitext(filename)[0]
    document_frames.append(
        pd.read_csv(file_path)[['Text', 'Tag']].assign(id=document_id)
    )

# Concatenate once at the end: calling `pd.concat` inside the loop re-copies
# all previously loaded rows on every iteration.
if document_frames:
    df = pd.concat(document_frames, ignore_index=True)
else:
    # Keep the expected schema even when the directory holds no CSV files.
    df = pd.DataFrame(columns=['Text', 'Tag', 'id'])

In [None]:
# Preview the first rows of the combined token-level frame.
df.head()

In [None]:
# `Series.astype` returns a new Series instead of modifying the column, so the
# result has to be assigned back; without the assignment this cell leaves `df`
# unchanged.
df['Text'] = df['Text'].astype(str)
df['Text'].head()

In [None]:
# Unique documents, tags and words in the corpus.
# The values are sorted so that the list order - and therefore the
# tag -> index mapping later built from `all_tags` - is the same on every run;
# iterating a bare `set` yields an arbitrary order that can differ between
# interpreter sessions. `key=str` keeps the sort well defined if a column
# holds mixed types (e.g. NaN next to strings).
all_documents = sorted(set(df["id"].values), key=str)
num_documents = len(all_documents)

all_tags = sorted(set(df["Tag"].values), key=str)
num_tags = len(all_tags)

all_words = sorted(set(df["Text"].values), key=str)
num_words = len(all_words)

print(f'Quantidade de documentos: {num_documents}')
print(f'Number of unique words: {num_words}')
print(f'Quantidade de tags: {num_tags}')

In [None]:
# Bar chart of how often each tag occurs; rows tagged "O" are left out.
entity_tags = df.loc[df["Tag"] != "O", "Tag"]
entity_tags.value_counts().plot.bar(figsize=(10, 5))

In [None]:
# Number of non-null `Text` entries per document, as a one-column frame
# indexed by `id`.
word_counts = df.groupby("id")["Text"].count().to_frame("Word count")

# Histogram of the document lengths; the cell then displays the frame's shape.
word_counts.hist(bins=50, figsize=(8, 6))
word_counts.shape

In [None]:
# Length of the longest document. `word_counts.max()` is a Series labelled by
# column name, so its first entry is taken positionally with `.iloc[0]`;
# plain `[0]` on a label-indexed Series relies on a deprecated positional
# fallback.
MAX_SENTENCE = word_counts.max().iloc[0]
print(f'Longest sentence contains {MAX_SENTENCE} words.')

In [None]:
# Two-way lookup between tags and integer ids, numbered by position in
# `all_tags`.
idx_tag_dict = dict(enumerate(all_tags))
tag_idx_dict = {tag: idx for idx, tag in idx_tag_dict.items()}

tag_idx_dict

In [None]:
def to_tuples(data):
    """Return the rows of `data` as a list of `(Text, Tag)` tuples.

    `data` must expose `Text` and `Tag` columns; the row order is preserved.
    """
    words = data["Text"].values.tolist()
    tags = data["Tag"].values.tolist()
    return list(zip(words, tags))

# One list of (word, tag) pairs per document. `groupby` yields the groups in
# sorted `id` order, so `sentences[i]` belongs to the i-th sorted document id.
# The groups are iterated directly instead of going through `groupby.apply`,
# whose handling of the grouping column is deprecated in recent pandas.
sentences = [to_tuples(group) for _, group in df.groupby("id")]

print(sentences[0][:10])

In [None]:
# Document-level frame: one row per document id with its (word, tag) pairs.
df_idx = pd.DataFrame()

# `sentences` was produced by grouping on `id`, i.e. in sorted-id order. The
# ids must be listed in that same order; `list(set(...))` returns them in an
# arbitrary order and would attach each id to another document's tokens.
df_idx['invoice_id'] = sorted(df['id'].unique())
df_idx['text_tag'] = sentences

In [None]:
# Preview the document-level frame (`invoice_id`, `text_tag`).
df_idx.head()

In [None]:
# Flatten each document's (word, tag) pairs into two space-separated strings:
# one with the words, one with the tags.
# NOTE(review): a word that itself contains whitespace would turn into several
# tokens once the string is split again - confirm the input has none.
df_idx['text'] = df_idx['text_tag'].map(
    lambda pairs: " ".join(str(word) for word, _ in pairs)
)
df_idx['tag'] = df_idx['text_tag'].map(
    lambda pairs: " ".join(str(tag) for _, tag in pairs)
)

In [None]:
# Preview the frame after adding the joined `text` and `tag` columns.
df_idx.head()

In [None]:
# (number of documents, number of columns) of the document-level frame.
df_idx.shape

In [None]:
# Split the joined strings back into per-document lists on whitespace.
df_idx['tokenized_text'] = df_idx['text'].map(str.split)
df_idx['tag_list'] = df_idx['tag'].map(str.split)
df_idx.head()

In [None]:
# Plain Python lists for the tokenizer (joined text per document) and for the
# tag encoder (list of tags per document).
texts_list = df_idx['text'].to_list()
tags_list = df_idx['tag_list'].to_list()

In [None]:
# Encode every document as one vocabulary index per token.
#
# The tokenizer is fed pre-split token lists rather than raw strings. For
# string input Keras replaces each `filters` character with a space and
# re-splits, which deletes tokens made only of those characters and breaks up
# tokens containing them; the encoded sequence then no longer has one entry
# per tagged token and X drifts out of alignment with the tag sequences.
# List input is used as given, so positions stay aligned. `filters` therefore
# only applies if this tokenizer is later called on raw strings.
token_lists = [text.split() for text in texts_list]

tokenizer = Tokenizer(lower=False, filters='!?~[]()^_{"}\'%')
tokenizer.fit_on_texts(token_lists)
encoded_text_list = tokenizer.texts_to_sequences(token_lists)

print("Vocab size of Tokenizer ",len(tokenizer.word_index)+1)

In [None]:
# Map every tag of every document to its integer id.
encoded_tags_list = [
    [tag_idx_dict[label] for label in document_tags]
    for document_tags in tags_list
]

In [None]:
max_len = 502

# Bring every sequence to exactly `max_len` entries, padding at the end:
# words are padded with index 0, tags with the index of the 'O' tag.
# NOTE(review): sequences longer than `max_len` are cut by `pad_sequences`;
# which end is dropped depends on its `truncating` default - confirm.
post_pad = dict(maxlen=max_len, padding="post")
padded_encoded_text_list = pad_sequences(encoded_text_list, value=0, **post_pad)
padded_encoded_tags_list = pad_sequences(encoded_tags_list, value=tag_idx_dict['O'], **post_pad)

In [None]:
# One-hot encode the padded tag ids, one array per document, with `num_tags`
# classes.
dummy_tags = [
    to_categorical(tag_row, num_classes=num_tags)
    for tag_row in padded_encoded_tags_list
]

In [None]:
# Train-val-test split of the padded word ids and the one-hot tags.
# `split_dataset` is a project helper; later cells index its two results with
# the keys 'train', 'val' and 'test'.
# NOTE(review): split ratios and shuffling are defined inside the helper and
# are not visible here - confirm they honour the seed set above.
X, y = split_dataset(padded_encoded_text_list, dummy_tags)

In [None]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow_addons.layers import CRF
from tensorflow_addons.losses import SigmoidFocalCrossEntropy

In [None]:
# Model hyper-parameters.
max_len=502
embedding_dim=100
vocab_size=len(tokenizer.word_index)
lstm_units=50

# Architecture: Embedding -> BiLSTM -> Dropout -> per-timestep Dense -> CRF.
input_word = Input(shape=(max_len,))

# `vocab_size + 1` leaves room for index 0, the value used to pad the word
# sequences.
hidden = Embedding(input_dim=vocab_size+1, output_dim=embedding_dim, input_length=max_len)(input_word)
hidden = Bidirectional(LSTM(units=lstm_units, return_sequences=True))(hidden)
hidden = Dropout(0.1)(hidden)
hidden = TimeDistributed(Dense(50, activation="relu"))(hidden)

# The CRF width must equal the number of classes of the one-hot targets, which
# are built with `num_classes=num_tags`; a hard-coded 19 breaks as soon as the
# dataset has a different number of tags.
crf = CRF(num_tags)
decoded_sequence, potentials, sequence_length, chain_kernel = crf(hidden)

# The model is trained on the CRF layer's per-tag potentials.
model = Model(input_word, potentials)

model.summary()

In [None]:
# The model outputs the CRF layer's `potentials`, i.e. unnormalised per-tag
# scores, not probabilities. The focal loss assumes probabilities unless told
# otherwise, so it must be created with `from_logits=True` to apply the
# sigmoid itself.
model.compile(optimizer=Adam(learning_rate=0.00001),
              loss=SigmoidFocalCrossEntropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Stop training once the validation loss has not improved for 2 epochs.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

# Train for at most 50 epochs. The targets are converted to arrays because the
# one-hot tags were built as Python lists of arrays. `callbacks` is documented
# as a list of callbacks, so the single callback is wrapped in one.
history = model.fit(X['train'],
                    np.array(y['train']),
                    validation_data=(X['val'], np.array(y['val'])),
                    batch_size=512,
                    callbacks=[early_stopping],
                    epochs=50)

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Train')
ax.plot(history.history['val_loss'], label='Validation')
ax.set(title='Model loss', xlabel='Epoch', ylabel='Loss')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Train')
ax.plot(history.history['val_accuracy'], label='Validation')
ax.set(title='Model Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Predict on the held-out test split.
# NOTE(review): `model` was built with the CRF layer's `potentials` as output,
# not its `decoded_sequence`, so these are per-tag scores and a later argmax
# presumably ignores the CRF transition scores - confirm this is intended.
y_pred = model.predict(X['test'])

In [None]:
# Collapse the last (per-tag) axis to the index of the highest value, for both
# the predictions and the one-hot ground truth.
pred_index_array = y_pred.argmax(axis=-1)
test_index_array = np.asarray(y['test']).argmax(axis=-1)

In [None]:
# Convert the index arrays into the two tag collections passed to the seqeval
# metrics below, using the index -> tag mapping.
# NOTE(review): `prepare_to_eval` is a project helper; whether it removes the
# padded positions is not visible here - confirm.
real_tag, pred_tag = prepare_to_eval(idx_tag_dict, test_index_array, pred_index_array)

In [None]:
# seqeval accuracy on the test split.
# NOTE(review): if padded positions are still present in `real_tag`, the many
# 'O' paddings would presumably inflate this number - verify.
accuracy_score(real_tag, pred_tag)

In [None]:
# seqeval F1 score on the test split.
f1_score(real_tag, pred_tag)

In [None]:
# seqeval classification report for the test split; `zero_division=0` is
# passed so that undefined scores are reported as 0.
print(classification_report(real_tag, pred_tag, zero_division=0))