Uncomment and execute the following two cells when running this notebook in Google Colab

In [None]:
# # execute this cell when loading the notebook for the first time
# ! pip install gensim==4.2.0
# ! pip install keras==2.8.0

# ! git clone https://github.com/michele98/POS_tagging


# from google.colab import drive
# drive.mount('/content/drive')

# drive_folder = '/content/drive/MyDrive'

In [None]:
# # execute this cell each time the runtime is restarted

# %cd -0
# %cd POS_tagging

# import os
# os.environ['TF_DETERMINISTIC_OPS'] = '1'

Main Imports

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import urllib.request

from zipfile import ZipFile

# Seed NumPy's and TensorFlow's global random generators with the same value.
random_seed = 42
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
#os.environ['TF_DETERMINISTIC_OPS'] = '1'

# IPython autoreload in mode 2: imported modules are reloaded before each
# execution, so edits to the project's .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Create Dataset

## Download data

In [None]:
# The archive is stored as <current working directory>/Datasets/Original/data.zip.
dataset_folder = os.path.join(os.getcwd(), "Datasets", "Original")

# exist_ok makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(dataset_folder, exist_ok=True)

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "data.zip")

if not os.path.exists(dataset_path):
    # Download under a temporary name and rename only on success. Otherwise an
    # interrupted download would leave a truncated data.zip behind, which the
    # existence check above would mistake for a complete archive on the next run.
    partial_path = dataset_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, dataset_path)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print("Successful download")

## Create Dataframe

In [None]:
# File-id ranges (start inclusive, end exclusive) of the splits. Only the
# validation and test ranges are checked below; every other file id is
# labelled 'train'.
train_range = (1, 101)
val_range = (101, 151)
test_range = (151, 200)

# True: one dataframe row per blank-line-separated sentence.
# False: one dataframe row per file.
split_sentences = True

dataframe_rows = []
with ZipFile(dataset_path, 'r') as myzip:
    # The first archive entry is skipped; presumably it is the archive's
    # top-level directory entry - TODO confirm.
    for filename in myzip.namelist()[1:]:
        print("Extracting", filename, end='\r')

        # The four characters before the first '.' are parsed as the file id.
        file_id = int(filename.split('.')[0][-4:])

        if file_id in range(*val_range):
            split = 'val'
        elif file_id in range(*test_range):
            split = 'test'
        else:
            split = 'train'

        with myzip.open(filename) as myfile:
            content_string = myfile.read().decode('utf-8')

        sentences = content_string.split('\n\n') if split_sentences else [content_string]

        for sentence in sentences:
            # Keep only the lines made of exactly three tab-separated fields.
            fields_per_line = (line.split('\t') for line in sentence.split('\n'))
            content = [fields for fields in fields_per_line if len(fields) == 3]

            # A chunk without any valid line (e.g. produced by trailing or
            # repeated blank lines) would make the unpacking below raise
            # ValueError, so it is skipped instead of creating a row.
            if not content:
                continue

            # First field: word, second field: tag, third field: unused.
            words, tags, _ = zip(*content)

            dataframe_rows.append({'file_id': file_id,
                                   'text': words,
                                   'tags': tags,
                                   'split': split
                                   })

df = pd.DataFrame(dataframe_rows).sort_values('file_id').reset_index(drop=True)
# Padded so that the message overwrites the '\r' progress line.
print("Dataframe created.".ljust(50))

df

## Preprocessing

Convert to lowercase

In [None]:
# Lowercase every token of every sentence; each 'text' entry becomes a list.
df['text'] = df['text'].apply(lambda tokens: [token.lower() for token in tokens])
df

## Data Splitting

In [None]:
# Select the rows of each split through the 'split' column.
train_data = df.loc[df['split'] == 'train']
val_data = df.loc[df['split'] == 'val']
test_data = df.loc[df['split'] == 'test']

# Inputs (token sequences) and targets (tag sequences) as NumPy arrays.
x_train, y_train = train_data['text'].to_numpy(), train_data['tags'].to_numpy()
x_val, y_val = val_data['text'].to_numpy(), val_data['tags'].to_numpy()
x_test, y_test = test_data['text'].to_numpy(), test_data['tags'].to_numpy()

print('Dataset splits statistics: ')
for split_label, split_x in (('Train', x_train), ('Validation', x_val), ('Test', x_test)):
    print(f'{split_label} data: {split_x.shape}')


## Add OOV words to GloVe embeddings

In [None]:
from utils.preprocessing import load_embedding_model, add_OOV_embeddings

# Load the 'glove' embedding model through the project helper.
print("Loading GloVe embedding.")
# Dimension passed to the loader; later cells reuse this variable.
my_embedding_dimension = 50
my_embedding_model = load_embedding_model('glove', my_embedding_dimension)

In [None]:
print("GLOVE vocabulary size: ", len(my_embedding_model))

unknown_token = '[UNK]'
padding_token = ''

# The embedding model is extended in four steps (V1..V4): first the special
# tokens, then the words of the training, validation and test sets in turn.
print(f"Add unknown token {unknown_token} and padding token {padding_token}")
add_OOV_embeddings(my_embedding_model, [unknown_token, padding_token], my_embedding_dimension)
print("V1 size: ", len(my_embedding_model))

print("\nCreating V2 using training set (V1 + OOV1)")
add_OOV_embeddings(my_embedding_model, x_train, my_embedding_dimension)
print("V2 size: ", len(my_embedding_model))

print("\nCreating V3 using validation set (V2 + OOV2)")
add_OOV_embeddings(my_embedding_model, x_val, my_embedding_dimension)
print("V3 size: ", len(my_embedding_model))

# This step processes x_test, so the message names the test set.
print("\nCreating V4 using test set (V3 + OOV3)")
add_OOV_embeddings(my_embedding_model, x_test, my_embedding_dimension)
print("V4 size: ", len(my_embedding_model))

# build vocabulary for x: sorted unique words of all splits, with the unknown
# token prepended so that it sits at index 0
dataset_vocabulary = np.unique([word for sentence in df['text'] for word in sentence])
dataset_vocabulary = np.concatenate([[unknown_token], dataset_vocabulary])

# build vocabulary for y: unique tags of all splits, in order of first appearance
tag_vocabulary = pd.Series([tag for tags in df['tags'] for tag in tags]).unique()

## Explore Tags and define punctuation tags

In [None]:
# Display the array of distinct tags.
tag_vocabulary

Hmm, there are some interesting-looking tags. Let's build a DataFrame with an example sentence for each tag to see what they mean.

In [None]:
# Minimum number of rows of the example table. The table grows automatically
# when an example sentence is longer, because a negative pad width would make
# np.pad raise.
min_example_length = 90

# For every tag, take the first sentence in which the tag occurs.
examples = {}
for tag in tag_vocabulary:
    i = df.loc[np.array([tag in t for t in df['tags']])].index[0]
    examples[tag] = (df['text'][i], df['tags'][i])

example_length = max([min_example_length] + [len(words) for words, _ in examples.values()])

# Two columns per tag: the words '(w)' and the tags '(t)' of its example
# sentence, both padded at the end to the common column length.
d = {}
for tag, (s1, t1) in examples.items():
    d[f'{tag} (w)'] = np.pad(s1, (0, example_length - len(s1)))
    d[f'{tag} (t)'] = np.pad(t1, (0, example_length - len(t1)))

with open('phrase_examples.csv', 'w') as f:
    f.write(pd.DataFrame(d).to_csv())

After examining the resulting file, we know what the punctuation tags are. Let's put them in a list.

In [None]:
# Tags treated as punctuation; positions carrying these tags are filtered out
# by the code that consults this list.
punctuation_tags = [
    ',', '.',
    '``', "''",
    ':',
    '-LRB-', '-RRB-',
]

## Padding x

In [None]:
# The common sequence length is the 95th percentile of the sentence lengths,
# truncated to an integer.
sentence_lengths = df['tags'].apply(len)
padding_length = int(sentence_lengths.quantile(0.95))
print("The padding length is", padding_length)

# Word -> integer id; the id is the word's position in dataset_vocabulary.
dataset_dict = dict(zip(dataset_vocabulary, range(len(dataset_vocabulary))))

def tokenize_x(x):
    """Convert sentences of words into sentences of integer word ids.

    Ids are looked up in ``dataset_dict``. A word that is missing from the
    dictionary is mapped to the id of ``unknown_token`` instead of raising
    ``KeyError``, so sentences containing unseen words can still be tokenized.

    Args:
        x: iterable of sentences, each an iterable of word strings.

    Returns:
        A list with one list of integer ids per sentence.
    """
    unknown_id = dataset_dict[unknown_token]
    return [[dataset_dict.get(word, unknown_id) for word in phrase] for phrase in x]

# Words -> integer ids for every split.
x_train_tokenized = tokenize_x(x_train)
x_val_tokenized = tokenize_x(x_val)
x_test_tokenized = tokenize_x(x_test)

# Shorter sentences are padded at the end ("post") up to padding_length.
# NOTE(review): `truncating` is left at the Keras default ("pre"), so sentences
# longer than padding_length lose their FIRST tokens - confirm this is intended.
x_pad_options = dict(maxlen=padding_length, padding="post")

x_train_pad = tf.keras.preprocessing.sequence.pad_sequences(x_train_tokenized, **x_pad_options)
x_val_pad = tf.keras.preprocessing.sequence.pad_sequences(x_val_tokenized, **x_pad_options)
x_test_pad = tf.keras.preprocessing.sequence.pad_sequences(x_test_tokenized, **x_pad_options)

## Padding y

In [None]:
# Tag -> integer id; the id is the tag's position in tag_vocabulary.
tag_dict = dict(zip(tag_vocabulary, range(len(tag_vocabulary))))

def tokenize_y(y):
    """Convert sentences of tags into sentences of integer tag ids.

    Args:
        y: iterable of sentences, each an iterable of tag strings that are
            keys of ``tag_dict`` (an unknown tag raises ``KeyError``).

    Returns:
        A list with one list of integer ids per sentence.
    """
    tokenized = []
    for phrase in y:
        tokenized.append([tag_dict[tag] for tag in phrase])
    return tokenized

# Tags -> integer ids for every split.
y_train_tokenized = tokenize_y(y_train)
y_val_tokenized = tokenize_y(y_val)
y_test_tokenized = tokenize_y(y_test)

# Same padding settings as the inputs, so tags stay aligned with their words.
# NOTE(review): the default pad value 0 is also the id of the first tag in
# tag_dict, so padded positions are indistinguishable from that tag - confirm
# this is handled by the models (e.g. through masking).
y_pad_options = dict(maxlen=padding_length, padding="post")

y_train_pad = tf.keras.preprocessing.sequence.pad_sequences(y_train_tokenized, **y_pad_options)
y_val_pad = tf.keras.preprocessing.sequence.pad_sequences(y_val_tokenized, **y_pad_options)
y_test_pad = tf.keras.preprocessing.sequence.pad_sequences(y_test_tokenized, **y_pad_options)

# Train Models

In [None]:
from keras.optimizer_v2.adam import Adam
from keras.callbacks import ModelCheckpoint
from functools import partial

from utils.training_utils import MyHistory, plot_history
from models import embedding_layer

# Embedding-layer factory with vocabulary and embedding settings pre-bound;
# it is handed to every model builder as `embedding_func`.
embedding_func = partial(
    embedding_layer,
    vocabulary=dataset_vocabulary,
    embedding_model=my_embedding_model,
    embedding_dimension=my_embedding_dimension,
)

# `drive_folder` only exists when the Colab setup cell was executed;
# otherwise the weights are kept in a local "weights" folder.
try:
    weights_folder = os.path.join(drive_folder, "weights")
except NameError:
    weights_folder = "weights"

# Settings shared by all models: checkpoint callback factory and compile arguments.
checkpoint_partial = partial(ModelCheckpoint, mode="auto", monitor="val_loss")
compile_args = {"loss": "sparse_categorical_crossentropy", "metrics": ["acc"]}

input_shape = (padding_length, my_embedding_dimension)

## Baseline LSTM

In [None]:
from models import baselineLSTM

# Files in which the baseline model's weights and training history are kept.
run_dir = os.path.join(weights_folder, "baseline")
checkpoint_path = os.path.join(run_dir, "checkpoint.hdf5")
history_path = os.path.join(run_dir, "history.npy")

optimizer = Adam(learning_rate=1e-4)

checkpoint_callback = checkpoint_partial(filepath=checkpoint_path)
hist_callback = MyHistory(history_path)

model = baselineLSTM(
    num_classes=len(tag_vocabulary),
    input_shape=(padding_length,),
    embedding_func=embedding_func,
)
model.compile(optimizer=optimizer, **compile_args)

# Continue from the stored weights when a checkpoint already exists.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

fit_args = dict(
    x=x_train_pad,
    y=y_train_pad,
    batch_size=32,
    epochs=2,
    validation_data=(x_val_pad, y_val_pad),
    callbacks=[checkpoint_callback, hist_callback],
)
history = model.fit(**fit_args)

plot_history(history)

## GRU

In [None]:
from models import GRUModel

# Files in which the GRU model's weights and training history are kept.
run_dir = os.path.join(weights_folder, "gru")
checkpoint_path = os.path.join(run_dir, "checkpoint.hdf5")
history_path = os.path.join(run_dir, "history.npy")

optimizer = Adam(learning_rate=1e-4)

checkpoint_callback = checkpoint_partial(filepath=checkpoint_path)
hist_callback = MyHistory(history_path)

model = GRUModel(
    num_classes=len(tag_vocabulary),
    input_shape=(padding_length,),
    embedding_func=embedding_func,
)
model.compile(optimizer=optimizer, **compile_args)

# Continue from the stored weights when a checkpoint already exists.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

fit_args = dict(
    x=x_train_pad,
    y=y_train_pad,
    batch_size=32,
    epochs=2,
    validation_data=(x_val_pad, y_val_pad),
    callbacks=[checkpoint_callback, hist_callback],
)
history = model.fit(**fit_args)

plot_history(history)

## Additional LSTM layer

In [None]:
from models import additionalLSTM

# Files in which the additional-LSTM model's weights and training history are kept.
run_dir = os.path.join(weights_folder, "additionalLSTM")
checkpoint_path = os.path.join(run_dir, "checkpoint.hdf5")
history_path = os.path.join(run_dir, "history.npy")

optimizer = Adam(learning_rate=1e-4)

checkpoint_callback = checkpoint_partial(filepath=checkpoint_path)
hist_callback = MyHistory(history_path)

model = additionalLSTM(
    num_classes=len(tag_vocabulary),
    input_shape=(padding_length,),
    embedding_func=embedding_func,
)
model.compile(optimizer=optimizer, **compile_args)

# Continue from the stored weights when a checkpoint already exists.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

fit_args = dict(
    x=x_train_pad,
    y=y_train_pad,
    batch_size=32,
    epochs=2,
    validation_data=(x_val_pad, y_val_pad),
    callbacks=[checkpoint_callback, hist_callback],
)
history = model.fit(**fit_args)

plot_history(history)

## Additional Dense

In [None]:
from models import additionalDense

# Files in which the additional-dense model's weights and training history are kept.
run_dir = os.path.join(weights_folder, "additionalDense")
checkpoint_path = os.path.join(run_dir, "checkpoint.hdf5")
history_path = os.path.join(run_dir, "history.npy")

optimizer = Adam(learning_rate=1e-4)

checkpoint_callback = checkpoint_partial(filepath=checkpoint_path)
hist_callback = MyHistory(history_path)

model = additionalDense(
    num_classes=len(tag_vocabulary),
    input_shape=(padding_length,),
    embedding_func=embedding_func,
)
model.compile(optimizer=optimizer, **compile_args)

# Continue from the stored weights when a checkpoint already exists.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

fit_args = dict(
    x=x_train_pad,
    y=y_train_pad,
    batch_size=32,
    epochs=2,
    validation_data=(x_val_pad, y_val_pad),
    callbacks=[checkpoint_callback, hist_callback],
)
history = model.fit(**fit_args)

plot_history(history)

# Performance Analysis

In [398]:
def flatten_data(list_of_lists, max_length=None):
    """Concatenate sequences into one flat list, capping each sequence's length.

    At most ``max_length`` elements are kept per sequence, taken from the END
    of the sequence. The padded model inputs and targets are built with
    ``pad_sequences(..., maxlen=padding_length, padding="post")`` and the
    default ``truncating="pre"``, which drops the leading elements of
    over-long sequences. Keeping the trailing elements here keeps the
    flattened true tags aligned with the flattened model predictions.
    Sequences that are not longer than ``max_length`` are kept whole.

    Args:
        list_of_lists: iterable of sized, sliceable sequences.
        max_length: maximum number of elements kept per sequence; defaults to
            the notebook-level ``padding_length``.

    Returns:
        A flat list with the kept elements of all sequences, in order.
    """
    if max_length is None:
        max_length = padding_length
    # An explicit start index is used because sequence[-0:] would return the
    # whole sequence when max_length is 0.
    return [element
            for sequence in list_of_lists
            for element in sequence[max(len(sequence) - max_length, 0):]]

def remove_punctuation(y, y_true=None):
    """Drop the entries of ``y`` whose reference tag is a punctuation tag.

    Args:
        y: flat sequence of tags to filter.
        y_true: flat sequence of reference tags, paired position by position
            with ``y``; when omitted, ``y`` is used as its own reference.

    Returns:
        A list with the entries of ``y`` whose paired reference tag is not in
        ``punctuation_tags``.
    """
    reference = y if y_true is None else y_true
    kept = []
    for tag, true_tag in zip(y, reference):
        if true_tag in punctuation_tags:
            continue
        kept.append(tag)
    return kept

# One flat list of words per split.
x_train_flattened, x_val_flattened, x_test_flattened = (
    flatten_data(sentences) for sentences in (x_train, x_val, x_test)
)

# One flat list of true tags per split.
y_train_flattened, y_val_flattened, y_test_flattened = (
    flatten_data(sentences) for sentences in (y_train, y_val, y_test)
)

# True tags with the punctuation tags filtered out.
y_train_cleaned, y_val_cleaned, y_test_cleaned = (
    remove_punctuation(tags) for tags in (y_train_flattened, y_val_flattened, y_test_flattened)
)

## Dummy Classifiers

In [None]:
from sklearn.dummy import DummyClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

In [None]:
# Two baselines fitted on the flattened training words and tags; `fit`
# returns the estimator itself, so construction and fitting are chained.
majority_classifier = DummyClassifier(strategy="prior").fit(x_train_flattened, y_train_flattened)
stratified_classifier = DummyClassifier(strategy="stratified").fit(x_train_flattened, y_train_flattened)

## Trained models

In [None]:
def unpad_result(y_padded, y_true):
    """Cut every padded sequence down to the length of its true sequence.

    Args:
        y_padded: iterable of sliceable sequences.
        y_true: iterable of sized sequences, paired with ``y_padded`` in order.

    Returns:
        A list holding, for each pair, the first ``len(true)`` elements of the
        padded sequence.
    """
    trimmed = []
    for padded_row, true_row in zip(y_padded, y_true):
        trimmed.append(padded_row[:len(true_row)])
    return trimmed

def get_model_prediction(model, x, y_true):
    """Predict tags with ``model`` and return them as one flat list.

    The index with the highest score along the last axis of the model output
    is looked up in ``tag_vocabulary``. Each predicted sequence is then cut to
    the length of its true sequence and the results are flattened.

    Args:
        model: object exposing ``predict``.
        x: model input (assumed to be the padded id sequences - TODO confirm).
        y_true: true tag sequences, used only for their lengths.

    Returns:
        Flat list of predicted tags.
    """
    class_scores = model.predict(x)
    predicted_ids = np.argmax(class_scores, axis=-1)
    predicted_tags = tag_vocabulary[predicted_ids]
    return flatten_data(unpad_result(predicted_tags, y_true))

In [None]:
from models import baselineLSTM, GRUModel, additionalLSTM, additionalDense

# Every architecture is rebuilt with the same constructor arguments and then
# receives the weights stored in its own checkpoint file.
model_kwargs = dict(
    num_classes=len(tag_vocabulary),
    input_shape=(padding_length,),
    embedding_func=embedding_func,
)

baseline = baselineLSTM(**model_kwargs)
baseline.load_weights(os.path.join(weights_folder, "baseline", "checkpoint.hdf5"))

gru = GRUModel(**model_kwargs)
gru.load_weights(os.path.join(weights_folder, "gru", "checkpoint.hdf5"))

additional_LSTM = additionalLSTM(**model_kwargs)
additional_LSTM.load_weights(os.path.join(weights_folder, "additionalLSTM", "checkpoint.hdf5"))

additional_dense = additionalDense(**model_kwargs)
additional_dense.load_weights(os.path.join(weights_folder, "additionalDense", "checkpoint.hdf5"))

## Build dictionary with prediction info
Fill in this list of dictionaries (one per model) to get all the statistics.

In [None]:
# Per-split prediction helpers with inputs and true tags already bound.
model_prediction_train = partial(get_model_prediction, x=x_train_pad, y_true=y_train)
model_prediction_val = partial(get_model_prediction, x=x_val_pad, y_true=y_val)
model_prediction_test = partial(get_model_prediction, x=x_test_pad, y_true=y_test)

# Per-split punctuation filters driven by the true tags of the split.
remove_punctuation_train = partial(remove_punctuation, y_true=y_train_flattened)
remove_punctuation_val = partial(remove_punctuation, y_true=y_val_flattened)
remove_punctuation_test = partial(remove_punctuation, y_true=y_test_flattened)


def _classifier_entry(model_label, classifier):
    """Prediction record of a fitted dummy classifier (train, val, test in that order)."""
    return {
        'model_label': model_label,
        'y_pred_train': remove_punctuation_train(classifier.predict(x_train_flattened)),
        'y_pred_val': remove_punctuation_val(classifier.predict(x_val_flattened)),
        'y_pred_test': remove_punctuation_test(classifier.predict(x_test_flattened)),
    }


def _network_entry(model_label, network):
    """Prediction record of a trained network (train, val, test in that order)."""
    return {
        'model_label': model_label,
        'y_pred_train': remove_punctuation_train(model_prediction_train(model=network)),
        'y_pred_val': remove_punctuation_val(model_prediction_val(model=network)),
        'y_pred_test': remove_punctuation_test(model_prediction_test(model=network)),
    }


# One record per model, each holding the punctuation-free predictions of every split.
prediction_data = [
    _classifier_entry('maj', majority_classifier),
    _classifier_entry('stratified', stratified_classifier),
    _network_entry('baseline', baseline),
    _network_entry('gru', gru),
    _network_entry('additionalDense', additional_dense),
    _network_entry('additionalLSTM', additional_LSTM),
]

## Show analysis

Compute F1 score

In [None]:
import matplotlib.pyplot as plt

def analyze(y_true, y_pred, output_mode=0, model_label=None):
    """Score predicted tags against the true tags.

    The F1 value is read from the 'weighted avg' row of scikit-learn's
    classification report.
    NOTE(review): if the macro-averaged F1 was intended, read the 'macro avg'
    row instead - confirm which metric is required.

    Args:
        y_true: flat sequence of true tags.
        y_pred: flat sequence of predicted tags, aligned with ``y_true``.
        output_mode: 0 prints nothing, >= 1 prints the F1 score, >= 2 also
            shows the confusion matrix plot.
        model_label: text printed in front of the F1 score.

    Returns:
        Tuple ``(confusion_matrix, f1)``.
    """
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    f1 = report['weighted avg']['f1-score']
    if output_mode >= 1:
        # The label names the report row the value is read from.
        print(f"{model_label}, weighted avg F1: {f1:.2f}")

    if output_mode >= 2:
        print("Confusion matrix")
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap='Blues')
        plt.show()
    return confusion_matrix(y_true, y_pred), f1

output_mode = 1

# True (punctuation-free) tags of each split, keyed by the split suffix used
# in the prediction records.
true_tags_by_split = {'train': y_train_cleaned, 'val': y_val_cleaned, 'test': y_test_cleaned}

# Store the confusion matrix and F1 score of every model/split pair in the
# model's record.
for data in prediction_data:
    for label, y in true_tags_by_split.items():
        data[f'cm_{label}'], data[f'f1_{label}'] = analyze(
            y,
            data[f'y_pred_{label}'],
            output_mode=output_mode,
            model_label=f"{data['model_label']} {label}",
        )

# Per-split F1 scores and model names, in the order of prediction_data.
f1_train = [entry['f1_train'] for entry in prediction_data]
f1_val = [entry['f1_val'] for entry in prediction_data]
f1_test = [entry['f1_test'] for entry in prediction_data]

model_labels = [entry['model_label'] for entry in prediction_data]

Bar plot of F1 score

In [None]:
# Figure size in pixels, converted to inches through the dpi.
w, h, dpi = 1280, 720, 100
fig, ax = plt.subplots(figsize=(w/dpi, h/dpi), dpi=dpi)

# One bar per model showing its F1 score on the test split.
ax.bar(model_labels, height=f1_test, width=0.8)
# Title and axis labels let the figure be read without the surrounding cells.
ax.set_xlabel("Model")
ax.set_ylabel("F1 score")
ax.set_title("F1 score on the test set (punctuation excluded)")
plt.show()