In [20]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import urllib.request
import gensim
import gensim.downloader as gloader


from zipfile import ZipFile
from collections import OrderedDict
from typing import List, Callable, Dict
from tqdm import tqdm

random_seed = 42
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
#os.environ['TF_DETERMINISTIC_OPS'] = '1'

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
gensim.__version__

'4.1.2'

# Create Dataset

## Download data

In [21]:
dataset_folder = os.path.join(os.getcwd(), "Datasets", "Original")

if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "data.zip")

if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, dataset_path)
    print("Successful download")

## Create Dataframe

In [22]:
train_range = (1, 101)
val_range = (101, 151)
test_range = (151, 200)

split_sentences = True

dataframe_rows = []
with ZipFile(dataset_path, 'r') as myzip:
    for i, filename in enumerate(myzip.namelist()[1:]):
        print("Extracting", filename, end='\r')

        with myzip.open(filename) as myfile:
            file_id = int(filename.split('.')[0][-4:])

            split = 'train'
            if file_id in range(*val_range):
                split = 'val'
            elif file_id in range(*test_range):
                split = 'test'

            content_string = myfile.read().decode('utf-8')
            if split_sentences:
                sentences = content_string.split('\n\n')
            else:
                sentences = [content_string]

            for sentence in sentences:
                content = sentence.split('\n')
                content = [line.split('\t') for line in content if len(line.split('\t')) == 3]

                words, tags, _ = zip(*content)

                dataframe_rows.append({'file_id': file_id,
                                       'text': words,
                                       'tags': tags,
                                       'split': split
                                       })

df = pd.DataFrame(dataframe_rows).sort_values('file_id').reset_index(drop=True)
print("Dataframe created.".ljust(50))

df

Dataframe created.                                


Unnamed: 0,file_id,text,tags,split
0,1,"(Pierre, Vinken, ,, 61, years, old, ,, will, j...","(NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...",train
1,1,"(Mr., Vinken, is, chairman, of, Elsevier, N.V....","(NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...",train
2,2,"(Rudolph, Agnew, ,, 55, years, old, and, forme...","(NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...",train
3,3,"(``, There, 's, no, question, that, some, of, ...","(``, EX, VBZ, DT, NN, IN, DT, IN, DT, NNS, CC,...",train
4,3,"(Workers, described, ``, clouds, of, blue, dus...","(NNS, VBD, ``, NNS, IN, JJ, NN, '', WDT, VBD, ...",train
...,...,...,...,...
3909,198,"(A, line-item, veto, is, a, procedure, that, w...","(DT, JJ, NN, VBZ, DT, NN, WDT, MD, VB, DT, NN,...",test
3910,198,"(Sen., Kennedy, said, in, a, separate, stateme...","(NNP, NNP, VBD, IN, DT, JJ, NN, IN, PRP, VBZ, ...",test
3911,199,"(Trinity, Industries, Inc., said, it, reached,...","(NNP, NNPS, NNP, VBD, PRP, VBD, DT, JJ, NN, TO...",test
3912,199,"(Terms, were, n't, disclosed, .)","(NNS, VBD, RB, VBN, .)",test


## Preprocessing

Convert to lowercase

In [6]:
df['text'] = df['text'].apply(lambda l: [element.lower() for element in l])
df

Unnamed: 0,file_id,text,tags,split
0,1,"[pierre, vinken, ,, 61, years, old, ,, will, j...","(NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...",train
1,1,"[mr., vinken, is, chairman, of, elsevier, n.v....","(NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...",train
2,2,"[rudolph, agnew, ,, 55, years, old, and, forme...","(NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...",train
3,3,"[``, there, 's, no, question, that, some, of, ...","(``, EX, VBZ, DT, NN, IN, DT, IN, DT, NNS, CC,...",train
4,3,"[workers, described, ``, clouds, of, blue, dus...","(NNS, VBD, ``, NNS, IN, JJ, NN, '', WDT, VBD, ...",train
...,...,...,...,...
3909,198,"[a, line-item, veto, is, a, procedure, that, w...","(DT, JJ, NN, VBZ, DT, NN, WDT, MD, VB, DT, NN,...",test
3910,198,"[sen., kennedy, said, in, a, separate, stateme...","(NNP, NNP, VBD, IN, DT, JJ, NN, IN, PRP, VBZ, ...",test
3911,199,"[trinity, industries, inc., said, it, reached,...","(NNP, NNPS, NNP, VBD, PRP, VBD, DT, JJ, NN, TO...",test
3912,199,"[terms, were, n't, disclosed, .]","(NNS, VBD, RB, VBN, .)",test


## Data Splitting

In [7]:
train_data = df[df['split'] == 'train']
val_data = df[df['split'] == 'val']
test_data = df[df['split'] == 'test']

x_train = train_data['text'].values
y_train = train_data['tags'].values

x_val = val_data['text'].values
y_val = val_data['tags'].values

x_test = test_data['text'].values
y_test = test_data['tags'].values

print('Dataset splits statistics: ')
print(f'Train data: {x_train.shape}')
print(f'Validation data: {x_val.shape}')
print(f'Test data: {x_test.shape}')


Dataset splits statistics: 
Train data: (1963,)
Validation data: (1299,)
Test data: (652,)


## Add OOV words to GloVe embeddings

In [8]:
from utils.preprocessing import load_embedding_model, check_OOV_terms, get_OOV_embedding, add_OOV_embeddings

print("Loading GloVe embedding.")
my_embedding_dimension = 50
my_embedding_model = load_embedding_model('glove', my_embedding_dimension)

Loading GloVe embedding.


In [9]:
print("GLOVE vocabulary size: ", len(my_embedding_model))

unknown_token = '[UNK]'
padding_token = ''

print(f"Add unknown token {unknown_token} and padding token {padding_token}")
add_OOV_embeddings(my_embedding_model, [unknown_token, padding_token], my_embedding_dimension)
print("V1 size: ", len(my_embedding_model))

print("\nCreating V2 using training set (V1 + OOV1)")
add_OOV_embeddings(my_embedding_model, x_train, my_embedding_dimension)
print("V2 size: ", len(my_embedding_model))

print("\nCreating V3 using validation set (V2 + OOV2)")
add_OOV_embeddings(my_embedding_model, x_val, my_embedding_dimension)
print("V3 size: ", len(my_embedding_model))

print("\nCreating V4 using validation set (V3 + OOV3)")
add_OOV_embeddings(my_embedding_model, x_test, my_embedding_dimension)
print("V4 size: ", len(my_embedding_model))

# build vocabulary for x
dataset_vocabulary = np.unique([word for sentence in df['text'] for word in sentence])
dataset_vocabulary = np.concatenate([[unknown_token], dataset_vocabulary])

# build vocabulary for y
tags_s = ' '.join([' '.join(y) for y in df['tags']])
tag_vocabulary = pd.DataFrame(tags_s.split())[0].unique()

GLOVE vocabulary size:  400000
Add unknown token [UNK] and padding token 


100%|██████████| 2/2 [00:00<00:00, 17.41it/s]

V1 size:  400002

Creating V2 using training set (V1 + OOV1)



100%|██████████| 359/359 [00:13<00:00, 26.20it/s]


V2 size:  400361

Creating V3 using validation set (V2 + OOV2)


100%|██████████| 189/189 [00:07<00:00, 24.56it/s]


V3 size:  400550

Creating V4 using validation set (V3 + OOV3)


100%|██████████| 128/128 [00:07<00:00, 17.38it/s]


V4 size:  400678


## Padding x

In [10]:
padding_length = int(df['tags'].apply(lambda x: len(x)).quantile(0.95))
print("The padding length is", padding_length)

dataset_dict = {k: i for i, k in enumerate(dataset_vocabulary)}

def tokenize_x(x):
    return [[dataset_dict[word] for word in phrase] for phrase in x]

x_train_tokenized = tokenize_x(x_train)
x_val_tokenized = tokenize_x(x_val)
x_test_tokenized = tokenize_x(x_test)

x_train_pad = tf.keras.preprocessing.sequence.pad_sequences(x_train_tokenized, maxlen=padding_length, padding="post")
x_val_pad = tf.keras.preprocessing.sequence.pad_sequences(x_val_tokenized, maxlen=padding_length, padding="post")
x_test_pad = tf.keras.preprocessing.sequence.pad_sequences(x_test_tokenized, maxlen=padding_length, padding="post")

The padding length is 44


## Padding y

In [11]:
tag_dict = {k: i for i, k in enumerate(tag_vocabulary)}

def tokenize_y(y):
    return [[tag_dict[tag] for tag in phrase] for phrase in y]

y_train_tokenized = tokenize_y(y_train)
y_val_tokenized = tokenize_y(y_val)
y_test_tokenized = tokenize_y(y_test)

y_train_pad = tf.keras.preprocessing.sequence.pad_sequences(y_train_tokenized, maxlen=padding_length, padding="post")
y_val_pad = tf.keras.preprocessing.sequence.pad_sequences(y_val_tokenized, maxlen=padding_length, padding="post")
y_test_pad = tf.keras.preprocessing.sequence.pad_sequences(y_test_tokenized, maxlen=padding_length, padding="post")

# Train Models

In [23]:
from keras.optimizer.adam import Adam
from keras.callbacks import ModelCheckpoint
from functools import partial

from utils.training_utils import MyHistory, plot_history
from models import embedding_layer

embedding_func = partial(embedding_layer,
                         vocabulary=dataset_vocabulary,
                         embedding_model=my_embedding_model,
                         embedding_dimension=my_embedding_dimension)

weights_folder = "weights"

checkpoint_partial = partial(ModelCheckpoint, monitor="val_loss", mode="auto")#, save_format="tf")
compile_args = dict(loss="sparse_categorical_crossentropy", metrics=["acc"])

input_shape = (padding_length, my_embedding_dimension)

ModuleNotFoundError: No module named 'keras.optimizer'

## Baseline LSTM

In [None]:
from models import baselineLSTM

optimizer = Adam(learning_rate=1e-4)

checkpoint_path = os.path.join(weights_folder, "baseline", "checkpoint.hdf5")
history_path = os.path.join(weights_folder, "baseline", "history.npy")

checkpoint_callback = checkpoint_partial(filepath = checkpoint_path)
hist_callback = MyHistory(history_path)

model = baselineLSTM(num_classes=len(tag_vocabulary), input_shape=(padding_length,), embedding_func=embedding_func)
model.compile(**compile_args, optimizer=optimizer)

if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

history = model.fit(x=x_train_pad,
                    y=y_train_pad,
                    batch_size=256,
                    epochs=500,
                    validation_data=(x_val_pad,
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])

plot_history(history)

## GRU

In [None]:
from models import GRUModel

optimizer = Adam(learning_rate=1e-4)

checkpoint_path = os.path.join(weights_folder, "gru", "checkpoint.hdf5")
history_path = os.path.join(weights_folder, "gru", "history.npy")

checkpoint_callback = checkpoint_partial(filepath = checkpoint_path)
hist_callback = MyHistory(history_path)

model = GRUModel(num_classes=len(tag_vocabulary), input_shape=(padding_length,), embedding_func=embedding_func)
model.compile(**compile_args, optimizer=optimizer)

if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

history = model.fit(x=x_train_pad,
                    y=y_train_pad,
                    batch_size=256,
                    epochs=500,
                    validation_data=(x_val_pad,
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])

plot_history(history)

## Additional LSTM layer

In [None]:
from models import additionalLSTM

optimizer = Adam(learning_rate=1e-4)

checkpoint_path = os.path.join(weights_folder, "additionalLSTM", "checkpoint.hdf5")
history_path = os.path.join(weights_folder, "additionalLSTM", "history.npy")

checkpoint_callback = checkpoint_partial(filepath = checkpoint_path)
hist_callback = MyHistory(history_path)

model = additionalLSTM(num_classes=len(tag_vocabulary), input_shape=(padding_length,), embedding_func=embedding_func)
model.compile(**compile_args, optimizer=optimizer)

if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

history = model.fit(x=x_train_pad,
                    y=y_train_pad,
                    batch_size=256,
                    epochs=500,
                    validation_data=(x_val_pad,
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])

plot_history(history)

## Additional Dense

In [None]:
from models import additionalDense

optimizer = Adam(learning_rate=1e-2)

checkpoint_path = os.path.join(weights_folder, "additionalDense", "checkpoint.hdf5")
history_path = os.path.join(weights_folder, "additionalDense", "history.npy")

checkpoint_callback = checkpoint_partial(filepath = checkpoint_path)
hist_callback = MyHistory(history_path)

model = additionalDense(num_classes=len(tag_vocabulary), input_shape=(padding_length,), embedding_func=embedding_func)
model.compile(**compile_args, optimizer=optimizer)

if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

history = model.fit(x=x_train_pad,
                    y=y_train_pad,
                    batch_size=256,
                    epochs=500,
                    validation_data=(x_val_pad,
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])

plot_history(history)

# Performance Analysis

## Dummy Classifiers

In [19]:
from sklearn.dummy import DummyClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

ModuleNotFoundError: No module named 'sklearn'

In [None]:
majority_classifier = DummyClassifier(strategy="prior")
stratified_classifier = DummyClassifier(strategy="stratified")

majority_classifier.fit(x_train, y_train)
stratified_classifier.fit(x_train, y_train)

y_pred_train_maj = majority_classifier.predict(x_train)
y_pred_test_maj = majority_classifier.predict(x_test)
y_pred_train_st = stratified_classifier.predict(x_train)
y_pred_test_st = stratified_classifier.predict(x_test)

## Trained models

In [None]:
# to be filled when the models are trained

## Build dictionary with prediction info
fill in this dictionary to get all the statistics

In [None]:
prediction_data = [{
    'model_label': 'maj',
    'y_pred_train': majority_classifier.predict(x_train),
    'y_pred_val': majority_classifier.predict(x_val),
    'y_pred_test': majority_classifier.predict(x_test)
    }, {
    'model_label': 'st',
    'y_pred_train': stratified_classifier.predict(x_train),
    'y_pred_val': stratified_classifier.predict(x_val),
    'y_pred_test': majority_classifier.predict(x_test)
    }
]

## Show analysis

Compute F1 score

In [None]:
import matplotlib.pyplot as plt

def analyze(y_true, y_pred, output_mode=0, model_label=None):
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    f1 = report['weighted avg']['f1-score']
    if output_mode >= 1:
        print(f"{model_label}, weighted F1 macro: {report['weighted avg']['f1-score']:.2f}")

    if output_mode >= 2:
        print("Confusion matrix")
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap='Blues')
        plt.show()
    return f1

output_mode=2
for data in prediction_data:
    data['f1_train'] = analyze(y_train, data['y_pred_train'], output_mode=output_mode, model_label=f"{data['model_label']} train")
    data['f1_val'] = analyze(y_val, data['y_pred_val'], output_mode=output_mode, model_label=f"{data['model_label']} validation")
    data['f1_test'] = analyze(y_test, data['y_pred_test'], output_mode=output_mode, model_label=f"{data['model_label']} test")

f1_train = [data['f1_train'] for data in prediction_data]
f1_val = [data['f1_val'] for data in prediction_data]
f1_test = [data['f1_test'] for data in prediction_data]

model_labels = [data['model_label'] for data in prediction_data]

Bar plot of F1 score

In [None]:
w, h, dpi = 1280, 720, 100
fig, ax = plt.subplots(figsize=(w/dpi, h/dpi), dpi=dpi)

ax.bar(model_labels, height=f1_train, width=0.8)
plt.show()