In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import urllib.request
import gensim
import gensim.downloader as gloader


from zipfile import ZipFile
from collections import OrderedDict
from typing import List, Callable, Dict
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

# Create Dataset

## Download data

In [None]:
# Local folder that holds the raw, unmodified corpus archive.
dataset_folder = os.path.join(os.getcwd(), "Datasets", "Original")
os.makedirs(dataset_folder, exist_ok=True)

url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"

dataset_path = os.path.join(dataset_folder, "data.zip")

if not os.path.exists(dataset_path):
    # Download to a temporary name first and rename only on success: an
    # interrupted transfer must not leave a truncated "data.zip" behind,
    # because the existence check above would then skip the download forever.
    partial_download_path = dataset_path + ".part"
    urllib.request.urlretrieve(url, partial_download_path)
    os.replace(partial_download_path, dataset_path)
    print("Successful download")

## Create Dataframe

In [None]:
# Document-id ranges (upper bound exclusive) that define the splits.
# Any document that is neither in the validation nor in the test range is
# labelled 'train'.
train_range = (1, 101)
val_range = (101, 151)
test_range = (151, 200)

# True: one dataframe row per sentence; False: one row per document.
split_sentences = True

dataframe_rows = []
with ZipFile(dataset_path, 'r') as myzip:
    for filename in myzip.namelist():
        # Skip directory entries explicitly instead of assuming that the
        # directory is always the first name in the archive listing.
        if filename.endswith('/'):
            continue

        print("Extracting", filename, end='\r')

        # The document id is the last four characters of the name before the extension.
        file_id = int(filename.split('.')[0][-4:])

        split = 'train'
        if file_id in range(*val_range):
            split = 'val'
        elif file_id in range(*test_range):
            split = 'test'

        with myzip.open(filename) as myfile:
            content_string = myfile.read().decode('utf-8')

        if split_sentences:
            # Sentences are separated by an empty line.
            sentences = content_string.split('\n\n')
        else:
            sentences = [content_string]

        for sentence in sentences:
            # Keep only well-formed lines made of exactly three tab-separated fields.
            fields_per_line = (line.split('\t') for line in sentence.split('\n'))
            content = [fields for fields in fields_per_line if len(fields) == 3]

            # A chunk without any valid line (e.g. trailing blank lines at the
            # end of a file) would make the unpacking below raise ValueError.
            if not content:
                continue

            words, tags, _ = zip(*content)

            dataframe_rows.append({'file_id': file_id,
                                   'text': ' '.join(words),
                                   'tags': tags,
                                   'split': split
                                   })

# A stable sort keeps the sentences of each document in their original order.
df = pd.DataFrame(dataframe_rows).sort_values('file_id', kind='stable').reset_index(drop=True)
print("Dataframe created.".ljust(50))

df

## Preprocessing

Convert to lowercase

In [None]:
# Lowercase every sentence using pandas' vectorised string accessor.
df['text'] = df['text'].str.lower()
df

## Data Splitting

In [None]:
# Slice the dataframe according to the label stored in its 'split' column.
split_column = df['split']
train_data, val_data, test_data = (
    df.loc[split_column.eq(split_name)] for split_name in ('train', 'val', 'test')
)

# x_* hold the sentence strings, y_* the matching tag sequences.
x_train, y_train = train_data['text'].to_numpy(), train_data['tags'].to_numpy()
x_val, y_val = val_data['text'].to_numpy(), val_data['tags'].to_numpy()
x_test, y_test = test_data['text'].to_numpy(), test_data['tags'].to_numpy()

print('Dataset splits statistics: ')
for split_label, split_texts in (('Train', x_train), ('Validation', x_val), ('Test', x_test)):
    print(f'{split_label} data: {split_texts.shape}')


## Apply GloVe embeddings and Tokenization

In [None]:
def get_oov_embedding(word, embedding_model, size):
    """Return a random embedding vector for an out-of-vocabulary word.

    The vector is sampled uniformly from [-0.05, 0.05) using NumPy's global
    random state. `word` and `embedding_model` are currently unused; they are
    part of the signature so a more sophisticated strategy can be swapped in.

    :param word: the out-of-vocabulary term (unused).
    :param embedding_model: the embedding model being extended (unused).
    :param size: output shape passed to the sampler.
    :return: NumPy array of shape `size`.
    """
    lower_bound, upper_bound = -0.05, 0.05
    return np.random.uniform(lower_bound, upper_bound, size)

In [None]:
from utils.kerasTokenizer import check_OOV_terms, load_embedding_model

# Dimensionality of the embedding vectors requested from the loader.
my_embedding_dimension = 50

print("Loading GloVe embedding.")
my_embedding_model = load_embedding_model('glove', my_embedding_dimension)

In [None]:
print("GLOVE vocabulary (V1) size: ", len(my_embedding_model))

# filters='' disables the tokenizer's default punctuation stripping. Each
# sentence was built by joining its words with single spaces and has exactly
# one tag per word, so the tokenizer must split on spaces only; with the
# default filters, tokens such as "," or "." are dropped and tokens such as
# "u.s." are broken up, and the token sequences stop matching the tag sequences.
x_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='[UNK]', filters='')

# The vocabulary is grown one split at a time: after each stage every term
# known to the tokenizer but missing from the embedding model receives an
# out-of-vocabulary vector.
vocabulary_stages = (
    ("V2", "training set (V1 + OOV1)", x_train),
    ("V3", "validation set (V2 + OOV2)", x_val),
    ("V4", "test set (V3 + OOV3)", x_test),
)

for stage_name, stage_description, stage_texts in vocabulary_stages:
    print(f"Creating {stage_name} using {stage_description}")
    x_tokenizer.fit_on_texts(stage_texts)
    for word in tqdm(check_OOV_terms(my_embedding_model, x_tokenizer.word_index.keys())):
        my_embedding_model[word] = get_oov_embedding(word=word,
                                                     embedding_model=my_embedding_model,
                                                     size=my_embedding_dimension)

    print(f"{stage_name} size: ", len(my_embedding_model))
    print()

In [None]:
# small variant, using x_tokenizer for indexing
#
# Layout: the vector of the word with tokenizer index k (indices start at 1)
# is stored in row k - 1, which is what the `embedding_matrix[ids - 1]`
# lookups in the training cells expect.
# One extra all-zero row is appended at the end on purpose: padded positions
# carry index 0, so `ids - 1` turns them into -1, i.e. the LAST row. Without
# the extra row, padding would silently receive the embedding of the last
# vocabulary word instead of a neutral zero vector.
embedding_matrix = np.zeros((len(x_tokenizer.word_index) + 1, my_embedding_dimension))
for word, word_id in x_tokenizer.word_index.items():
    # Place each vector by its tokenizer index rather than by dict iteration order.
    embedding_matrix[word_id - 1] = my_embedding_model.get_vector(word)


# large variant, using embedding model for indexing
# embedding_matrix = np.zeros((len(my_embedding_model), my_embedding_dimension))
# for word, i in my_embedding_model.key_to_index.items():
#     embedding_matrix[i] = my_embedding_model.get_vector(word)


In [None]:
# Flatten every tag of every sentence into one whitespace-separated string,
# then split it back into a flat list of tags.
sentence_tag_strings = [' '.join(sentence_tags) for sentence_tags in df['tags']]
tags_s = ' '.join(sentence_tag_strings)
all_tags = tags_s.split()

# Distinct tags in order of first appearance (NumPy array, so it can later be
# indexed with an array of predicted ids).
tag_types = pd.Series(all_tags).unique()

# tag -> integer id, and the inverse lookup id -> tag.
tag_vocab = dict(zip(tag_types, range(len(tag_types))))
vocab_to_tag = {tag_id: tag for tag, tag_id in tag_vocab.items()}

## Padding x

In [None]:
# Padding the whole corpus without an explicit length pads every row to the
# longest sequence, so the width of any row gives the common length.
corpus_sequences = x_tokenizer.texts_to_sequences(df['text'])
corpus_padded = tf.keras.preprocessing.sequence.pad_sequences(corpus_sequences, padding="post")
maxlen = corpus_padded[0].size


def _encode_and_pad(texts):
    """Convert sentences to index sequences and right-pad them to `maxlen`."""
    sequences = x_tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen, padding="post")


x_train_pad = _encode_and_pad(x_train)
x_val_pad = _encode_and_pad(x_val)
x_test_pad = _encode_and_pad(x_test)

## Padding y

In [None]:
def tokenize_y(y):
    """Map every tag of every phrase in `y` to its integer id from `tag_vocab`.

    :param y: iterable of phrases, each phrase being an iterable of tags.
    :return: list with one list of integer ids per phrase.
    :raises KeyError: if a tag is not present in `tag_vocab`.
    """
    encoded_phrases = []
    for phrase in y:
        encoded_phrases.append([tag_vocab[tag] for tag in phrase])
    return encoded_phrases

# Encode the tag sequences of every split as integer ids.
y_train_tokenized, y_val_tokenized, y_test_tokenized = (
    tokenize_y(split_tags) for split_tags in (y_train, y_val, y_test)
)

# Right-pad the label sequences to the same length as the inputs.
# NOTE(review): no pad value is passed, so the library default (0) is used;
# tag_vocab also starts numbering tags at 0, so padded positions look like the
# first tag to the loss and the accuracy metric. Confirm this is intended.
y_train_pad, y_val_pad, y_test_pad = (
    tf.keras.preprocessing.sequence.pad_sequences(tokenized, maxlen=maxlen, padding="post")
    for tokenized in (y_train_tokenized, y_val_tokenized, y_test_tokenized)
)

# Train Models

In [None]:
from keras.optimizer_v2.adam import Adam
from keras.callbacks import ModelCheckpoint
from utils.training_utils import MyHistory

from functools import partial

In [None]:
# Root folder for checkpoints and training histories of all models.
weights_folder = "weights"

# Each sample is a padded sentence: `maxlen` positions, one embedding vector each.
input_shape = (maxlen, my_embedding_dimension)

# Arguments shared by every model.compile(...) call below.
compile_args = {"loss": "sparse_categorical_crossentropy", "metrics": ["acc"]}

# Checkpoint factory; each model only has to supply its own `filepath`.
# NOTE(review): save_best_only is not set here, so (per the Keras docs) the
# monitored quantity presumably does not decide which epoch is kept. Confirm.
checkpoint_partial = partial(ModelCheckpoint, mode="auto", monitor="val_loss")

## Baseline LSTM

In [None]:
from models import baselineLSTM

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=1e-4)

# Create the output folder up front so that both callbacks can write into it.
baseline_dir = os.path.join(weights_folder, "baseline")
os.makedirs(baseline_dir, exist_ok=True)

checkpoint_callback = checkpoint_partial(filepath=os.path.join(baseline_dir, "checkpoint.hdf5"))
hist_callback = MyHistory(os.path.join(baseline_dir, "history.npy"))

model = baselineLSTM(input_shape=input_shape, num_classes=len(tag_types))
model.compile(**compile_args, optimizer=optimizer)

# Tokenizer indices are 1-based while embedding_matrix rows are 0-based, hence
# the "- 1". Padded positions (index 0) therefore read the matrix's last row.
history = model.fit(x=embedding_matrix[x_train_pad - 1],
                    y=y_train_pad,
                    batch_size=32,
                    epochs=2,
                    validation_data=(embedding_matrix[x_val_pad - 1],
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])


## GRU

In [None]:
from models import GRUModel

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=1e-4)

# Create the output folder up front so that both callbacks can write into it.
gru_dir = os.path.join(weights_folder, "gru")
os.makedirs(gru_dir, exist_ok=True)

checkpoint_callback = checkpoint_partial(filepath=os.path.join(gru_dir, "checkpoint"))
hist_callback = MyHistory(os.path.join(gru_dir, "history.npy"))

model = GRUModel(input_shape=input_shape, num_classes=len(tag_types))
model.compile(**compile_args, optimizer=optimizer)

# Tokenizer indices are 1-based while embedding_matrix rows are 0-based, hence
# the "- 1". Padded positions (index 0) therefore read the matrix's last row.
history = model.fit(x=embedding_matrix[x_train_pad - 1],
                    y=y_train_pad,
                    batch_size=32,
                    epochs=2,
                    validation_data=(embedding_matrix[x_val_pad - 1],
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])


## Additional LSTM layer

In [None]:
from models import additionalLSTM

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=1e-4)

# Create the output folder up front so that both callbacks can write into it.
additional_lstm_dir = os.path.join(weights_folder, "additional_lstm")
os.makedirs(additional_lstm_dir, exist_ok=True)

checkpoint_callback = checkpoint_partial(filepath=os.path.join(additional_lstm_dir, "checkpoint"))
hist_callback = MyHistory(os.path.join(additional_lstm_dir, "history.npy"))

model = additionalLSTM(input_shape=input_shape, num_classes=len(tag_types))
model.compile(**compile_args, optimizer=optimizer)

# Tokenizer indices are 1-based while embedding_matrix rows are 0-based, hence
# the "- 1". Padded positions (index 0) therefore read the matrix's last row.
history = model.fit(x=embedding_matrix[x_train_pad - 1],
                    y=y_train_pad,
                    batch_size=32,
                    epochs=2,
                    validation_data=(embedding_matrix[x_val_pad - 1],
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])


## Additional Dense

In [None]:
from models import additionalDense

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = Adam(learning_rate=1e-4)

# Create the output folder up front so that both callbacks can write into it.
additional_dense_dir = os.path.join(weights_folder, "additional_dense")
os.makedirs(additional_dense_dir, exist_ok=True)

checkpoint_callback = checkpoint_partial(filepath=os.path.join(additional_dense_dir, "checkpoint"))
hist_callback = MyHistory(os.path.join(additional_dense_dir, "history.npy"))

model = additionalDense(input_shape=input_shape, num_classes=len(tag_types))
model.compile(**compile_args, optimizer=optimizer)

# Tokenizer indices are 1-based while embedding_matrix rows are 0-based, hence
# the "- 1". Padded positions (index 0) therefore read the matrix's last row.
history = model.fit(x=embedding_matrix[x_train_pad - 1],
                    y=y_train_pad,
                    batch_size=32,
                    epochs=2,
                    validation_data=(embedding_matrix[x_val_pad - 1],
                                     y_val_pad),
                    callbacks=[checkpoint_callback, hist_callback])


## Load Weights

In [None]:
# Import only the model that is rebuilt here instead of a wildcard import.
from models import baselineLSTM

# Rebuild the network exactly as it was built for training (same input shape,
# number of classes derived from the data rather than hard-coded) so that the
# stored weights match the layer shapes.
model = baselineLSTM(input_shape=input_shape, num_classes=len(tag_types))

# Same location the baseline training cell writes its checkpoint to.
model.load_weights(os.path.join(weights_folder, "baseline", "checkpoint.hdf5"))

In [None]:
# Look up the embedding vector of every (padded) training token, then let the
# model predict for each position.
train_inputs = embedding_matrix[x_train_pad - 1]
y_pred_dist = model.predict(train_inputs)

In [None]:
# Display the shape of the array of training sentences.
x_train.shape

In [None]:
# Display the shape of the raw model output for the training inputs.
y_pred_dist.shape

In [None]:
# Pick, for every position, the index with the highest score along the last axis.
y_pred = np.argmax(y_pred_dist, axis=-1)

In [None]:
# Index of the training sentence to inspect.
i = 3

true_tags = y_train[i]
# Cut the prediction to the true sentence length, then map ids back to tag names.
predicted_ids = y_pred[i][:len(true_tags)]
results = {'ground truth': true_tags, 'pred': tag_types[predicted_ids]}
pd.DataFrame(results)