In [42]:
from numpy import array
from numpy import asarray
from numpy import zeros
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from pathlib import Path
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from tqdm import tqdm

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Download and extract the dataset
The dataset is downloaded from the provided URL and extracted into the 'Dataset' folder

In [13]:
# Change to False if you want to start the download
skip_download = True

if not skip_download:
    import urllib.request
    import tarfile

    url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

    dataset_folder = './Dataset'
    if not os.path.exists(dataset_folder):
        os.mkdir(dataset_folder)

    dataset_path = os.path.join(dataset_folder, 'dependency_treebank')

    if not os.path.exists(dataset_path):
        urllib.request.urlretrieve(url, dataset_folder)
        print("Successful download")

    tar = tarfile.open(dataset_path)
    tar.extractall(dataset_folder)
    tar.close()
    print("Successful extraction")
else:
    print("Skip download process of the Dataset")

Skip download process of the Dataset


# Split the data
The data (199 samples in total) is split into a train, validation and test set:
- 100 train samples
- 50 validation samples
- 49 test samples 
The sets are stored in data frames.

In [59]:
# Define split size
TRAIN_SPLIT = 100
VAL_SPLIT = 150

# Define file iterator
def file_iterator():
    data_dir = Path('dependency_treebank')
    for data_file in filter(lambda f: os.path.isfile(data_dir/f) and f.endswith('.dp'), os.listdir(data_dir)):
        yield data_dir/data_file

# Create train, val and test set
data_set = []
split_indexes = []

# Iterate over files and perform split
for file in tqdm(file_iterator()):
    data_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    split_indexes.append(len(data_set[-1]))

data_frame = pd.concat(data_set)

split = ['train']*sum(split_indexes[0:TRAIN_SPLIT]) \
        + ['val']*sum(split_indexes[TRAIN_SPLIT:VAL_SPLIT]) \
        + ['test']*sum(split_indexes[VAL_SPLIT:len(data_set)])

assert len(split) == data_frame.shape[0]

data_frame['split'] = split

train_frame = data_frame[data_frame['split']=='train']
test_frame =  data_frame[data_frame['split']=='test']
val_frame =   data_frame[data_frame['split']=='val']

data_frame.head()

199it [00:01, 159.66it/s]


Unnamed: 0,token,pos,split
0,Pierre,NNP,train
1,Vinken,NNP,train
2,",",",",train
3,61,CD,train
4,years,NNS,train


Check the structure of the constructed data frames

# Tokenization and padding
The data frames are now tokenized and additional padded to have a unit input length 

In [60]:
# Routine to tokenize and pad data to unit length
def tokenize_and_pad(data_frame):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_frame['token'].values)
    vocab_size = len(tokenizer.word_index) + 1
    encoded_doc = tokenizer.texts_to_sequences(data_frame['token'].values)
    max_length = 1
    padded_docs = pad_sequences(encoded_doc, maxlen=max_length, padding='post')
    return padded_docs, vocab_size, tokenizer

# train set
train_padded_docs, train_vocab_size, train_tokenizer = tokenize_and_pad(train_frame)

# val set
val_padded_docs, val_vocab_size, val_tokenizer = tokenize_and_pad(val_frame)

# test set
test_padded_docs, test_vocab_size, test_tokenizer = tokenize_and_pad(test_frame)

# Create embedding matrix
First, the pretrained glove-embedding has been downloaded from https://nlp.stanford.edu/projects/glove/ and converted to a dictionary format (100-dimensional embeddings).

In [22]:
embeddings_index = dict()
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        embeddings_index[values[0]] = asarray(values[1:], dtype='float32')

print('Loaded %s word vectors.' % len(embeddings_index))

400000it [00:27, 14294.98it/s]

Loaded 400000 word vectors.





Define a function to create an embedding matrix from a given vocabulary, using the previously loaded glove-embeddings

In [23]:
def create_embedding_matrix(tokenizer, vocab_size):
    # load embedding into memory, skip first
    embedding_matrix = zeros((vocab_size, 100))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # print("Not in Vocab", word)
            pass
    return embedding_matrix

# Define and train the model
## Encode the POS-labels
We also need to convert the POS labels to a numerical representation (one-hot-encoding)

In [91]:
def encode_labels(data_frame, trained_encoder = None):
    if trained_encoder is None:
        trained_encoder = LabelEncoder()
        trained_encoder.fit(data_frame['pos'].values)

    encoded_Y = trained_encoder.transform(data_frame['pos'].values)
    # convert integers to dummy variables (i.e. one hot encoded) -> labels
    labels = np_utils.to_categorical(encoded_Y)
    return labels, trained_encoder

## Define the structure and layers of the used model
Here, a sequential model is used, that receives tokens using an Embedding layer (embedding with the loaded glove-embeddings).  
The recurrent structure is implemented using a LSTM layer with 128-units

In [112]:
def create_model(vocab_size, embedding_matrix, plot_model=False, recurrent_layer='lstm'):
    # define model
    model = Sequential()
    
    # Input as Embeddings
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=1, trainable=False))
    # A Bidirectional recurrent layer (LSTM units)
    
    if recurrent_layer == 'lstm':
        model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128), input_shape=(None, 100)))
    
    # Dense layer to fit output to label-vector-size
    model.add(Dense(45, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    model.summary()
    if plot_model:
        keras.utils.plot_model(model, show_shapes=True, show_layer_names=True)
    
    return model

In [26]:
# def inference(model, tokenizer, vocab_size, test_padded_docs, test_frame):
#     # predict the model
#     yhat = model.predictd(test_padded_docs, verbose=1)
#     # map predicted labels to words
#     predicted_labels = []
#     for i in yhat:
#         for word, index in tokenizer.word_index.items():
#             if index == i:
#                 predicted_labels.append(word)
#                 break
#     # map actual labels to words
#     actual_labels = []
#     for i in test_frame['pos'].values:
#         for word, index in tokenizer.word_index.items():
#             if index == i:
#                 actual_labels.append(word)
#                 break
#     # create confusion matrix
#     confusion_matrix = pd.crosstab(pd.Series(actual_labels), pd.Series(predicted_labels), rownames=['Actual'], colnames=['Predicted'])
#     print(confusion_matrix)

Encode the labels of the train data using a one hot encoding. Use the same encoder to create the label vector for the validation set

In [113]:
train_embedding_matrix = create_embedding_matrix(train_tokenizer, train_vocab_size)
train_labels, train_encoder = encode_labels(train_frame)

val_labels, _ = encode_labels(val_frame, trained_encoder=train_encoder)
test_labels, _ = encode_labels(test_frame, trained_encoder=train_encoder)

assert train_labels.shape[1] == val_labels.shape[1] and train_labels.shape[1] == test_labels.shape[1]

Train routine using a bidirectional LSTM layern with 128 units  
Evaluate the models training progress using the validation set

In [114]:
from keras.callbacks import CSVLogger

csv_logger = CSVLogger('log/log_lstm_128.csv', append=False, separator=';')

model = create_model(train_vocab_size, train_embedding_matrix)

# fit the model
model.fit(x=train_padded_docs, y=train_labels, epochs=10, validation_data=(val_padded_docs, val_labels), verbose=1, callbacks=[csv_logger])

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 1, 100)            690300    
                                                                 
 bidirectional_10 (Bidirecti  (None, 256)              234496    
 onal)                                                           
                                                                 
 dense_10 (Dense)            (None, 45)                11565     
                                                                 
Total params: 936,361
Trainable params: 246,061
Non-trainable params: 690,300
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

KeyboardInterrupt: 

In [104]:
# evaluate the model
train_loss, train_accuracy = model.evaluate(train_padded_docs, train_labels, verbose=1)
val_loss, val_accuracy = model.evaluate(val_padded_docs, val_labels, verbose=1)
test_loss, test_accuracy = model.evaluate(test_padded_docs, test_labels, verbose=1)

print('Train Accuracy: {n:.2f} %'.format(n=(train_accuracy * 100)))
print('Test Accuracy: {n:.2f} %'.format(n=(test_accuracy * 100)))
print('Validation Accuracy: {n:.2f} %'.format(n=(val_accuracy * 100)))

Train Accuracy: 85.50 %
Test Accuracy: 24.84 %
Validation Accuracy: 24.16 %


In [21]:
# validate model
val_embedding_matrix = create_embedding_matrix(val_tokenizer, val_vocab_size)
val_labels, val_encoder = encode_labels(val_frame)

loss, accuracy = model.evaluate(val_padded_docs, val_labels, verbose=1)

InvalidArgumentError: Graph execution error:

Detected at node 'categorical_crossentropy/softmax_cross_entropy_with_logits' defined at (most recent call last):
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/traitlets/config/application.py", line 982, in launch_instance
      app.start()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
      self._run_once()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
      handle._run()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/qr/scdb4s4n6dz5463spyd32yth0000gn/T/ipykernel_21772/2052202248.py", line 2, in <module>
      val_loss, val_accuracy = model.evaluate(val_padded_docs, val_labels, verbose=1)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1947, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1727, in test_function
      return step_function(self, iterator)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1713, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1701, in run_step
      outputs = model.test_step(data)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1667, in test_step
      self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1052, in compute_loss
      return self.compiled_loss(
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/losses.py", line 272, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/losses.py", line 1990, in categorical_crossentropy
      return backend.categorical_crossentropy(
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/backend.py", line 5535, in categorical_crossentropy
      return tf.nn.softmax_cross_entropy_with_logits(
Node: 'categorical_crossentropy/softmax_cross_entropy_with_logits'
logits and labels must be broadcastable: logits_size=[32,44] labels_size=[32,43]
	 [[{{node categorical_crossentropy/softmax_cross_entropy_with_logits}}]] [Op:__inference_test_function_255175]

In [20]:
# predict model val
predictions = model.predict(val_padded_docs, verbose=1)

#TODO: compare



[[1.0907109e-05 2.2433024e-04 2.1703347e-06 ... 2.8716912e-08
  4.9551250e-07 7.0266455e-05]
 [4.2851013e-08 1.0514050e-06 4.9462809e-08 ... 8.0239711e-11
  5.8290900e-11 1.2585730e-07]
 [2.6137733e-03 6.1667468e-02 2.5079817e-06 ... 2.7957765e-07
  1.7772708e-06 5.3486343e-02]
 ...
 [1.1580768e-07 3.0247747e-06 7.4308076e-08 ... 2.9441841e-11
  2.2790166e-10 4.1862222e-07]
 [4.4803182e-05 1.6333588e-04 4.8778595e-05 ... 6.6113500e-08
  4.7637050e-07 6.0173814e-05]
 [2.6137733e-03 6.1667468e-02 2.5079817e-06 ... 2.7957765e-07
  1.7772708e-06 5.3486343e-02]]


TEST AREA

In [29]:
# import numpy as np
# # predict model test
# predictions = model.predict(test_padded_docs, verbose=1)
# # map predicted labels to words
# predicted_labels = []
# for i in predictions:
#     for word, index in test_tokenizer.word_index.items():
#         if index == np.argmax(i):
#             predicted_labels.append(word)
#             break
# # map actual labels to words
# actual_labels = []
# for i in test_frame['pos'].values:
#     for word, index in test_tokenizer.word_index.items():
#         if index == i:
#             actual_labels.append(word)
#             break
# # create confusion matrix
# confusion_matrix = pd.crosstab(pd.Series(actual_labels), pd.Series(predicted_labels), rownames=['Actual'], colnames=['Predicted'])

44

In [34]:
# import numpy as np
# np.argmax(val_labels[0])
# np.argmax(predictions[0])

35