# Assignment 1
The following tasks have to be done:
*   Download the corpora and split it in training and test sets, structuring a dataframe.
*   Embed the words using GloVe embeddings
*   Create a baseline model, using a simple neural architecture
*   Experiment doing small modifications to the baseline model, choose hyperparameters using the validation set
*   Evaluate your two best model
*   Analyze the errors of your model
## Split the data into the training-, test and validation set

In [2]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

Function to iterate over all files

In [3]:
def file_iterator():
    data_dir = Path('dependency_treebank')
    for data_file in filter(lambda f: os.path.isfile(data_dir/f) and f.endswith('.dp'), os.listdir(data_dir)):
        yield data_dir/data_file

Read in the text files using the file iterator and perform data set splitting

In [4]:
train_split = 100
val_split = 150

data = []

for file in tqdm(file_iterator()):
    data.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))

data_frame = pd.concat(data)

train_frame = pd.concat(data[:train_split])
test_frame = pd.concat(data[train_split:val_split])
val_frame = pd.concat(data[val_split:])

199it [00:00, 557.26it/s]


One-Hot-encode the part of speech attribute

In [5]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

labels_ohe = ohe.fit_transform(data_frame[['pos']]).toarray()

train_labels = list(labels_ohe[:train_frame.shape[0]])
test_labels = list(labels_ohe[train_frame.shape[0]:train_frame.shape[0]+test_frame.shape[0]])
val_labels = list(labels_ohe[train_frame.shape[0]+test_frame.shape[0]:])

train_frame['pos_ohe'] = train_labels
test_frame['pos_ohe'] = test_labels
val_frame['pos_ohe'] = val_labels

In [6]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type: str,
                         embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    :param model_type: name of the word embedding model to load.
    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """
    download_path = ""
    if model_type.strip().lower() == 'word2vec':
        download_path = "word2vec-google-news-300"

    elif model_type.strip().lower() == 'glove':
        download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    elif model_type.strip().lower() == 'fasttext':
        download_path = "fasttext-wiki-news-subwords-300"
    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove, fasttext")
        
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        print('FastText: 300')
        raise e

    return emb_model

embedding_model = load_embedding_model(model_type="glove",
                                       embedding_dimension=50)

In [7]:
def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                    word_listing):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    # embedding_vocabulary = set(embedding_model.vocab.keys())
    embedding_vocabulary = set(embedding_model.index_to_key)
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

oov_terms = check_OOV_terms(embedding_model, data_frame['token'])

oov_percentage = float(len(oov_terms)) * 100 / len(data_frame['token'])
print(f"Total OOV terms: {len(oov_terms)} ({oov_percentage:.2f}%)")

Total OOV terms: 3745 (3.98%)


In [8]:
from collections import OrderedDict

def build_vocabulary(df):
    """
    Given a dataset, builds the corresponding word vocabulary.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - word vocabulary: vocabulary index to word
      - inverse word vocabulary: word to vocabulary index
      - word listing: set of unique terms that build up the vocabulary
    """
    idx_to_word = OrderedDict()
    word_to_idx = OrderedDict()
    
    curr_idx = 0

    for token in tqdm(df['token']):
        if token not in word_to_idx:
            word_to_idx[token] = curr_idx
            idx_to_word[curr_idx] = token
            curr_idx += 1

    word_listing = list(idx_to_word.values())
    return idx_to_word, word_to_idx, word_listing
 
idx_to_word, word_to_idx, word_listing = build_vocabulary(data_frame)
print(f"Size of the vocabulary: {len(word_listing)}")

100%|██████████| 94084/94084 [00:00<00:00, 1573018.65it/s]

Size of the vocabulary: 11968





In [9]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx,
                           vocab_size: int,
                           oov_terms):
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param vocab_size: size of the vocabulary
    :param oov_terms: list of OOV terms (list)

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

embedding_dimension = 50
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, len(word_to_idx), oov_terms)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

100%|██████████| 11968/11968 [00:00<00:00, 176750.89it/s]

Embedding matrix shape: (11968, 50)





Add the corresponding embeddings to the data frames that belong to the sequenced word

In [10]:
get_embeddings = lambda words: [embedding_matrix[word_to_idx[word], :] for word in words]

train_frame['embedding'] = get_embeddings(train_frame['token'])
test_frame['embedding'] = get_embeddings(test_frame['token'])
val_frame['embedding'] = get_embeddings(val_frame['token'])

In [11]:
from tensorflow import keras

# create Keras model
model = keras.Sequential()

# add embedding layer with embedding matrix
model.add(keras.Input(shape=(32, embedding_dimension)))

# add LSTM layer
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128)))

# add dense layer
model.add(keras.layers.Dense(10))

model.summary()

2022-11-29 00:53:09.411749: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-29 00:53:13.227558: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 256)              183296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 10)                2570      
                                                                 
Total params: 185,866
Trainable params: 185,866
Non-trainable params: 0
_________________________________________________________________


In [16]:
# compile model
model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy', 'precision', 'recall'])

model.fit([t for t in train_frame['embedding'].values], [t for t in train_frame['pos_ohe'].values], epochs=10, batch_size=32)

<class 'numpy.ndarray'>


IndexError: tuple index out of range