# Assignment 1
The following tasks have to be done:
*   Download the corpora and split it in training and test sets, structuring a dataframe.
*   Embed the words using GloVe embeddings
*   Create a baseline model, using a simple neural architecture
*   Experiment doing small modifications to the baseline model, choose hyperparameters using the validation set
*   Evaluate your two best model
*   Analyze the errors of your model
## Split the data into the training-, test and validation set

In [48]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

Function to iterate over all files

In [7]:
def file_iterator():
    data_dir = Path('dependency_treebank')
    for data_file in filter(lambda f: os.path.isfile(data_dir/f) and f.endswith('.dp'), os.listdir(data_dir)):
        yield data_dir/data_file

Read in the text files using the file iterator

In [8]:
train_set = []
test_set = []
val_set = []

train_split = 100
val_split = 150

file_counter = 0
for file in file_iterator():
    file_counter += 1
    if file_counter <= train_split:
        train_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    elif file_counter <= val_split:
        val_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))
    else:
        test_set.append(pd.read_csv(file, sep="\t", names=['token', 'pos'], usecols=[0, 1], engine='python'))

assert len(train_set) == 100
assert len(val_set) == 50
assert len(test_set) == 49

In [9]:
train_frame = pd.concat(train_set)
test_frame = pd.concat(test_set)
val_frame = pd.concat(val_set)

assert sum([e.shape[0] for e in train_set]) == train_frame.shape[0]
assert sum([e.shape[0] for e in test_set]) == test_frame.shape[0]
assert sum([e.shape[0] for e in val_set]) == val_frame.shape[0]

In [10]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(model_type: str,
                         embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    :param model_type: name of the word embedding model to load.
    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """
    download_path = ""
    if model_type.strip().lower() == 'word2vec':
        download_path = "word2vec-google-news-300"

    elif model_type.strip().lower() == 'glove':
        download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    elif model_type.strip().lower() == 'fasttext':
        download_path = "fasttext-wiki-news-subwords-300"
    else:
        raise AttributeError("Unsupported embedding model type! Available ones: word2vec, glove, fasttext")
        
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        print('FastText: 300')
        raise e

    return emb_model

In [2]:
embedding_model = load_embedding_model(model_type="glove",
                                       embedding_dimension=50)

In [18]:
# check if vocabulary in embedding model is a superset of the vocabulary in the dataset
assert set(train_frame['token'].unique()).issubset(set(embedding_model.vocab.keys()))

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

In [28]:
# check if vocabulary in embedding model is a superset of the vocabulary in the dataset
for word in train_frame['token'].unique():
    


  if word not in embedding_model.word_vec(word):


KeyError: "Key 'In' not present"

In [25]:
type(embedding_model.word_vec('the')[0])

  type(embedding_model.word_vec('the')[0])


numpy.float32

In [29]:
def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                    word_listing):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    # embedding_vocabulary = set(embedding_model.vocab.keys())
    embedding_vocabulary = set(embedding_model.index_to_key)
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [32]:
oov_terms_train = check_OOV_terms(embedding_model, train_frame['token'])
oov_percentage_train = float(len(oov_terms_train)) * 100 / len(train_frame['token'])
print(f"Total OOV terms: {len(oov_terms_train)} ({oov_percentage_train:.2f}%)")

Total OOV terms: 2041 (4.52%)


In [39]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx,
                           vocab_size: int,
                           oov_terms):
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_to_idx: vocabulary map (word -> index) (dict)
    :param vocab_size: size of the vocabulary
    :param oov_terms: list of OOV terms (list)

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)
    for word, idx in tqdm(word_to_idx.items()):
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [49]:
from collections import OrderedDict

def build_vocabulary(df):
    """
    Given a dataset, builds the corresponding word vocabulary.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - word vocabulary: vocabulary index to word
      - inverse word vocabulary: word to vocabulary index
      - word listing: set of unique terms that build up the vocabulary
    """
    idx_to_word = OrderedDict()
    word_to_idx = OrderedDict()
    
    curr_idx = 0

    for token in tqdm(df['token']):
        if token not in word_to_idx:
            word_to_idx[token] = curr_idx
            idx_to_word[curr_idx] = token
            curr_idx += 1

    word_listing = list(idx_to_word.values())
    return idx_to_word, word_to_idx, word_listing
 
idx_to_word, word_to_idx, word_listing = build_vocabulary(train_frame)

100%|██████████| 45201/45201 [00:00<00:00, 3604790.28it/s]


In [50]:
# Testing
embedding_dimension = 50
embedding_matrix = build_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, len(word_to_idx), oov_terms_train)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

100%|██████████| 7404/7404 [00:00<00:00, 253040.32it/s]

Embedding matrix shape: (7404, 50)





In [52]:
len(word_listing)

7404

In [62]:
from tensorflow import keras
# create Keras model
model = keras.Sequential()

# add embedding layer with embedding matrix
model.add(keras.layers.Embedding(input_dim=len(word_to_idx),
                                    output_dim=embedding_dimension,
                                    weights=[embedding_matrix],
                                    trainable=False))

# add input layer for embedding matrix
# model.add(keras.layers.Input(shape=(None, )))                                

# add LSTM layer
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=128), input_shape=(None, embedding_dimension)))

# add dense layer
model.add(keras.layers.Dense(10))

model.summary()




Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 50)          370200    
                                                                 
 bidirectional_4 (Bidirectio  (None, 256)              183296    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 10)                2570      
                                                                 
Total params: 556,066
Trainable params: 185,866
Non-trainable params: 370,200
_________________________________________________________________


In [64]:
# compile model
model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

# train model
model.fit(train_frame['token'], train_frame['pos'], epochs=10, batch_size=32)


Epoch 1/10


2022-11-25 13:18:28.681882: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/Users/anani/miniforge3/envs/play/lib/python3.9/site-packages/keras/backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 10) are incompatible
