In [42]:
# file management
import sys
import shutil
import urllib
import tarfile
import zipfile
from pathlib import Path

# dataframe management
import pandas as pd

# data manipulation
import numpy as np

# for readability
from typing import Iterable
from tqdm import tqdm  # for progress visualization


import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences


2024-03-04 13:25:44.516551: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Task 1 (0,5 points) - Dataset split

## Instructions

* **Download** the corpus.
* **Encode** the corpus into a pandas.DataFrame object.
* **Split** it in training, validation, and test sets.

In [34]:
class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(download_path: Path, url: str):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=download_path, reporthook=t.update_to)

        
def download_dataset(download_path: Path, url: str):
    print("Downloading dataset...")
    download_url(url=url, download_path=download_path)
    print("Download complete!")

def extract_dataset(download_path: Path, extract_path: Path):
    print("Extracting dataset... (it may take a while...)")
    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("Extraction completed!")

In [35]:
#download the data from website
url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'
dataset_name = 'dependency_treebank'

print(f"Current work directory: {Path.cwd()}")
dataset_folder = Path.cwd().joinpath("Datasets")


if not dataset_folder.exists():
    dataset_folder.mkdir(parents=True)

dataset_zip_path = dataset_folder.joinpath("treeBank.zip")
dataset_path = dataset_folder.joinpath(dataset_name)

if not dataset_zip_path.exists():
    download_dataset(dataset_zip_path, url)

if not dataset_path.exists():
    extract_dataset(dataset_zip_path, dataset_folder)


Current work directory: /Users/madalinamone/Desktop/NLP/Assignment 1


### Encode the dataset into pd.DataFrame

In [36]:
df_rows = []
dataset_folder= dataset_folder.joinpath(dataset_name)
for fiel_path in dataset_folder.glob("*.dp"):
    with fiel_path.open(mode="r", encoding="utf-8") as dp_file:
        dp = dp_file.read()
        print(dp[0:100])
        id = fiel_path.stem.split("_")[1]
        id = int(id)
        dp = dp.split("\n")
        for el in dp:
            if el != "":
                word= el.split("\t")[0]
                #print(word)
                pos= el.split("\t")[1]
                #we don't need the third column but we have to split it to avoid errors
                n = el.split("\t")[2]
                df_row = {
                    "id": id,
                    "word": word,
                    "pos": pos
                }
                df_rows.append(df_row)
        
        
print(df_rows[0:5])

In	IN	22
reference	NN	1
to	TO	2
your	PRP$	8
Oct.	NNP	8
9	CD	8
page-one	NN	8
article	NN	3
``	``	8
Bar
Upjohn	NNP	2
Co.	NNP	3
said	VBD	0
it	PRP	5
will	MD	3
offer	VB	5
an	DT	10
early	JJ	10
retirement	NN	1
Komatsu	NNP	2
Ltd.	NNP	7
,	,	7
a	DT	7
large	JJ	7
integrated	VBN	7
maker	NN	12
of	IN	7
construction	N
Judging	VBG	33
from	IN	1
the	DT	4
Americana	NNS	2
in	IN	4
Haruki	NNP	8
Murakami	NNP	8
's	POS	13
``	`
Although	IN	15
his	PRP$	3
team	NN	4
lost	VBD	1
the	DT	7
World	NNP	7
Series	NNP	4
,	,	15
San	NNP	14
F
Sir	NNP	3
Peter	NNP	3
Walters	NNP	6
,	,	6
58-year-old	JJ	6
chairman	NN	15
of	IN	6
British	NNP	10
Pet
PAPERS	NNS	0
:	:	1

Backe	NNP	3
Group	NNP	3
Inc.	NNP	4
agreed	VBD	0
to	TO	6
acquire	VB	4
Atlantic	NN
Congress	NNP	2
learned	VBD	0
during	IN	2
the	DT	6
Reagan	NNP	6
administration	NN	3
that	IN	2
it	PRP	
A	DT	2
form	NN	12
of	IN	2
asbestos	NN	3
once	RB	6
used	VBN	2
to	TO	8
make	VB	6
Kent	NNP	11
cigarette
Standard	NNP	4
&	CC	4
Poor	NNP	4
's	POS	5
Corp.	NNP	6
lowered	VBD	0
to	TO	6
double-C	NN	7
f

In [37]:
folder = Path.cwd().joinpath("Datasets", "Dataframes", dataset_name)
if not folder.exists():
    folder.mkdir(parents=True)

#transform the list of rows into a dataframe using pandas
df = pd.DataFrame(df_rows)
df = df[["id", 
         "word",
         "pos"]
       ]
df_path = folder.with_name(dataset_name + ".pkl")
df.to_pickle(df_path)


### Inspect Dataset

In [38]:
print("Dataframe structure:")
print(df)
print()

print("Total rows %d" % (len(df)))
print()

Dataframe structure:
       id         word   pos
0      95           In    IN
1      95    reference    NN
2      95           to    TO
3      95         your  PRP$
4      95         Oct.   NNP
...    ..          ...   ...
94079  45           or    CC
94080  45  discontinue    VB
94081  45      Scoring   NNP
94082  45         High   NNP
94083  45            .     .

[94084 rows x 3 columns]

Total rows 94084



## Split the dataset

* **train** : 1-100
* **validation** : 101-150
* **test** : 151-199 


In [39]:
#split the dataframe into train, validation and test
train_df = df[df['id'] <= 100]
validation_df = df[(df['id']>100) & (df['id']<=150)]
test_df = df[df['id']>150]

df = df.drop('id', axis=1)
#show the first 5 elements of each dataframe
print("Train dataframe:")
print(train_df.head())
print()
print("Validation dataframe:")
print(validation_df.head())
print()
print("Test dataframe:")
print(test_df.head())
print("Dataframe senza id:")
print(df.head(5))
print()

Train dataframe:
   id       word   pos
0  95         In    IN
1  95  reference    NN
2  95         to    TO
3  95       your  PRP$
4  95       Oct.   NNP

Validation dataframe:
       id      word   pos
2663  126  Although    IN
2664  126       his  PRP$
2665  126      team    NN
2666  126      lost   VBD
2667  126       the    DT

Test dataframe:
      id    word  pos
227  184  Upjohn  NNP
228  184     Co.  NNP
229  184    said  VBD
230  184      it  PRP
231  184    will   MD
Dataframe senza id:
        word   pos
0         In    IN
1  reference    NN
2         to    TO
3       your  PRP$
4       Oct.   NNP



# Task 2 (0,5 points) - Embedding
To train a neural POS tagger, you first need to encode text into numerical format.


### Instructions

* Embed words using **GloVe embeddings**.
* You are **free** to pick any embedding dimension.
* [Optional] You are free to experiment with text pre-processing: **make sure you do not delete any token!**

### Text Pre-processing

Text to lower


In [40]:
def lower(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transforms the pandas dataframe.
    """
    df = df.applymap(lambda s:s.lower() if type(s) == str else s)
    return df

df = lower(df)
print(df.head())


        word   pos
0         in    in
1  reference    nn
2         to    to
3       your  prp$
4       oct.   nnp


### Gensim - GloVe embedding

In [None]:
!pip install gensim

In [28]:
import gensim
import gensim.downloader as gloader

def load_glove_embedding_model(embedding_dimension):
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    emb_model = gloader.load(download_path)
    
    return emb_model

In [29]:
glove_model_50 = load_glove_embedding_model(50)

In [30]:
glove_model_200 = load_glove_embedding_model(200)

In [41]:

from gensim.models import KeyedVectors  # to load GloVe embeddings

# Step 1: Retrieve GloVe embeddings for each token
def get_embedding(token, model):
    try:
        return model[token]
    except KeyError:
        # If token is not found in the model, return zeros
        return np.zeros(model.vector_size)

def get_sentence_embedding(tokens, model):
    embeddings = [get_embedding(token, model) for token in tokens]
    # Aggregate embeddings by taking the mean
    return np.mean(embeddings, axis=0)

# Step 3: Apply GloVe embeddings to each row
tqdm.pandas()  # for progress bar
df['wor_embeddings'] = df['word'].progress_apply(lambda x: get_sentence_embedding(x, glove_model_50))
df['pos_embeddings'] = df['pos'].progress_apply(lambda x: get_sentence_embedding(x, glove_model_50))


print(df)



100%|██████████| 94084/94084 [00:03<00:00, 28080.95it/s]
100%|██████████| 94084/94084 [00:03<00:00, 29212.78it/s]

              word   pos                                     wor_embeddings  \
0               in    in  [-0.075565, 0.633475, 0.4836135, -0.08109999, ...   
1        reference    nn  [0.05720667, 0.74987555, 0.9641056, 0.82249, -...   
2               to    to  [-0.2115055, 0.96839, 0.46107498, 0.879405, 0....   
3             your  prp$  [-0.15404025, 1.0023575, 0.48766625, 1.0826524...   
4             oct.   nnp  [-0.19004776, 0.6802075, 0.30294502, 0.7082200...   
...            ...   ...                                                ...   
94079           or    cc  [-0.23831551, 1.035215, 0.451735, 0.92174, -0....   
94080  discontinue    vb  [-0.073635556, 0.7427291, 0.6463622, 0.5267691...   
94081      scoring   nnp  [-0.23221444, 0.86884433, 0.5766224, 0.7101614...   
94082         high   nnp  [-0.17292249, 0.7101325, 0.63558173, 0.6356600...   
94083            .     .  [0.15164, 0.30177, -0.16763, 0.17684, 0.31719,...   

                                          pos_embed




# [Task 3 - 1.0 points] Model definition

You are now tasked to define your neural POS tagger.

### Instructions

* **Baseline**: implement a Bidirectional LSTM with a Dense layer on top.
* You are **free** to experiment with hyper-parameters to define the baseline model.

* **Model 1**: add an additional LSTM layer to the Baseline model.
* **Model 2**: add an additional Dense layer to the Baseline model.

* **Do not mix Model 1 and Model 2**. Each model has its own instructions.

**Note**: if a document contains many tokens, you are **free** to split them into chunks or sentences to define your mini-batches.

# [Task 4 - 1.0 points] Metrics

Before training the models, you are tasked to define the evaluation metrics for comparison.

### Instructions

* Evaluate your models using macro F1-score, compute over **all** tokens.
* **Concatenate** all tokens in a data split to compute the F1-score. (**Hint**: accumulate FP, TP, FN, TN iteratively) 
* **Do not consider punctuation and symbol classes** $\rightarrow$ [What is punctuation?](https://en.wikipedia.org/wiki/English_punctuation)

**Note**: What about OOV tokens?
   * All the tokens in the **training** set that are not in GloVe **must** be added to the vocabulary.
   * For the remaining tokens (i.e., OOV in the validation and test sets), you have to assign them a **special token** (e.g., [UNK]) and a **static** embedding.
   * You are **free** to define the static embedding using any strategy (e.g., random, neighbourhood, etc...)

### More about OOV

For a given token:

* **If in train set**: add to vocabulary and assign an embedding (use GloVe if token in GloVe, custom embedding otherwise).
* **If in val/test set**: assign special token if not in vocabulary and assign custom embedding. 

Your vocabulary **should**:

* Contain all tokens in train set; or
* Union of tokens in train set and in GloVe $\rightarrow$ we make use of existing knowledge!


### Token to embedding mapping

You can follow two approaches for encoding tokens in your POS tagger.

### Work directly with embeddings

- Compute the embedding of each input token
- Feed the mini-batches of shape (batch_size, # tokens, embedding_dim) to your model

In [None]:
embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                      output_dim=embedding_dimension,
                                      weights=[embedding_matrix],
                                      mask_zero=True,                   # automatically masks padding tokens
                                      name='encoder_embedding')

### Work with Embedding layer

- Encode input tokens to token ids
- Define a Embedding layer as the first layer of your model
- Compute the embedding matrix of all known tokens (i.e., tokens in your vocabulary)
- Initialize the Embedding layer with the computed embedding matrix
- You are **free** to set the Embedding layer trainable or not

From tutorial: https://github.com/christianversloot/machine-learning-articles/blob/main/bidirectional-lstms-with-tensorflow-and-keras.md

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model configuration
additional_metrics = ['accuracy']
batch_size = 128
embedding_output_dims = 15
loss_function = BinaryCrossentropy()
max_sequence_length = 300
num_distinct_words = 5000
number_of_epochs = 5
optimizer = Adam()
validation_split = 0.20
verbosity_mode = 1

# what is that?
# Pad all sequences
padded_inputs = pad_sequences(x_train, maxlen=max_sequence_length, value = 0.0) # 0.0 because it corresponds with <PAD>
padded_inputs_test = pad_sequences(x_test, maxlen=max_sequence_length, value = 0.0) # 0.0 because it corresponds with <PAD>

# Define the Keras model
model = Sequential()
model.add(Embedding(num_distinct_words, embedding_output_dims, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(10), merge_mode='sum'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=optimizer, loss=loss_function, metrics=additional_metrics)

# Give a summary
model.summary()

# Train the model
history = model.fit(padded_inputs, y_train, batch_size=batch_size, epochs=number_of_epochs, verbose=verbosity_mode, validation_split=validation_split)

# Test the model after training
test_results = model.evaluate(padded_inputs_test, y_test, verbose=False)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {100*test_results[1]}%')

From the tutorial: https://medium.com/@anishnama20/understanding-bidirectional-lstm-for-sequential-data-processing-b83d6283befc

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense

# Define the model architecture
model = Sequential()

# Add an embedding layer to convert input sequences to dense vectors
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

# Add a Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True)))

# Add a dense output layer
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

''' 
vocab_size with the size of your vocabulary, 
embedding_dim with the desired dimensionality of the embedding space, 
max_sequence_length with the maximum length of your input sequences, 
lstm_units with the number of LSTM units
num_classes with the number of classes in your classification task.
'''