# Assignment 1: Part Of Speech tagging

## Imports

In [None]:
# Dependency management
!pip install tensorflow==2.7.0 numpy==1.21.4 pandas==1.3.4 requests==2.26.0 gensim==4.1.2 wandb==0.12.7 -qqq

In [None]:
# Disable tensorflow warnings
import logging
import os

# The native log level has to be exported *before* TensorFlow is imported,
# otherwise the messages emitted while the library loads are not filtered.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

tf.autograph.set_verbosity(0)
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [None]:
# Text pre-processing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Model definition
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import mixed_precision

# Data packages
import numpy as np
import pandas as pd

# System packages
import glob
import os
import logging

# Cloning
from copy import deepcopy

# File management
import requests
import zipfile
import io
from pathlib import Path

# Notebook visualization
from IPython.core.display import display

# Typing
from typing import Set, List, Dict, Tuple

# For GloVe wrapper
from gensim import downloader as gensloader
from gensim.models.keyedvectors import KeyedVectors

# Plotting
import plotly.express as px
import wandb
from wandb.keras import WandbCallback

# Metrics and utility
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
import time

In [None]:
# Set the seeds
SEED = 1
tf.random.set_seed(SEED)
np.random.seed(SEED)

## 1 - Data Pipeline

### 1.1 - Data loading
First, we load the dataset and store it into a dataframe.

In [None]:
DATASET_PATH = './dependency_treebank'  # Change if dataset already present locally
DATASET_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load the treebank files into a token-level dataframe.

    If ``ds_path`` does not exist, the zip archive at ``ds_url`` is downloaded
    and extracted into the parent directory of ``ds_path``.
    NOTE(review): this assumes the archive contains a top-level folder named
    like the last component of ``ds_path`` — confirm for other datasets.

    Args:
        ds_path: directory containing the ``*.dp`` files.
        ds_url: URL of the zip archive to download when ``ds_path`` is missing.

    Returns:
        Dataframe with one row per token and columns 'document' (index of the
        file in sorted order), 'sentence' (running sentence counter),
        'token' and 'tag'.

    Raises:
        requests.HTTPError: if the download does not succeed.
    """
    # Check if dataset is already present, otherwise download it
    ds_dir = Path(ds_path)
    if not ds_dir.exists():
        response = requests.get(ds_url)
        # Fail loudly instead of trying to unzip an error page
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(ds_dir.parent)

    # Load each file into a list (sorted, so document indexes are stable)
    documents = []
    for file_name in sorted(glob.glob(f"{ds_path}/*.dp")):
        with open(file_name, encoding='utf-8') as f:
            documents.append(f.read())

    # Convert each row of the documents into a list
    raw_rows = []
    sentence_idx = 0
    for doc_idx, doc in enumerate(documents):
        for row in doc.split('\n'):
            cols = row.split('\t')[:2]  # Ignore the last column
            if cols == ['']:
                # An empty line closes the current sentence
                sentence_idx += 1
            else:
                raw_rows.append([doc_idx, sentence_idx, *cols])

    # Finally, convert the nested list into a pandas dataframe
    return pd.DataFrame(raw_rows,
                        columns=['document', 'sentence', 'token', 'tag'])


dataset = load_dataset(DATASET_PATH, DATASET_URL)
dataset.head()

### 1.2 - GloVe loading
Then, we load the GloVe embeddings (GloVe-50, to be precise).

In [None]:
EMB_DIM = 50
GLOVE_TYPE = f'glove-wiki-gigaword-{EMB_DIM}'
GLOVE_FILE = f'./glove/glove-wiki-gigaword-{EMB_DIM}.kv'


def load_glove(gl_file: str, gl_type: str) -> KeyedVectors:
    """Return the GloVe vectors, reusing the local copy when there is one.

    Args:
        gl_file: path of the locally stored vectors.
        gl_type: name of the model passed to the gensim downloader when no
            local copy exists.

    Returns:
        The loaded vectors.
    """
    cache_path = Path(gl_file)
    if not cache_path.exists():
        # No local copy yet: download the vectors and store them for next time
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        downloaded = gensloader.load(gl_type)
        downloaded.save(gl_file)
        return downloaded

    return KeyedVectors.load(gl_file)


glove = load_glove(GLOVE_FILE, GLOVE_TYPE)

In [None]:
# Test GloVe loading
print(f'cat = {glove["cat"]}')

### 1.3 - Pre-processing
Our dataset is already relatively clean; however, one point that might be worth considering is how to handle lowercase conversions. Some tokens in our dataset will be intrinsically capitalized (e.g. proper nouns, the personal pronoun "I"), whereas others will be capitalized only because they follow a period in the sentence they occur in.

One might think of converting a token to lowercase based on its tag (e.g. if a token is a proper noun, keep it capitalized); however, to be fair, this could only be done on the training set, since in a real scenario test-set tags would be unknown.

Anyway, all these considerations hold only if GloVe contains embeddings of capitalized words; if that's not the case, every word we keep as capitalized will be classified as OOV when matched with GloVe, even when their lowercase embedding actually exists.

As it turns out, GloVe does not encode capitalized words:


In [None]:
# Count the GloVe keys whose first character is uppercase
num_capitalized = sum(1 for key in glove.key_to_index if key[0].isupper())

print(f'GloVe-50 encodes {num_capitalized} capitalized words')

Therefore, we will be forced to convert all tokens to lowercase.

We are also interested to see which "special" tokens are encoded in GloVe, i.e. punctuation, quotation marks, and tokens such as "-LRB-" and "-RRB-", which in our dataset replace "(" and ")", respectively.

As it turns out, GloVe contains every special symbol we care about, except for tokens reserved to brackets:

In [None]:
special_tokens = [
    *',.:;"`$#£!%/?^-()[]{}_', "''", "``", "--", "-LRB-", "-RRB-", "-LSB-",
    "-RSB-", "-LCB-", "-RCB-"
]
for st in special_tokens:
    if st not in glove:
        print(f"GloVe does not contain token {st}")

Based on the previous considerations, we convert all tokens to lowercase and replace "-LRB-"-like symbols with the corresponding bracket:

In [None]:
# Replace each bracket placeholder with the bracket it stands for
bracket_of = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-LSB-': '[',
    '-RSB-': ']',
    '-LCB-': '{',
    '-RCB-': '}',
}
for token, bracket in bracket_of.items():
    is_placeholder = dataset.token == token
    dataset.loc[is_placeholder, 'token'] = bracket

# Lowercase every token, since GloVe only contains lowercase keys
lowercased = dataset['token'].str.lower()
dataset.loc[:, 'token'] = lowercased

### 1.4 - Splitting
After pre-processing the data, we can split the dataset into train, validation and test sets.

In [None]:
# Document indexes delimiting the splits:
#   train      = documents <= TRAIN_DOC_UB
#   validation = documents strictly between TRAIN_DOC_UB and TEST_DOC_LB
#   test       = documents >= TEST_DOC_LB
TRAIN_DOC_UB = 99
TEST_DOC_LB = 150

ds_train = dataset[dataset['document'].le(TRAIN_DOC_UB)]
# reset_index() renumbers the rows of the split from 0; the previous row
# labels are kept in an extra 'index' column
ds_val = dataset[dataset['document'].between(
    TRAIN_DOC_UB, TEST_DOC_LB, inclusive='neither')].reset_index()
ds_test = dataset[dataset['document'].ge(TEST_DOC_LB)].reset_index()

# Summarize a split as "<number of documents> documents, <number of rows> tokens"
print_split = lambda df: f"{df.groupby('document').ngroups} documents, {len(df)} tokens"
print(f"""Dataset split: 
    TRAIN: {print_split(ds_train)}
    VALIDATION: {print_split(ds_val)}
    TEST: {print_split(ds_test)}
""")

### 1.5 - OOV Handling

#### 1.5.1 - OOV Analysis
First of all, let us take a look at how many Out-Of-Vocabulary tokens (w.r.t. GloVe) our dataset contains. In order to simulate a real-world scenario, in which test samples are not readily available at training time, we are going to check (and then handle) OOVs *incrementally*; that is, we will consider:
* **OOV1:** training-set tokens which are not found in V1 = GloVe.
* **OOV2:** validation-set tokens which are not found in V2 = `union(`V1, OOV1`)`.
* **OOV3:** test-set tokens which are not found in V3 = `union(`V2, OOV2`)`.

Our final vocabulary, which will provide encodings to our model(s), is V4 = `union(`V3, OOV3`)` = `union(`GloVe, OOV1, OOV2, OOV3`)`.

In [None]:
# Build the vocabularies incrementally: each split is only checked against
# the vocabulary available before that split is seen
glove_keys = set(glove.key_to_index.keys())  # V1
oov1 = set(ds_train['token']) - glove_keys  # train tokens missing from V1
v2 = glove_keys.union(oov1)
oov2 = set(ds_val['token']) - v2  # validation tokens missing from V2
v3 = v2.union(oov2)
oov3 = set(ds_test['token']) - v3  # test tokens missing from V3

print(f'OOV1:  {len(oov1)} tokens')
print(f'OOV2:  {len(oov2)} tokens')
print(f'OOV3:  {len(oov3)} tokens')
# Total number of distinct OOV tokens (the three sets are disjoint by
# construction); the percentage is relative to the number of distinct
# tokens in the whole dataset
l = len(oov1) + len(oov2) + len(oov3)
print(
    f'Total: {l} tokens ({l / len(set(dataset["token"])) * 100:.2f}% of dataset)'
)


#### 1.5.2 - Adding OOVs to GloVe
We can now add OOV tokens to the GloVe vocabulary. Many strategies can be adopted to encode OOVs as vectors:
1. Static embeddings with the same vector for all OOV tokens (e.g. zeros).
2. Random embeddings. 
3. Computing an embedding as some statistic involving neighboring tokens (e.g. their mean).

Two observations can guide us in the choice of an embedding strategy:
* OOV tokens are not negligible (about 6% of the *total* dataset).
* Our GloVe embeddings will not undergo further training, therefore fixed or random embedding values will not be refined during the training process.

For the two reasons above, given an OOV token, we will compute its embedding as the mean of its left and right neighbors across all its occurrences throughout the dataset:


In [None]:
def compute_neighbor_mean(oov_token: str, df: pd.DataFrame,
                          embeddings: 'KeyedVectors') -> np.ndarray:
    """Estimate the embedding of an OOV token from its surrounding tokens.

    For every occurrence of ``oov_token`` at row position ``p``, the two
    adjacent positions ``p - 1`` and ``p + 1`` are used as anchors. From each
    anchor, the closest token strictly to its left and the closest token
    strictly to its right that have an embedding are collected. The result is
    the mean of all the collected vectors.

    Rows are addressed by position, so the function gives the same result
    whatever the index labels of ``df`` are.

    Args:
        oov_token: token to estimate an embedding for.
        df: dataframe with a 'token' column, in reading order.
        embeddings: object supporting ``token in embeddings`` and
            ``embeddings[token]``.

    Returns:
        The mean of the collected neighbor vectors. If no neighbor with an
        embedding exists, a zero vector of length ``embeddings.vector_size``.
    """
    # Pull the column out once: positional access on a plain array is much
    # cheaper than repeated `.iloc` lookups inside the loops below
    tokens = df['token'].to_numpy()
    n_tokens = len(tokens)

    # Row positions (not index labels) where the oov token appears,
    # shifted by -1 and +1
    positions = np.flatnonzero((df['token'] == oov_token).to_numpy())
    anchors = np.concatenate((positions - 1, positions + 1))

    # From each anchor, look left and right until a word with an embedding
    # has been found. Anchors falling outside the dataframe (-1 or n_tokens)
    # simply yield an empty range on one side.
    neighbor_embeddings = []
    for anchor in anchors:
        for direction in (range(anchor - 1, -1, -1),
                          range(anchor + 1, n_tokens)):
            for i in direction:
                tok = tokens[i]
                if tok in embeddings:
                    neighbor_embeddings.append(embeddings[tok])
                    break

    if not neighbor_embeddings:
        # Nothing to average: return zeros rather than the NaN that
        # np.mean would produce on an empty list
        return np.zeros(embeddings.vector_size)
    return np.mean(neighbor_embeddings, axis=0)


def add_oovs(oov_tokens: Set, df: pd.DataFrame,
             embeddings: KeyedVectors) -> KeyedVectors:
    """Return a copy of ``embeddings`` extended with estimates for the OOVs.

    Args:
        oov_tokens: tokens that have no vector in ``embeddings``.
        df: dataframe whose 'token' column provides the neighbors used to
            estimate each OOV vector.
        embeddings: embeddings to extend; left untouched.

    Returns:
        A new ``KeyedVectors`` holding the original vectors plus one
        estimated vector per OOV token.
    """
    # Clone the embedding (KeyedVectors does not have a clone method)
    emb_filled = deepcopy(embeddings)

    # Sort the tokens: the iteration order of a set of strings can change
    # from one interpreter session to the next, and the insertion order here
    # decides which index each new key gets. A stable order keeps the indexes
    # consistent between the session that trains a model and the one that
    # loads it back.
    keys = sorted(oov_tokens)
    if not keys:
        # Nothing to add
        return emb_filled

    # Estimate the OOV embeddings. All estimates are computed before any of
    # them is added, so OOV tokens never act as neighbors of each other.
    values = [compute_neighbor_mean(oov, df, emb_filled) for oov in keys]

    # Add the estimates to the embedding
    emb_filled.add_vectors(keys, values)
    return emb_filled

In [None]:
# V2 = union(GloVe, OOV1)
# where neighbors of OOV1 are taken from the training set
embeddings = add_oovs(oov1, ds_train, glove)

# V3 = union(V2, OOV2)
# where neighbors of OOV2 are taken from the validation set
embeddings = add_oovs(oov2, ds_val, embeddings)

# V4 = union(V3, OOV3)
# where neighbors of OOV3 are taken from the test set
embeddings = add_oovs(oov3, ds_test, embeddings)

# Test number of embeddings
print(f'Number of vectors in original GloVe:                  {len(glove)}')
print(
    f'Number of vectors after incremental addition of OOVs: {len(embeddings)}')


### 1.6 - Embedding Matrix

In [None]:
# Define the indexes used for word -> index -> embedding
word_index = {k: v + 1
              for k, v in embeddings.key_to_index.items()
              }  # +1 because index 0...
vocab_size = len(embeddings) + 1  # ...will be reserved to padding

# Define the embedding matrix
embedding_matrix = np.zeros(shape=(vocab_size, EMB_DIM))
for word, index in word_index.items():
    embedding_matrix[index] = embeddings[word]

In [None]:
# Quick test
assert np.all(embedding_matrix[embeddings.key_to_index['cat'] +
                               1] == embeddings['cat'])

### 1.7 - Data Conversion

The input data of our model could be either whole documents or single sentences contained in those documents; we will choose sentences as input data.

Tokens in each sentence will be converted to integer sequences and later fed into a static `Embedding` layer storing the matrix of GloVe encodings + OOVs, which will provide the input to our model.

The corresponding tags—i.e. the output of our model—will be instead one-hot encoded. The rationale behind this choice is that tags are purely categorical data, hence encoding them as integer sequences would inject a notion of ordering into the model, which however is not reflected in the original data.

In [None]:
# Utility function
# Utility function: keep the first element of every inner list
flatten_1d = lambda nested_list: [li[0] for li in nested_list]

# Convert tokens into sequences (their vocabulary indexes)
# The tokenizer is not fitted: its word index is replaced with the one built
# from the embeddings. Every dataframe row is passed as a separate text, so
# one list is produced per row and flatten_1d keeps its first index.
tokenizer = Tokenizer(filters='')
tokenizer.word_index = word_index
token_indexes = tokenizer.texts_to_sequences_generator(dataset['token'].array)
token_indexes = flatten_1d(token_indexes)

# Convert tags into sequences
# (as an intermediate step before one-hot encoding them)
# Tag indexes start from 1, hence the +1 in num_tags to account for index 0
tag_to_int = {k: v + 1 for v, k in enumerate(dataset['tag'].unique())}
num_tags = len(tag_to_int) + 1
# lower=False: tags must not be lowercased before the lookup
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.word_index = tag_to_int
tag_indexes = tokenizer.texts_to_sequences_generator(dataset['tag'].array)
tag_indexes = flatten_1d(tag_indexes)

# Augment dataset with new data
dataset['token_index'] = token_indexes
dataset['tag_index'] = tag_indexes

# Group dataset by 'sentence', aggregating remaining data into lists
ds_sentences = dataset.groupby(['document', 'sentence']).agg(list)
ds_sentences.head()

Let's take a look at the distribution of sentence length in the training + validation set (leaving the test set aside) to determine what is an appropriate padded-sequence size (for batching).

In [None]:
sentences_len = ds_sentences.query(
    f'document <= {TEST_DOC_LB-1}')['token'].transform(len)
len_quantile = sentences_len.quantile(.99)
print("99th percentile of sentence length in training + validation set:",
      len_quantile)

fig = px.histogram(sentences_len, labels={'value': 'Sentence Length'})
fig.add_vline(len_quantile,
              annotation_text="99th percentile",
              line_color='green')
fig.update_layout(showlegend=False)
fig.show()

The 99th percentile suggests trimming sentences that exceed 56 tokens and padding sentences with fewer tokens, in order to prevent the few outliers from causing sentence encodings to be wastefully long.

In [None]:
def pad(x):
    """Pad or truncate every sequence in ``x`` to ``int(len_quantile)`` items.

    Both padding and truncation are applied at the beginning of a sequence.
    """
    target_len = int(len_quantile)
    return pad_sequences(x,
                         maxlen=target_len,
                         padding='pre',
                         truncating='pre')


def get_model_data(df: pd.DataFrame, lb=0, ub=None):
    """Build the padded model inputs and one-hot targets for a document range.

    Args:
        df: sentence-level dataframe with a 'document' level/column and the
            list-valued columns 'token_index' and 'tag_index'.
        lb: index of the first document to include.
        ub: index of the last document to include (inclusive); ``None``
            means no upper bound.

    Returns:
        Tuple ``(toks, tags)``: the padded token indexes and the padded,
        one-hot encoded tag indexes.
    """
    # Compare against None explicitly, so that ub=0 is a valid upper bound
    if ub is None:
        condition = f'{lb} <= document'
    else:
        condition = f'{lb} <= document <= {ub}'
    df = df.query(condition)

    toks = pad(df['token_index'])
    tags = pad(df['tag_index'])
    # One-hot encode tags
    tags = to_categorical(tags, num_classes=num_tags)
    return toks, tags


# Build the data that will be fed to the model
x_train, y_train = get_model_data(ds_sentences, 0, TRAIN_DOC_UB)
x_val, y_val = get_model_data(ds_sentences, TRAIN_DOC_UB + 1, TEST_DOC_LB - 1)
x_test, y_test = get_model_data(ds_sentences, TEST_DOC_LB)

# Check shapes
print(f"""
X shapes [sentences x tokens]
    x_train.shape = {x_train.shape}
    x_val.shape   = {x_val.shape}
    x_test.shape  = {x_test.shape}

Y shapes [sentences x tags x one-hot-size]
    y_train.shape = {y_train.shape}
    y_val.shape   = {y_val.shape}
    y_test.shape  = {y_test.shape}
""")

### 1.8 - Class Imbalance Analysis
It is reasonable to expect tags to be non-uniformly distributed throughout our dataset, as some parts of speech are intrinsically more common than others in any natural language.

In [None]:
# Plot tag distribution
px.histogram(dataset, x='tag').show()

And indeed, as shown by the histogram above, the tag distribution in our dataset is clearly imbalanced. In order to counteract this phenomenon during training, we are going to assign a (positive) weight to each tag — where such weights get bigger as tags get less common — and then apply those weights to the loss function. In other words, our models are going to get "more rewarded" when they correctly predict an uncommon tag.

In [None]:
def get_sample_weigths(y: np.ndarray) -> np.ndarray:
    """Compute class weights for unbalanced label vector y.

    Weights follow the "balanced" heuristic
    ``n_elements / (n_classes * class_count)``, where the classes are the ones
    that actually occur in ``y``. Class 0 (padding) always gets weight 0.

    Args:
         y: array of shape (n_samples, sequence_length, one_hot_length)
            storing, for each sample, a sequence of one-hot encoded elements.

    Returns:
        Array of shape (n_samples, sequence_length) storing, for each sample,
        a sequence of weights which are inversely proportional to the number
        of occurrences of the corresponding sequence element in y.
    """
    y_int = np.argmax(y, axis=-1)
    flat = y_int.ravel()

    # `inverse` maps every element of `flat` to the position of its class
    # in the sorted `classes` array
    classes, inverse, counts = np.unique(flat,
                                         return_inverse=True,
                                         return_counts=True)
    class_weights = flat.size / (len(classes) * counts)

    # Ignore padding. Select it by class value rather than by position:
    # when y contains no padding, position 0 holds a real class.
    class_weights[classes == 0] = 0

    return class_weights[inverse.ravel()].reshape(y_int.shape)


train_sample_weights = get_sample_weigths(y_train)
val_sample_weights = get_sample_weigths(y_val)

## 2 - Model Definition

### 2.1 - Hyperparameters

In [None]:
# Hyperparameters shared by all models
model_hyperparams = dict(
    epochs=500,
    batch_size=256,
    mem_units=32,
    optimizer='adam',
    learning_rate=0.001,
    loss='categorical_crossentropy',
)

# Embedding layer hyperparameters — same for all models
embedding_hyperparams = dict(
    input_dim=vocab_size,
    output_dim=EMB_DIM,
    input_length=x_train.shape[-1],
    weights=[embedding_matrix],
    trainable=False,
)

### 2.2 - Architectures
Four model architectures will be compared, each with different combinations of recurrent and dense layers.

In [None]:
mu = model_hyperparams['mem_units']  # Number of memory units

# Baseline: Bidirectional LSTM + Dense layer
bi_lstm = Sequential([
    Embedding(**embedding_hyperparams),
    Bidirectional(LSTM(mu, return_sequences=True)),
    Dense(num_tags, activation='softmax'),
], 'BiLSTM-Dense')

# Bidirectional GRU + Dense layer
bi_gru = Sequential([
    Embedding(**embedding_hyperparams),
    Bidirectional((GRU(mu, return_sequences=True))),
    Dense(num_tags, activation='softmax'),
], 'BiGRU-Dense')

# Bidirectional LSTM + Bidirectional LSTM + Dense layer
double_bi_lstm = Sequential([
    Embedding(**embedding_hyperparams),
    Bidirectional(LSTM(mu, return_sequences=True)),
    Bidirectional(LSTM(mu, return_sequences=True)),
    Dense(num_tags, activation='softmax'),
], '2xBiLSTM-Dense')

# Bidirectional LSTM + Dense layer + Dense layer
double_dense = Sequential([
    Embedding(**embedding_hyperparams),
    Bidirectional(LSTM(mu, return_sequences=True)),
    Dense(2 * num_tags, activation='relu'),
    Dense(num_tags, activation='softmax'),
], 'BiLSTM-2xDense')

## 3 - Training
All four models will be trained for the same number of epochs and with the same hyperparameters (when comparable). During training, both accuracy and weighted accuracy will be monitored. When training ends, the only stored weights will be those corresponding to the epoch at which the model reached maximum weighted accuracy on the validation set.

In [None]:
# Set this to true to do an anonymous session
# and reproduce the results without logging into wandb
ANON_SESSION = False
RUN_TRAINING = False

group_name = time.strftime('%Y%m%d-%H%M')
models_history = dict()

# Train and validate all models, logging data in wandb
if RUN_TRAINING:
    trained_runs_ids = []

    for model in [bi_lstm, bi_gru, double_bi_lstm, double_dense]:

        # Initialize wandb with relevant info.
        # The hyperparameters are copied, so that the dictionary shared by
        # all models is not modified when the model name is added.
        config = {**model_hyperparams, 'model-name': model.name}
        wandb_params = dict(
            project='NLP-POS-Tagging',
            name=model.name,
            reinit=True,
            config=config,
            group=group_name,
        )

        if ANON_SESSION:
            run = wandb.init(anonymous='must', **wandb_params)
        else:
            run = wandb.init(entity='frantoman', **wandb_params)

        # Define a keras callback to save the model when it reaches
        # max validation accuracy. save_best_only is required for that:
        # without it the file is overwritten at the end of every epoch,
        # so the stored model would be the one of the last epoch.
        model_checkpoint = ModelCheckpoint(
            filepath=os.path.join(wandb.run.dir, f'{model.name}.h5'),
            monitor='val_acc',
            mode='max',
            save_best_only=True,
        )

        # Compile the model
        model.compile(
            optimizer=Adam(learning_rate=config['learning_rate']),
            loss=config['loss'],
            metrics=['acc'],
            # weighted_metrics=['acc'],
        )
        print(f'========= TRAINING MODEL: {model.name} {run.id}=========')
        with run:

            # Fit on data and run validation.
            # The commented-out arguments switch to the weighted training.
            history = model.fit(
                x_train,
                y_train,
                epochs=config['epochs'],
                batch_size=config['batch_size'],
                # validation_data=(x_val, y_val, val_sample_weights),
                validation_data=(x_val, y_val),
                # sample_weight=train_sample_weights,
                callbacks=[WandbCallback(save_model=False), model_checkpoint],
                verbose=0,
            )

            # Log the best validation accuracy and the (1-based) epoch
            # at which it was reached
            h = history.history
            info = dict(
                best_val_weighted_acc=np.max(h['val_acc']),
                best_weighted_epoch=np.argmax(h['val_acc']) + 1,
            )
            run.log(info)
            trained_runs_ids.append(run.id)
    print(trained_runs_ids)

In [None]:
# Free up some memory from previous models
del bi_lstm
del bi_gru
del double_bi_lstm
del double_dense
tf.keras.backend.clear_session()

## 4 - Evaluation

### 4.1 - Loading Trained Models
Once models have been trained and their "best" weights have been saved, we can load them back to evaluate their performance on unseen data.

In [None]:
acc_ids = ['1r268con', '13s0tew7', 'mszgpyn0', '36hs34rs']
weighted_acc_ids = ['3m0ggwu5', '3owjlhkw', '3qw2e1fu', '3st72i2q']


def load_models(ids: List[str]) -> Dict[str, tf.keras.Model]:
    """Download the models saved by the given wandb runs and load them.

    Args:
        ids: list of wandb run ids

    Returns:
        Mapping from the name of each loaded keras model to the model itself.
    """
    loaded = {}
    for run_id in ids:
        run = wandb.Api().run('frantoman/NLP-POS-Tagging/' + run_id)

        # The file of a run is named after the model name stored in its config
        model_name = run.config['model-name']
        print('Downloading', model_name)
        run.file(f'{model_name}.h5').download('./models/', replace=True)

        keras_model = load_model(f'./models/{model_name}.h5')
        loaded[keras_model.name] = keras_model
    return loaded


models_unweighted = load_models(acc_ids)
print('--------------------------')
models_weighted = load_models(weighted_acc_ids)

### 4.2 - Evaluation functions
Now we define some functions to evaluate the model on a dataset.

In [None]:
def to_dataframe(x, y, p) -> pd.DataFrame:
    """Convert input, ground truth and prediction of a model to a dataframe.

    Args:
        x: array of token indexes, one row per sentence, 0 meaning padding.
        y: one-hot encoded true tags, aligned with ``x``.
        p: predicted tag scores, aligned with ``x``.

    Returns:
        Dataframe with one row per non-padding token and columns 'token',
        'tag_true' and 'tag_pred' (the last two as categoricals). Tag indexes
        with no known tag are reported as 'PAD'.
    """
    # Define a mapping int->token and int->tag
    int_to_tok = {v: k for k, v in word_index.items()}
    int_to_tag = {v: k for k, v in tag_to_int.items()}

    # Convert one-hot encodings to integer indexes
    y = np.argmax(y, axis=-1)
    p = np.argmax(p, axis=-1)

    # Select all non-padding positions at once. Boolean indexing walks the
    # arrays row by row, so the sentence order is preserved, and it also
    # copes with rows that contain nothing but padding.
    x = np.asarray(x)
    pad_mask = x > 0
    tok_ids = x[pad_mask].tolist()
    true_ids = y[pad_mask].tolist()
    pred_ids = p[pad_mask].tolist()

    # Convert integer indexes to the corresponding token/tag
    # Note: the returned df will contain truncated sentences
    df = pd.DataFrame(
        dict(
            token=[int_to_tok.get(i) for i in tok_ids],
            tag_true=[int_to_tag.get(i, 'PAD') for i in true_ids],
            tag_pred=[int_to_tag.get(i, 'PAD') for i in pred_ids],
        ))
    df['tag_true'] = df['tag_true'].astype('category')
    df['tag_pred'] = df['tag_pred'].astype('category')
    return df


def classif_report(pred: pd.DataFrame) -> pd.DataFrame:
    """Compute the main classification metrics, ignoring punctuation.

    Args:
        pred: dataframe with columns 'tag_true' and 'tag_pred'.

    Returns:
        Dataframe built from the classification report (one row per entry
        of the report), with values rounded to 3 decimals.
    """
    # First, remove the rows whose true tag matches a punctuation pattern.
    # Raw string: the backslashes belong to the regular expression and are
    # not valid Python string escapes.
    punctuation = r"``|''|\,|\.|\:|-LRB-|-RRB-"
    is_punct = pred['tag_true'].str.contains(punctuation)
    pred = pred[~is_punct]

    # Then calculate the classification report, restricted to the tags
    # that still occur among the true tags
    cr = classification_report(
        pred['tag_true'],
        pred['tag_pred'],
        labels=pred['tag_true'].unique(),
        zero_division=0,
        output_dict=True,
    )
    df = pd.DataFrame(cr).transpose().round(3)
    return df


def evaluate_model(model, x, y):
    """Run ``model`` on ``x`` and score its predictions against ``y``.

    Returns:
        Tuple ``(df_pred, report, f1_macro)``: the token-level dataframe of
        predictions, its classification report, and the 'f1-score' found in
        the 'macro avg' row of that report.
    """
    predictions = model.predict(x)
    df_pred = to_dataframe(x, y, predictions)
    report = classif_report(df_pred)
    f1_macro = report.loc['macro avg', 'f1-score']
    return df_pred, report, f1_macro

### 4.2 - Weighted vs Unweighted class training
In the training phase we tried to address the problem of class imbalance by using sample weighting.  
Now we evaluate if the sample weighting has had any effect on the model's performance, by using F1-Macro on the validation set.

In [None]:
weight_evaluation = []
for models, note in [
    (models_unweighted, "Un-Weighted"),
    (models_weighted, "Weighted"),
]:
    for model in models.values():
        _, _, f1m = evaluate_model(model, x_val, y_val)
        weight_evaluation.append(
            dict(model_name=model.name,
                 notes=note,
                 f1_macro_val=f1m,
                 model=model))

weight_evaluation = pd.DataFrame(weight_evaluation).sort_values(
    by=['notes', 'f1_macro_val'], ascending=False)
display(weight_evaluation.loc[:, weight_evaluation.columns != 'model'])

Surprisingly, there is not much difference between the weighted and unweighted training. Nevertheless, a slight increase in F1-Score is present. TODO

### 4.3 - Test Set Performance
In the following, we are going to evaluate the test set performance of the two models that reached the highest f1 score on the validation set.

In [None]:
# Pick the two models with the highest validation F1 overall.
# weight_evaluation is sorted by 'notes' first (for display purposes), so its
# first two rows are not guaranteed to be the two best models: re-sort by F1.
best_models_eval = weight_evaluation.sort_values(
    by='f1_macro_val', ascending=False).iloc[:2].copy().reset_index(drop=True)
best_models_eval['f1_macro_test'] = [
    evaluate_model(model, x_test, y_test)[2]
    for model in best_models_eval.model.values
]
display(best_models_eval.loc[:, best_models_eval.columns != 'model'])

The test set performance exceeds the expectations with respect to the validation set.

## 5 - Error Analysis
In this section we analyze the errors made on the test set by the best model.

In [None]:
# Take the first row *after* sorting (positional access): a label-based
# `.loc[0]` would return the row labelled 0 wherever the sort placed it
best_model = best_models_eval.sort_values(
    by=['f1_macro_val'], ascending=False)['model'].iloc[0]
test_pred, report, _ = evaluate_model(best_model, x_test, y_test)

### 5.1 - Problematic classes

In [None]:
metric = 'f1-score'
# Drop the last three rows of the report
# (presumably the aggregate rows, e.g. the averages — verify on the output)
class_f1 = report[:-3].sort_values(by=metric)
# Express the support as a fraction of the total and flip its sign,
# so that it gets plotted on the opposite side of the f1 bars
class_f1['support'] = -class_f1['support'] / sum(
    class_f1['support'])  # Norm+sign
fig = px.histogram(class_f1, x=class_f1.index, y=[metric, 'support'])
fig.show()

It seems that the F1 scores of the classes are not correlated with their support. Moreover, classes with high support such as `NN` and `NNP` are not the best scoring, while the worst class of all is `NNPS`.

In [None]:
def plot_confusion_matrix(df: pd.DataFrame, threshold: float = 1.0):
    """Plot the row-normalized confusion matrix of the problematic tags.

    Args:
        df: dataframe with columns 'tag_true' and 'tag_pred'.
        threshold: only the tags whose diagonal value (fraction of correctly
            predicted occurrences) is at most this value are displayed.
    """
    # Extract true tags and predictions
    corr, pred = df['tag_true'], df['tag_pred']
    # Get tag names; sorted so that the axes have the same order on every
    # run (the iteration order of a set of strings is not stable)
    labels = sorted(set(corr) | set(pred))

    # Compute confusion matrix
    conf = confusion_matrix(corr, pred, labels=labels, normalize='true')

    # Threshold the diagonal, to keep only problematic classes
    diag = conf.diagonal() <= threshold
    conf = conf[np.ix_(diag, diag)]
    labels = np.array(labels)[diag]

    # Plot matrix
    fig = px.imshow(conf,
                    x=labels,
                    y=labels,
                    labels={
                        'x': 'Predicted Tag',
                        'y': 'True Tag',
                        'color': 'Value'
                    })
    fig.update(layout_coloraxis_showscale=False)
    fig.update_layout(width=600, height=600)
    fig.show()


plot_confusion_matrix(test_pred, threshold=0.8)

### 5.2 - OOV Words
Let's see whether OOV words have an impact on the F1-macro score.

In [None]:
def get_f1_tok_subset(df, tokens):
    """Return the macro-averaged F1 of the rows whose token is NOT in ``tokens``.

    Args:
        df: dataframe with columns 'token', 'tag_true' and 'tag_pred'.
        tokens: container of the tokens to leave out of the evaluation.
    """

    def outside_subset(tok):
        return tok not in tokens

    remaining = df[df['token'].apply(outside_subset)]
    report = classif_report(remaining)
    return report.loc['macro avg', 'f1-score']


# get_f1_tok_subset keeps the tokens that are NOT in the given collection,
# so each line has to pass the vocabulary named by its label
# (V2 = GloVe + OOV1, V3 = V2 + OOV2), not the OOV sets themselves.
print(f"""
F1-Macro on test set:
  - All tokens:           {get_f1_tok_subset(test_pred, [])}
  - Tokens not in GloVe:  {get_f1_tok_subset(test_pred, glove_keys)}
  - Tokens not in V2:     {get_f1_tok_subset(test_pred, v2)}
  - Tokens not in V3:     {get_f1_tok_subset(test_pred, v3)}
""")

TODO: write something about this ^