# Assignment 1: Part Of Speech tagging

## Imports

In [None]:
# Main framework
from tensorflow import keras

# Data packages
import numpy as np
import pandas as pd

# System packages
import glob
import random

# File management
import requests
import zipfile
import io
import pathlib

# Notebook visualization
from IPython.core.display import display, HTML

# Seed initialization
random.seed(0)

# Typing
from typing import Set

# For GloVe wrapper
!pip install gensim -U
import gensim
from gensim import downloader as gensloader
from gensim.models.keyedvectors import KeyedVectors

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

## 1 - Data Pipeline

### 1.1 - Data loading
First, we load the dataset and store it into a dataframe.

In [None]:
# Directory that load_dataset() checks for the treebank's *.dp files
DATASET_PATH = './dependency_treebank'  # Change if dataset already present locally
# Zip archive that load_dataset() downloads when DATASET_PATH does not exist
DATASET_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load the dependency treebank into a token-level dataframe.

    If ``ds_path`` does not exist, the zip archive at ``ds_url`` is downloaded
    and extracted next to it (the archive is assumed to contain a top-level
    folder named like the last component of ``ds_path`` -- TODO confirm if the
    path is customized).

    Every ``*.dp`` file found in ``ds_path`` is treated as one document.
    Files are visited in sorted name order; within a file, each non-blank
    line is a tab-separated row whose first two columns are the token and its
    tag, and blank lines separate sentences.

    Args:
        ds_path: Directory containing (or that will contain) the ``*.dp`` files.
        ds_url: URL of the zipped dataset, used only when ``ds_path`` is missing.

    Returns:
        A dataframe with columns ``document`` (0-based file index),
        ``sentence`` (0-based sentence index, global across documents),
        ``token`` and ``tag``.

    Raises:
        requests.HTTPError: If the download is attempted and the server
            answers with an error status.
    """
    ds_dir = pathlib.Path(ds_path)

    # Check if dataset is already present, otherwise download it
    if not ds_dir.exists():
        response = requests.get(ds_url, timeout=60)
        # Fail loudly here instead of handing an error page to ZipFile
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(ds_dir.parent)

    # Convert each row of each document into a flat list of records
    raw_df = []
    sentence_idx = 0
    for doc_idx, file_name in enumerate(sorted(glob.glob(f"{ds_path}/*.dp"))):
        sentence_open = False  # True while the current sentence has tokens
        with open(file_name, encoding='utf-8') as f:
            for line in f:
                row = line.rstrip('\n')
                if not row.strip():
                    # Blank line: close the running sentence (if any)
                    if sentence_open:
                        sentence_idx += 1
                        sentence_open = False
                    continue
                cols = row.split('\t')[:2]  # Ignore the last column
                raw_df.append([doc_idx, sentence_idx, *cols])
                sentence_open = True
        # A document may end without a trailing blank line: close its last
        # sentence so it never shares an index with the next document
        if sentence_open:
            sentence_idx += 1

    # Finally, convert the nested list into a pandas dataframe
    return pd.DataFrame(raw_df, columns=['document', 'sentence', 'token', 'tag'])


# Build the token-level dataframe and preview its first rows
dataset = load_dataset(DATASET_PATH, DATASET_URL)
dataset.head()

### 1.2 - GloVe loading
Then, we load the GloVe embeddings (GloVe-50, to be precise).

In [None]:
# Model name handed to gensim's downloader when no local copy exists
GLOVE_TYPE = 'glove-wiki-gigaword-50'
# Local file that load_glove() reads the vectors from and saves them to
GLOVE_FILE = './glove/glove-wiki-gigaword-50.kv'


def load_glove(gl_file: str, gl_type: str) -> KeyedVectors:
    """Return the GloVe vectors, caching them on disk at ``gl_file``.

    When ``gl_file`` exists it is loaded directly. Otherwise the model named
    ``gl_type`` is fetched through gensim's downloader, saved to ``gl_file``
    (creating the parent directories if needed) and returned.
    """
    cache_file = pathlib.Path(gl_file)

    if not cache_file.exists():
        # No local copy yet: download the vectors and store them for next time
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        vectors = gensloader.load(gl_type)
        vectors.save(gl_file)
    else:
        # Reuse the local copy
        vectors = gensim.models.KeyedVectors.load(gl_file)

    return vectors


# Load the embeddings from GLOVE_FILE, downloading GLOVE_TYPE if the file is missing
glove = load_glove(GLOVE_FILE, GLOVE_TYPE)

In [None]:
# Test GloVe loading: print the vector stored for a common word
print(f'cat = {glove["cat"]}')

### 1.3 - Data visualization
One of the most important ML tasks is getting familiar with the data in order to gain deeper insight into its structure and nature.

To do so, we define a function that displays tokens with their POS tags in a human-friendlier way.

In [None]:
# NOTE: this could be put in its own file to keep things clean,
#       but to upload just one notebook we instead code-golfed a bit :)

# Define a mapping between POS tags, their meaning and some colors.
# Each entry is `tag -> (meaning, hex color)`; tags sharing a color belong to
# the same broad family (function words, nouns, adjectives, verbs, ...).
from collections import defaultdict

tag_map = {
    'CC': ('Coordin. Conjunction', '#c18401'),
    'TO': ('“to”', '#c18401'),
    'DT': ('Determiner', '#c18401'),
    'UH': ('Interjection', '#c18401'),
    'EX': ('Existential ‘there', '#c18401'),
    'MD': ('Modal can', '#c18401'),
    'LS': ('List item marker', '#c18401'),
    'IN': ('Preposition/sub-conj', '#c18401'),
    'CD': ('Cardinal number', '#282828'),
    'FW': ('Foreign word', '#282828'),
    'NN': ('Noun, singular/mass', '#282828'),
    'NNS': ('Noun, plural', '#282828'),
    'NNP': ('Proper noun, singul.', '#282828'),
    'NNPS': ('Proper noun, plural', '#282828'),
    'JJ': ('Adjective', '#50a14f'),
    'JJR': ('Adj. comparative ', '#50a14f'),
    'JJS': ('Adj. superlative ', '#50a14f'),
    'VB': ('Verb, base form', '#e45649'),
    'VBD': ('Verb, past tense ', '#e45649'),
    'VBG': ('Verb, gerund ', '#e45649'),
    'VBN': ('Verb, past particip. ', '#e45649'),
    'VBP': ('Verb, non-3sg pres', '#e45649'),
    'VBZ': ('Verb, 3sg pres ', '#e45649'),
    'WDT': ('Wh-determiner', '#4078f2'),
    'WP': ('Wh-pronoun', '#4078f2'),
    'WP$': (' Possessive wh-', '#4078f2'),
    'WRB': ('Wh-adverb how', '#4078f2'),
    'PDT': ('Predeterminer ', '#4078f2'),
    'POS': ('Possessive ending', '#4078f2'),
    # Legacy spellings of the pronoun tags, kept for backward compatibility
    'PP': ('Personal pronoun', '#4078f2'),
    'PP$': (' Possessive pronoun ', '#4078f2'),
    # Penn Treebank spellings of the pronoun tags
    'PRP': ('Personal pronoun', '#4078f2'),
    'PRP$': ('Possessive pronoun', '#4078f2'),
    'RB': ('Adverb', '#a626a4'),
    'RBR': ('Adverb, comparative', '#a626a4'),
    'RBS': ('Adverb, superlative', '#a626a4'),
    'RP': ('Particle', '#a626a4'),
}
# Unknown tags (e.g. punctuation) fall back to an empty meaning and a neutral color
tag_map = defaultdict(lambda: ('', '#282828'), tag_map)


def _pos_css_class(tag) -> str:
    """Map an arbitrary POS tag to a valid, unique CSS class name."""
    # Tags such as '.', ',', '$' or "''" are not valid CSS identifiers, so every
    # non-alphanumeric character is replaced by '_' + its hex code point.
    return 'pos-' + ''.join(ch if ch.isalnum() else f'_{ord(ch):x}'
                            for ch in str(tag))


def display_pos_tagging(tokens: pd.Series,
                        predicted_tags: pd.Series,
                        correct_tags: pd.Series = None,
                        limit=1000):
    """Display tokens colored by POS tag in the notebook cell output.

    Each token is colored according to ``tag_map`` and shows its tag and the
    tag meaning in a tooltip on hover. When ``correct_tags`` is given, tokens
    whose predicted tag differs from the correct one are underlined. A line
    break is inserted after every token tagged ``'.'``.

    Args:
        tokens: Tokens to display.
        predicted_tags: Tag assigned to each token, aligned with ``tokens``.
        correct_tags: Optional gold tags, aligned with ``tokens``; when
            omitted no error highlighting is performed.
        limit: Maximum number of leading tokens to display.
    """
    from html import escape

    # If no correct tags are passed, we ignore the "error highlighting"
    if correct_tags is None:
        correct_tags = predicted_tags

    # Limit the inputs
    tokens = tokens[:limit]
    predicted_tags = predicted_tags[:limit]
    correct_tags = correct_tags[:limit]

    # Iterate through tokens and tags, generating styled HTML based on the tags
    html_sequence = []
    for token, tag, correct in zip(tokens, predicted_tags, correct_tags):
        tag_meaning = tag_map[tag][0]
        err = 'pos-error' if tag != correct else ''
        # Tokens and tags come from the corpus and may contain '<', '&', ...
        h = (f'<div class="token {_pos_css_class(tag)} {err}">'
             f'{escape(str(token))} '
             f'<span class="tag">[{escape(str(tag))}] {escape(tag_meaning)}'
             f'</span></div>')
        if tag == '.':
            # <div> is not a void element: close it explicitly so the
            # following tokens are not nested inside the separator
            h += '<div class="separator"></div>'
        html_sequence.append(h)
    html_body = '<div class="pos-visualizer">'
    html_body += ' '.join(html_sequence) + '</div>'

    # Generate the style (WARNING: CSS lies ahead)
    html_style = """
    <style>
    .pos-visualizer { padding: 32px; background-color: #FEFEFE; border-left:solid 1px grey;}
    .token { position:relative; display:inline-block; font-size:16px;}
    .token .tag {
        visibility:hidden; text-align:center; position:absolute;
        width: 160px; background-color: #282828; color: #fff; border-radius: 6px;
        z-index: 1; bottom: 100%; left: 50%; margin-left:-80px; font-size:12px;
    }
    .pos-error { text-decoration: underline solid #F94144;}
    .separator { margin-top:12px }
    .token:hover .tag { visibility:visible }
    """
    # One color rule per distinct predicted tag (first-seen order)
    html_style += '\n'.join(
        f'.pos-visualizer .{_pos_css_class(tag)} {{color:{tag_map[tag][1]};}}'
        for tag in dict.fromkeys(predicted_tags))
    html_style += '</style>'

    # Display the HTML in the cell's output
    display(HTML(html_style + html_body))

In [None]:
# Display some sample POS tagging (hover on text for tag meaning)
# Work on a copy so that the tags stored in `dataset` are left untouched
predicted_example = dataset['tag'].copy()
predicted_example[0:8] = 'CD'  # Wrong prediction example for the first 8 words
# The dataset tags are passed as `correct_tags`, so mismatches get underlined
display_pos_tagging(dataset['token'],
                    predicted_example,
                    dataset['tag'],
                    limit=120)

### 1.4 - Pre-processing
Our dataset is already relatively clean; however, one point worth considering is how to handle lowercase conversion. Some tokens in our dataset are intrinsically capitalized (e.g. proper nouns, the personal pronoun "I"), whereas others are capitalized only because they follow a period, i.e. they open the sentence they occur in.

One might think of converting a token to lowercase based on its tag (e.g. if a token is a proper noun, keep it capitalized); however, to be fair, this could only be done on the training set, since in a real scenario test-set tags would be unknown.

Anyway, all these considerations hold only if GloVe contains embeddings of capitalized words; if that's not the case, every word we keep as capitalized will be classified as OOV when matched with GloVe, even when their lowercase embedding actually exists.

As it turns out, GloVe does not encode capitalized words:


In [None]:
# Count the GloVe vocabulary entries whose first character is uppercase
num_capitalized = sum(1 for word in glove.key_to_index if word[0].isupper())

print(f'GloVe-50 encodes {num_capitalized} capitalized words')

Therefore, we will be forced to convert all tokens to lowercase.

We are also interested to see which "special" tokens are encoded in GloVe, i.e. punctuation, quotation marks, and tokens such as "-LRB-" and "-RRB-", which in our dataset replace "(" and ")", respectively.

As it turns out, GloVe contains every special symbol we care about, except for tokens reserved to brackets:

In [None]:
# Single-character symbols, followed by the multi-character quotation marks,
# dashes and Penn Treebank bracket placeholders
special_tokens = list(',.:;"`$#£!%/?^-()[]{}_') + [
    "''", "``", "--",
    "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-",
]

# Report every special token that has no GloVe embedding
missing_tokens = [st for st in special_tokens if st not in glove]
for st in missing_tokens:
    print(f"GloVe does not contain token {st}")

Based on the previous considerations, we convert all tokens to lowercase and replace "-LRB-"-like symbols with the corresponding bracket:

In [None]:
# Penn Treebank bracket placeholders and the literal bracket each stands for
ptb_brackets = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-LSB-': '[',
    '-RSB-': ']',
    '-LCB-': '{',
    '-RCB-': '}',
}

# Swap the placeholders first: they are uppercase, so this must happen before
# the lowercase conversion below
for placeholder, literal in ptb_brackets.items():
    is_placeholder = dataset['token'] == placeholder
    dataset.loc[is_placeholder, 'token'] = literal

# Lowercase every token
dataset['token'] = dataset['token'].str.lower()

### 1.5 - Splitting
After pre-processing the data, we can finally split the dataset into train, validation and test.

In [None]:
# Documents 0-99 go to the training set, 100-149 to the validation set and
# all the remaining ones to the test set
doc_ids = dataset['document']
ds_train = dataset[doc_ids < 100]
ds_val = dataset[(doc_ids >= 100) & (doc_ids <= 149)].reset_index()
ds_test = dataset[doc_ids > 149].reset_index()


def print_split(df):
    """Summarize a split as '<n> documents, <m> tokens'."""
    return f"{df['document'].nunique()} documents, {len(df)} tokens"


print(f"""Dataset split: 
    TRAIN: {print_split(ds_train)}
    VALIDATION: {print_split(ds_val)}
    TEST: {print_split(ds_test)}
""")

### 1.6 - OOV Handling

#### 1.6.1 - OOV Analysis
First of all, let us take a look at how many Out-Of-Vocabulary tokens (w.r.t. GloVe) our dataset contains:

In [None]:
def get_oov(tokens, embedding_keys):
    """Return the set of ``tokens`` that do not appear in ``embedding_keys``."""
    return set(tokens).difference(embedding_keys)

# NOTE / TODO: gensim 3 uses glove.vocab instead of .key_to_index.keys()
glove_keys = glove.key_to_index.keys()

# OOV tokens of each split, each measured against plain GloVe
oov_train, oov_val, oov_test = (
    get_oov(split['token'].unique(), glove_keys)
    for split in (ds_train, ds_val, ds_test)
)


def print_oov(s, d):
    """Format OOV set `s` as '<count> [<share of the unique tokens of d>%]'."""
    n_unique = len(d['token'].unique())
    return f"{len(s)} [{len(s) / n_unique * 100:.2f}%]"


print(f"""Number of OOV tokens in dataset:
    TRAIN: {print_oov(oov_train, ds_train)}
    VALIDATION: {print_oov(oov_val, ds_val)}
    TEST: {print_oov(oov_test, ds_test)}
""")

However, from an experimental perspective, we should consider "incremental" OOV words, i.e. how OOVs would actually be identified in a real world scenario:
* **Training OOVs:** training-set tokens which are not found in GloVe.
* **Validation OOVs:** validation-set tokens which are not found in `union(`GloVe, Training OOVs`)`.
* **Test OOVs:** test-set tokens which are not found in `union(`GloVe, Training OOVs, Validation OOVs`)`.

In [None]:
# OOVs that each split adds on top of the splits processed before it
incremental_oov_val = oov_val - oov_train
incremental_oov_test = oov_test - (oov_val | oov_train)
all_oov = oov_train | oov_val | oov_test

print(
    f"""Number of OOV tokens in dataset, considering INCREMENTAL OOV EMBEDDING:
    TRAIN: {print_oov(oov_train, ds_train)}
    VALIDATION: {print_oov(incremental_oov_val, ds_val)}
    TEST: {print_oov(incremental_oov_test, ds_test)}
    === TOTAL: {print_oov(all_oov, dataset)}
""")

#### 1.6.2 - Adding OOVs to GloVe
We can now add OOV tokens to the GloVe vocabulary. Many strategies can be adopted to encode OOVs as vectors:
1. Static embeddings with the same vector for all OOV tokens (e.g. zeros).
2. Random embeddings. 
3. Computing an embedding as some statistic involving neighboring tokens (e.g. their mean).

Two observations can guide us in the choice of an embedding strategy:
* OOV tokens are not negligible (about 6% of the *total* dataset)
* Our GloVe embeddings will not undergo further training, therefore fixed or random embedding values will not be refined during the training process.

For the two reasons above, given an OOV token, we will compute its embedding as the mean of its left and right neighbors across all its occurrences throughout the dataset:


In [None]:
def compute_neighbor_mean(oov_token: str, df: pd.DataFrame,
                          embeddings: "KeyedVectors") -> np.ndarray:
    """Estimate an embedding for ``oov_token`` from its embedded neighbors.

    For every occurrence of ``oov_token`` in ``df['token']``, the closest
    token on its left and the closest token on its right that are present in
    ``embeddings`` are looked up (tokens without an embedding are skipped).
    The result is the element-wise mean of all the vectors collected this way.

    Args:
        oov_token: Token to estimate an embedding for.
        df: Dataframe with a ``token`` column, in reading order.
        embeddings: Mapping supporting ``tok in embeddings`` and
            ``embeddings[tok]``; ``vector_size`` is read only for the
            fallback below.

    Returns:
        The mean neighbor vector, or a zero vector of size
        ``embeddings.vector_size`` when no embedded neighbor exists (e.g. the
        token never occurs in ``df``).
    """
    tokens = df['token'].to_numpy()
    # Work with positions rather than index labels, so the function also works
    # on dataframes whose index does not start at 0
    positions = np.flatnonzero((df['token'] == oov_token).to_numpy())

    # For each occurrence, look left and right until a word with an embedding
    # has been found
    neighbor_embeddings = []
    for pos in positions:
        for direction in (range(pos - 1, -1, -1), range(pos + 1, len(tokens))):
            for i in direction:
                tok = tokens[i]
                if tok in embeddings:
                    neighbor_embeddings.append(embeddings[tok])
                    break

    if not neighbor_embeddings:
        # np.mean of an empty list would silently produce a scalar NaN
        return np.zeros(embeddings.vector_size)
    return np.mean(neighbor_embeddings, axis=0)


def fill_oov_embeddings(oov_tokens: Set, df: pd.DataFrame,
                        embeddings: KeyedVectors) -> KeyedVectors:
    """Return a copy of ``embeddings`` extended with estimates for ``oov_tokens``.

    The vector of each OOV token is computed by ``compute_neighbor_mean`` on
    ``df``. All estimates are computed before any of them is added, so OOV
    tokens never contribute to each other's estimate. ``embeddings`` itself
    is not modified.

    Args:
        oov_tokens: Tokens to add to the embedding.
        df: Dataframe with a ``token`` column in which the tokens occur.
        embeddings: Embedding to extend.

    Returns:
        A deep copy of ``embeddings`` that also contains one vector per
        token in ``oov_tokens``.
    """
    # Clone the embedding (KeyedVectors does not have a clone method)
    from copy import deepcopy
    emb_filled = deepcopy(embeddings)

    # Set iteration order can change between interpreter runs; sorting keeps
    # the order (and thus the index) of the added keys reproducible
    keys = sorted(oov_tokens, key=str)
    if not keys:
        # Nothing to add: avoid handing an empty batch to add_vectors
        return emb_filled

    # Estimate the OOV embeddings
    values = [compute_neighbor_mean(oov, df, emb_filled) for oov in keys]

    # Add the estimates to the embedding
    emb_filled.add_vectors(keys, values)
    return emb_filled


# Fill the embeddings incrementally: each split starts from the embedding of
# the previous one and only adds the OOV tokens not already covered by it
emb_train = fill_oov_embeddings(oov_train, ds_train, glove)
emb_val = fill_oov_embeddings(oov_val - oov_train, ds_val, emb_train)
emb_test = fill_oov_embeddings(oov_test - oov_val - oov_train, ds_test, emb_val)

In [None]:
# Test embedding dimensions: the number of keys should grow from one embedding
# to the next, while the vector dimension stays the same
dim = len(glove['cat'])
print(f'glove.shape     = ({len(glove)}, {dim})')
print(f'emb_train.shape = ({len(emb_train)}, {dim})')
print(f'emb_val.shape   = ({len(emb_val)}, {dim})')
# This line reports emb_test, so it is labelled accordingly
print(f'emb_test.shape  = ({len(emb_test)}, {dim})')