# Assignment 1: Part Of Speech tagging

In [None]:
# Data packages
import numpy as np
import pandas as pd

# System packages
import os
import glob
import random

# File management
import requests
import zipfile
import io

# Types and type-annotations
from typing import List, Dict, Tuple
from collections import OrderedDict

# To store vocabulary as .json
!pip install simplejson
import simplejson as sj

# Notebook visualization
from IPython.core.display import display, HTML

# Seed initialization
random.seed(0)

## Data Pipeline

### Data Loading
First we load the data (downloading it if not present), and store it into a dataframe.

In [None]:
# Local directory that is searched for the treebank's `.dp` files
DATASET_PATH = "./dependency_treebank"
# Zip archive that is downloaded and extracted when DATASET_PATH does not exist
DATASET_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load the treebank into a token-level dataframe, downloading it if missing.

    Every `*.dp` file found in `ds_path` (in sorted file-name order) is one
    document. Each non-empty row of a file is a tab-separated record whose
    first two fields are kept as token and tag; an empty row closes the
    current sentence.

    Args:
        ds_path: directory holding the `.dp` files. If it does not exist, the
            archive at `ds_url` is downloaded and extracted next to it.
        ds_url: URL of the zip archive to download when `ds_path` is missing.

    Returns:
        A dataframe with one row per token and the columns 'document',
        'sentence', 'token' and 'tag'. Sentence indices are global (they keep
        growing across documents) and are never shared by two documents.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
    """
    # Check if dataset is already present, otherwise download it
    if not os.path.isdir(ds_path):
        response = requests.get(ds_url)
        # Fail with a clear HTTP error instead of an obscure BadZipFile later
        response.raise_for_status()
        # The archive is assumed to contain a top-level folder named like the
        # last component of ds_path, so it is extracted into the parent folder
        # (the current directory for the default "./dependency_treebank").
        target_dir = os.path.dirname(os.path.normpath(ds_path)) or '.'
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(target_dir)

    # Convert each row of each document into a [document, sentence, token, tag] list
    raw_rows = []
    sentence_idx = 0
    file_names = sorted(glob.glob(os.path.join(ds_path, '*.dp')))
    for doc_idx, file_name in enumerate(file_names):
        with open(file_name, encoding='utf-8') as f:
            doc = f.read()

        open_sentence = False  # True while the current sentence has tokens
        for row in doc.split('\n'):
            cols = row.split('\t')[:2]  # Ignore the last column
            if cols == ['']:
                sentence_idx += 1
                open_sentence = False
            else:
                raw_rows.append([doc_idx, sentence_idx, *cols])
                open_sentence = True

        # A file that does not end with a newline leaves its last sentence
        # open: close it so it is not merged with the next document's first one
        if open_sentence:
            sentence_idx += 1

    # Finally, convert the nested list into a pandas dataframe
    return pd.DataFrame(raw_rows,
                        columns=['document', 'sentence', 'token', 'tag'])


# Build the token-level dataframe (the corpus is downloaded on the first run)
dataset = load_dataset(ds_path=DATASET_PATH, ds_url=DATASET_URL)
dataset[dataset['document'] < 1]  # Display the first document

### Data Pre-Processing
The dataset does not need much of a cleanup: the only pre-processing we need to perform is converting tokens to lowercase, so that we can create a vocabulary without ending up with two entries for the same word.

However, we need to distinguish between words that are inherently capitalized (e.g. proper nouns) and those that are so just because they follow a period.

First of all, let us check which kinds of tags produce capitalized words, and how many those words are for each tag:

In [None]:
# Keep the tokens whose first character is uppercase, then count them per tag
starts_upper = dataset['token'].str[0].str.isupper()
capitalized_tokens = dataset[starts_upper]
capitalized_tokens.groupby('tag').size().reset_index(name='capitalized counts')

The most meaningful tags are (see [here](https://sites.google.com/site/partofspeechhelp/)):
* `NNP`: Proper Nouns (Singular)
* `NNPS`: Proper Nouns (Plural)
* `PRP`: Personal Pronouns

The weird ones are:
* `$`: Dollar mark
* `,`: Non-full stop break punctuation marks for the sentence

Proper nouns are always capitalized, and we should probably leave them as such, since in this case capitalization and tag are tightly linked to each other.

Personal pronouns are meaningfully capitalized only in the case of "I", which is capitalized no matter where it occurs in a sentence. We should therefore keep "I" as it is and convert the other pronouns to lowercase.

When is `$` capitalized?

In [None]:
# List the distinct tokens labelled with the '$' tag and how often each occurs
dollar_rows = dataset[dataset['tag'].eq('$')]
dollar_rows.groupby('token').size().reset_index(name='"$"-tag counts')


The `$` tag is attached not only to the dollar symbol, but also to "C\$" (Canadian dollars) and "US$" (United States dollar), which are capitalized strings. It might make sense to leave them uppercase, as they are in some sense a label denoting a special symbol. In any case, there are just 6 of them in the whole dataset, so it probably does not matter that much.

When is `,` capitalized?

In [None]:
# Capitalized tokens first, then only those labelled with the ',' tag
is_capitalized = dataset['token'].str[0].str.isupper()
tmp = dataset.loc[is_capitalized]
tmp.loc[tmp['tag'].eq(',')]

No idea what "Wa" means, possibly a typo/labeling mistake. Anyway, being a single instance, it really does not matter.

### Data Splitting

In [None]:
# Split by document index: 0-99 train, 100-149 validation, 150 and above test
doc_ids = dataset['document']
train_ds = dataset[doc_ids < 100]
validation_ds = dataset[(doc_ids >= 100) & (doc_ids <= 149)]
test_ds = dataset[doc_ids > 149]


def print_split(df):
    """Summarize a split as '<n> documents, <m> samples'."""
    n_documents = df.groupby('document').ngroups
    n_samples = len(df)
    return f"{n_documents} documents, {n_samples} samples"


split_report = "\n".join([
    "Dataset split: ",
    f"    TRAIN: {print_split(train_ds)}",
    f"    VALIDATION: {print_split(validation_ds)}",
    f"    TEST: {print_split(test_ds)}",
    "",
])
print(split_report)

### Vocabulary Creation
> TODO: lowercase conversion before vocabulary creation.

In [None]:
def build_vocabulary(
        df: pd.DataFrame) -> Tuple[Dict[int, str], Dict[str, int], List[str]]:
    """Build the token vocabulary of a dataset.

    Tokens are numbered in order of first appearance, starting from 1 so that
    index 0 stays free for the padding token.

    Args:
        df: dataset, assumed to have a 'token' column.

    Returns:
        idx_to_tok: token vocabulary, i.e. from index to token.
        tok_to_idx: inverse token vocabulary, i.e. from token to index.
        tok_listing: list of the unique tokens, in index order.
    """
    # dict.fromkeys drops duplicates while keeping the first-appearance order
    unique_tokens = list(dict.fromkeys(df['token']))

    index_to_token = OrderedDict(enumerate(unique_tokens, start=1))
    token_to_index = OrderedDict(
        (token, index) for index, token in index_to_token.items())

    return index_to_token, token_to_index, unique_tokens


# NOTE(review): the vocabulary is built from the full dataset, i.e. it also
# covers the tokens of the validation and test documents - confirm this is intended
idx_to_tok, tok_to_idx, tok_listing = build_vocabulary(dataset)

Once the vocabulary is built, we perform some sanity checks:

In [None]:
# The three views of the vocabulary must have the same size
vocab_size = len(idx_to_tok)
assert vocab_size == len(tok_to_idx)
assert vocab_size == len(tok_listing)

# Indices must run from 1 to vocab_size, and the two mappings must be inverses
for idx in range(1, vocab_size + 1):
    tok = idx_to_tok[idx]
    assert tok in tok_to_idx
    assert tok_to_idx[tok] == idx

And then save the vocabulary for a more detailed inspection:

In [None]:
# Write the index -> token mapping as indented JSON in the working directory
vocab_path = os.path.join(os.getcwd(), 'vocab.json')
with open(vocab_path, mode='w') as vocab_file:
    sj.dump(idx_to_tok, vocab_file, indent=4)

## Document visualization
To gain more insight into the dataset (and, later on, into the classified words), we can visualize it in a human-readable way.

In [None]:
# Define a mapping between POS tags, their meaning and some colors.
# Each value is a (meaning, hex color) pair; tags that are missing from the
# mapping (e.g. punctuation) are handled by the callers through `.get` defaults.
tag_map = {
    'CC': ('Coordin. Conjunction', '#c18401'),
    'TO': ('“to”', '#c18401'),
    'DT': ('Determiner', '#c18401'),
    'UH': ('Interjection', '#c18401'),
    'EX': ('Existential ‘there', '#c18401'),
    'MD': ('Modal can', '#c18401'),
    'LS': ('List item marker', '#c18401'),
    'IN': ('Preposition/sub-conj', '#c18401'),
    'CD': ('Cardinal number', '#282828'),
    'FW': ('Foreign word', '#282828'),
    'NN': ('Noun, singular/mass', '#282828'),
    'NNS': ('Noun, plural', '#282828'),
    'NNP': ('Proper noun, singul.', '#282828'),
    'NNPS': ('Proper noun, plural', '#282828'),
    'JJ': ('Adjective', '#50a14f'),
    'JJR': ('Adj. comparative ', '#50a14f'),
    'JJS': ('Adj. superlative ', '#50a14f'),
    'VB': ('Verb, base form', '#e45649'),
    'VBD': ('Verb, past tense ', '#e45649'),
    'VBG': ('Verb, gerund ', '#e45649'),
    'VBN': ('Verb, past particip. ', '#e45649'),
    'VBP': ('Verb, non-3sg pres', '#e45649'),
    'VBZ': ('Verb, 3sg pres ', '#e45649'),
    'WDT': ('Wh-determiner', '#4078f2'),
    'WP': ('Wh-pronoun', '#4078f2'),
    'WP$': (' Possessive wh-', '#4078f2'),
    'WRB': ('Wh-adverb how', '#4078f2'),
    'PDT': ('Predeterminer ', '#4078f2'),
    'POS': ('Possessive ending', '#4078f2'),
    # The treebank labels pronouns as PRP / PRP$; without these two entries
    # every pronoun would fall back to an empty meaning and the default color
    'PRP': ('Personal pronoun', '#4078f2'),
    'PRP$': ('Possessive pronoun', '#4078f2'),
    # Older spelling of the two pronoun tags, kept as aliases
    'PP': ('Personal pronoun', '#4078f2'),
    'PP$': (' Possessive pronoun ', '#4078f2'),
    'RB': ('Adverb', '#a626a4'),
    'RBR': ('Adverb, comparative', '#a626a4'),
    'RBS': ('Adverb, superlative', '#a626a4'),
    'RP': ('Particle', '#a626a4'),
}


def _pos_css_class(tag) -> str:
    """Turn a POS tag into a class name that is a valid CSS identifier.

    Tags such as 'WP$', '.' or ',' cannot be used verbatim in a CSS selector,
    so every non-alphanumeric character is replaced by '_' followed by its
    hexadecimal code point. The 'pos-' prefix keeps the result from clashing
    with the fixed classes used by the visualizer ('token', 'tag', 'error').
    """
    safe = ''.join(ch if ch.isalnum() else f'_{ord(ch):x}' for ch in str(tag))
    return f'pos-{safe}'


def display_pos_tagging(tokens: pd.Series,
                        predicted_tags: pd.Series,
                        correct_tags: pd.Series = None,
                        limit=1000):
    """Render tokens colored by their predicted POS tag in the cell's output.

    Each token shows its tag (and the tag meaning from `tag_map`) in a tooltip
    on hover; a token whose predicted tag differs from the correct one is
    underlined. A line break is inserted after every token tagged '.'.

    Args:
        tokens: the tokens to display.
        predicted_tags: the tag assigned to each token; it drives the colors.
        correct_tags: the reference tags; when omitted no token is marked
            as an error.
        limit: maximum number of tokens to render.
    """
    # Local import: `html` is not among the notebook's top-level imports
    from html import escape

    # If no correct tags are passed, we ignore the "error highlighting"
    if correct_tags is None:
        correct_tags = predicted_tags

    # Limit the inputs
    tokens = tokens[:limit]
    predicted_tags = predicted_tags[:limit]
    correct_tags = correct_tags[:limit]

    # Iterate through tokens and tags, generating styled html based on the color
    html_sequence = []
    for token, tag, correct in zip(tokens, predicted_tags, correct_tags):
        tag_meaning = tag_map.get(tag, ('', ''))[0]
        classes = f'token {_pos_css_class(tag)}'
        if tag != correct:
            classes += ' error'
        # Tokens and tags come straight from the corpus: escape them so that
        # characters such as '&' or '<' are shown literally
        h = (f'<div class="{classes}">{escape(str(token))} '
             f'<span class="tag">[{escape(str(tag))}] '
             f'{escape(tag_meaning)}</span></div>')
        if tag == '.':
            # <div> is not a void element: it must be closed explicitly,
            # otherwise all following tokens end up nested inside it
            h += '<div class="separator"></div>'
        html_sequence.append(h)
    html_body = '<div class="pos-visualizer">'
    html_body += ' '.join(html_sequence) + '</div>'

    # Generate the style (WARNING: CSS lies ahead)
    html_style = """
    <style>
    .pos-visualizer { margin: 32px;}
    .token { position:relative; display:inline-block; font-size:16px;}
    .token .tag { 
        visibility:hidden; width: 120px; text-align:center; position:absolute;
        width: 160px; background-color: #282828; color: #fff; border-radius: 6px;
        z-index: 1; bottom: 100%; left: 50%; margin-left:-80px; font-size:12px;
    }
    .error { text-decoration: underline solid #F94144;}
    .separator { margin-top:12px }
    .token:hover .tag { visibility:visible }
    """
    # One color rule per distinct tag, addressed through its sanitized class
    html_style += '\n'.join(
        (f'.{_pos_css_class(tag)} '
         f'{{color:{tag_map.get(tag, ("", "#282828"))[1]};}}'
         for tag in predicted_tags.unique()))
    html_style += '</style>'

    # Display the html in the cell's output
    display(HTML(html_style + html_body))

In [None]:
# Render the first tokens of the corpus (hover on text for tag meaning),
# with a few deliberately wrong predictions to show the error highlighting
predicted_example = dataset['tag'].copy()
predicted_example[:8] = 'CD'  # Wrong prediction example
display_pos_tagging(tokens=dataset['token'],
                    predicted_tags=predicted_example,
                    correct_tags=dataset['tag'],
                    limit=120)

Now let's take a look at the sentences containing the "Wa" token:

In [None]:
# Collect the indices of the sentences with a "Wa" token, then show them in full
wa_sentences = dataset.loc[dataset['token'] == 'Wa', 'sentence']
wa_df = dataset.loc[dataset['sentence'].isin(wa_sentences)]
display_pos_tagging(tokens=wa_df['token'], predicted_tags=wa_df['tag'])