# Assignment 1: Part Of Speech tagging

In [None]:
import numpy as np
import pandas as pd

# System packages
import os
import glob

# File management
import requests
import zipfile
import io

# Types and type-annotations
from typing import Dict, List, Tuple
from collections import OrderedDict

# To store vocabulary as .json
!pip install simplejson
import simplejson as sj

## Data Pipeline

### Data Loading
First we load the data (downloading it if not present), and store it into a dataframe.

In [None]:
# Directory in which `load_dataset` looks for the treebank's `*.dp` files
DATASET_PATH = "./dependency_treebank"
# Zip archive that `load_dataset` downloads when DATASET_PATH does not exist yet
DATASET_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load the treebank into a token-level dataframe, downloading it if needed.

    Each `*.dp` file found in `ds_path` is treated as one document. Inside a
    file, every non-blank line is a tab-separated record whose first two
    fields are the token and its tag (any further field is ignored), and
    blank lines separate sentences.

    Args:
        ds_path: directory containing the `*.dp` files. If it does not exist,
            the archive at `ds_url` is downloaded and extracted next to it;
            the archive is assumed to contain a top-level folder named like
            the last component of `ds_path`.
        ds_url: URL of the zip archive holding the dataset.

    Returns:
        A dataframe with columns `document` (0-based index of the file, in
        sorted file-name order), `sentence` (0-based sentence index, global
        across documents and contiguous), `token` and `tag`.

    Raises:
        requests.HTTPError: if the download answers with an error status.
    """
    # Check if dataset is already present, otherwise download it
    if not os.path.isdir(ds_path):
        response = requests.get(ds_url, timeout=60)
        # Fail loudly on HTTP errors instead of feeding an error page to zipfile
        response.raise_for_status()
        # Extract beside `ds_path` so that a non-default path is honoured;
        # for "./dependency_treebank" this is still the current directory
        extract_root = os.path.dirname(os.path.normpath(ds_path)) or "."
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(extract_root)

    records = []
    sentence_idx = 0
    for doc_idx, file_name in enumerate(sorted(glob.glob(f"{ds_path}/*.dp"))):
        # Tracks whether at least one token was seen since the last boundary,
        # so that repeated/trailing blank lines do not burn sentence indices
        sentence_open = False
        with open(file_name, encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\r\n")
                if not line.strip():
                    if sentence_open:
                        sentence_idx += 1
                        sentence_open = False
                    continue
                cols = line.split('\t')[:2]  # Ignore the last column
                records.append([doc_idx, sentence_idx, *cols])
                sentence_open = True
        # A file may end without a blank line: close its last sentence anyway,
        # otherwise it would share its index with the next document's first one
        if sentence_open:
            sentence_idx += 1

    # Finally, convert the nested list into a pandas dataframe
    return pd.DataFrame(records, columns=['document', 'sentence', 'token', 'tag'])


# Build the token-level dataframe, then preview the rows of the first two documents
dataset = load_dataset(ds_path=DATASET_PATH, ds_url=DATASET_URL)
dataset.loc[dataset['document'] < 2]

### Data Pre-Processing
The dataset does not need much of a cleanup: the only pre-processing we need to perform is converting tokens to lowercase, so that we can create a vocabulary without ending up with two entries for the same word.

However, we need to distinguish between words that are inherently capitalized (e.g. proper nouns) and those that are so just because they follow a period.

First of all, let us check which tags are attached to capitalized words, and how many such words there are for each tag:

In [None]:
# Per-tag count of the tokens whose first character is uppercase
(
    dataset
    .loc[dataset['token'].str[0].str.isupper()]
    .groupby('tag')
    .size()
    .reset_index(name='capitalized counts')
)

The most meaningful tags are (see [here](https://sites.google.com/site/partofspeechhelp/)):
* `NNP`: Proper Nouns (Singular)
* `NNPS`: Proper Nouns (Plural)
* `PRP`: Personal Pronouns

The weird ones are:
* `$`: Dollar mark
* `,`: Non-full stop break punctuation marks for the sentence

Proper nouns are always capitalized, and we should probably leave them as such, as in this case capitalization and tag are tightly linked to each other.

Personal pronouns are meaningfully capitalized only in the case of "I", which is capitalized no matter where it occurs in a sentence. We should therefore keep "I" as it is and convert the other pronouns to lowercase.

When is `$` capitalized?

In [None]:
# Number of occurrences of each distinct token carrying the `$` tag
(
    dataset
    .loc[dataset['tag'].eq('$')]
    .groupby('token')
    .size()
    .reset_index(name='"$"-tag counts')
)

The `$` tag is attached not only to the dollar symbol, but also to "C\$" (Canadian dollars) and "US\$" (United States dollars), which are capitalized strings. It might make sense to leave them uppercase, as they are in some sense a label denoting a special symbol. In any case, there are just 6 of them in the whole dataset, so it probably does not matter that much.

When is `,` capitalized?

In [None]:
# Restrict to tokens starting with an uppercase character...
tmp = dataset.loc[dataset['token'].str[0].str.isupper()]
# ...and show those that are nevertheless tagged as `,`
tmp.loc[tmp['tag'].eq(',')]

No idea what "Wa" means, possibly a typo/labeling mistake. Anyway, being a single instance, it really does not matter.

### Data Splitting

In [None]:
# Document-level split: documents 0-99 -> train, 100-149 -> validation, 150+ -> test
train_ds = dataset.loc[dataset['document'] < 100]
validation_ds = dataset.loc[(dataset['document'] >= 100) & (dataset['document'] <= 149)]
test_ds = dataset.loc[dataset['document'] > 149]


def print_split(df):
    """Summarise a split as '<number of documents> documents, <number of rows> samples'."""
    return f"{df['document'].nunique()} documents, {len(df)} samples"


# The trailing empty entry reproduces the final newline of the report
print("\n".join([
    "Dataset split: ",
    f"    TRAIN: {print_split(train_ds)}",
    f"    VALIDATION: {print_split(validation_ds)}",
    f"    TEST: {print_split(test_ds)}",
    "",
]))

### Vocabulary Creation
> TODO: lowercase conversion before vocabulary creation.

In [None]:
def build_vocabulary(df: pd.DataFrame) -> Tuple[Dict[int, str],
                                                Dict[str, int],
                                                List[str]]:
    """Given a dataset, builds the corresponding token vocabulary.
    The vocabulary starts from index 1 so as to allow the 0 slot to be reserved to the padding token.
    Indices follow the order in which tokens first appear in the dataset.

    Args:
        df: dataset, assumed to have a 'token' column.

    Returns:
        idx_to_tok: token vocabulary, i.e. from index to token.
        tok_to_idx: inverse token vocabulary, i.e. from token to index.
        tok_listing: list of unique tokens that build up the vocabulary.
    """
    # dict.fromkeys drops duplicates while keeping first-occurrence order
    tok_listing = list(dict.fromkeys(df['token']))

    idx_to_tok = OrderedDict(enumerate(tok_listing, start=1))
    tok_to_idx = OrderedDict((tok, idx) for idx, tok in idx_to_tok.items())

    return idx_to_tok, tok_to_idx, tok_listing


# Build the vocabulary from the tokens of the whole dataset
idx_to_tok, tok_to_idx, tok_listing = build_vocabulary(df=dataset)

Once the vocabulary is built, we perform some sanity checks:

In [None]:
# The two mappings and the listing must all describe the same number of tokens
assert len(idx_to_tok) == len(tok_to_idx) == len(tok_listing)

# Indices must cover 1..N without gaps, and the two mappings must invert each other
for idx in range(1, len(idx_to_tok) + 1):
    token = idx_to_tok[idx]
    assert token in tok_to_idx
    assert tok_to_idx[token] == idx

And then save the vocabulary for a more detailed inspection:

In [None]:
# The vocabulary is written to `vocab.json` in the current working directory
vocab_path = os.path.join(os.getcwd(), 'vocab.json')

with open(vocab_path, mode='w') as vocab_file:
    sj.dump(idx_to_tok, fp=vocab_file, indent=4)