# Assignment 1: Part Of Speech tagging

In [None]:
import numpy as np
import pandas as pd

# System packages
import os
import glob

# File management
import requests
import zipfile
import io

# Types and type-annotations
from typing import Dict, List, Tuple
from collections import OrderedDict

# To store vocabulary as .json
!pip install simplejson
import simplejson as sj

## Data loading
First we load the data (downloading it if not present), and store it into a dataframe.

In [None]:
# Directory (relative to the working directory) that load_dataset scans for *.dp files
DATASET_PATH = "./dependency_treebank"
# Zip archive that load_dataset downloads when DATASET_PATH is not an existing directory
DATASET_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load every ``*.dp`` file under ``ds_path`` into a token-level dataframe.

    If ``ds_path`` is not an existing directory, the zip archive at ``ds_url``
    is downloaded and extracted into the parent directory of ``ds_path``.
    NOTE(review): this assumes the archive contains a top-level folder named
    like the last component of ``ds_path`` -- confirm against the archive.

    Files are processed in sorted name order and each file is one document.
    Inside a file, blank lines separate sentences and only the first two
    tab-separated fields of every other line are kept.

    Args:
        ds_path: directory holding (or that will hold) the ``*.dp`` files.
        ds_url: URL of the zip archive to fetch when ``ds_path`` is missing.

    Returns:
        A dataframe with one row per non-blank line and the columns
        ``document`` (0-based file index), ``sentence`` (0-based sentence
        index, consecutive across the whole corpus), ``POS`` (first field)
        and ``tag`` (second field).
        NOTE(review): in the NLTK dependency treebank the first field looks
        like the word form and the second like its POS tag, so the column
        names may be swapped with respect to their content -- confirm.

    Raises:
        requests.HTTPError: if the download answers with an error status.
        ValueError: if a non-blank line has fewer than two tab-separated fields.
    """
    # Check if dataset is already present, otherwise download it
    if not os.path.isdir(ds_path):
        response = requests.get(ds_url, timeout=60)
        # Fail loudly on 4xx/5xx instead of handing an error page to ZipFile
        response.raise_for_status()
        # Extract next to ds_path (not blindly into the working directory)
        target_dir = os.path.dirname(os.path.normpath(ds_path)) or "."
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(target_dir)

    # glob.escape keeps special characters in ds_path from acting as wildcards
    file_names = sorted(glob.glob(os.path.join(glob.escape(ds_path), "*.dp")))

    # Convert each row of the documents into a list
    raw_df = []
    sentence_idx = 0
    for doc_idx, file_name in enumerate(file_names):
        with open(file_name, encoding="utf-8") as f:
            rows = f.read().split('\n')

        # True while the sentence numbered `sentence_idx` already owns a row.
        # The counter only advances when this is set, so runs of blank lines
        # leave no gaps and a file without a final newline still closes its
        # last sentence before the next document starts.
        sentence_has_rows = False
        for line_no, row in enumerate(rows, start=1):
            if not row.strip():
                if sentence_has_rows:
                    sentence_idx += 1
                    sentence_has_rows = False
                continue

            cols = row.split('\t')[:2]  # Ignore the last column
            if len(cols) < 2:
                raise ValueError(
                    f"{file_name}:{line_no}: expected at least two "
                    f"tab-separated fields, got {row!r}"
                )
            raw_df.append([doc_idx, sentence_idx, *cols])
            sentence_has_rows = True

        # Close the sentence left open at the end of the document
        if sentence_has_rows:
            sentence_idx += 1

    # Finally, convert the nested list into a pandas dataframe
    return pd.DataFrame(raw_df, columns=['document', 'sentence', 'POS', 'tag'])


# Build the full corpus dataframe, then preview the rows of the first two documents
dataset = load_dataset(ds_path=DATASET_PATH, ds_url=DATASET_URL)
dataset[dataset['document'] < 2]

### Splitting the dataset

In [None]:
# Split by document index: 0-99 train, 100-149 validation, 150 and above test
train_ds = dataset[dataset['document'] < 100]
validation_ds = dataset[(dataset['document'] >= 100) & (dataset['document'] <= 149)]
test_ds = dataset[dataset['document'] > 149]


def print_split(df):
    """Return a '<n> documents, <m> samples' summary of a split (does not print)."""
    n_documents = df.groupby('document').ngroups
    n_samples = len(df)
    return f"{n_documents} documents, {n_samples} samples"


print(f"""Dataset split: 
    TRAIN: {print_split(train_ds)}
    VALIDATION: {print_split(validation_ds)}
    TEST: {print_split(test_ds)}
""")

## Word embedding

### Vocabulary creation
For the moment, we will build the vocabulary on the whole dataset, just to test things out.

In [None]:
def build_vocabulary(df: pd.DataFrame) -> Tuple[Dict[int, str],
                                                Dict[str, int],
                                                List[str]]:
    """Given a dataset, builds the corresponding POS vocabulary.

    Entries are numbered in order of first appearance in the 'POS' column.
    The vocabulary starts from index 1 so as to allow the 0 slot to be
    reserved to the padding token.

    Args:
        df: dataset, assumed to have a 'POS' column.

    Returns:
        idx_to_pos: POS vocabulary, i.e. from index to POS.
        pos_to_idx: inverse POS vocabulary, i.e. from POS to index.
        pos_listing: list of unique POSs that build up the vocabulary,
            in index order (``pos_listing[k]`` has index ``k + 1``).
    """
    # dict.fromkeys drops duplicates while preserving first-occurrence order
    pos_listing = list(dict.fromkeys(df['POS']))

    # Index 0 is deliberately left unused (padding), hence start=1
    idx_to_pos = OrderedDict(enumerate(pos_listing, start=1))
    pos_to_idx = OrderedDict((pos, idx) for idx, pos in idx_to_pos.items())

    return idx_to_pos, pos_to_idx, pos_listing


# Vocabulary over the whole dataset (train, validation and test rows alike)
idx_to_pos, pos_to_idx, pos_listing = build_vocabulary(dataset)

Once the vocabulary is built, we perform some sanity checks:

In [None]:
# The three views of the vocabulary must have the same number of entries
assert len(idx_to_pos) == len(pos_to_idx) == len(pos_listing)

# Indices must run from 1 to the vocabulary size, and the two mappings must be mutual inverses
for i in range(1, len(idx_to_pos) + 1):
    pos = idx_to_pos[i]
    assert pos in pos_to_idx and pos_to_idx[pos] == i

And then save the vocabulary for a more detailed inspection:

In [None]:
# The vocabulary is written as 'vocab.json' in the current working directory
vocab_path = os.path.join(os.getcwd(), 'vocab.json')

with open(vocab_path, mode='w') as vocab_file:
    sj.dump(idx_to_pos, vocab_file, indent=4)