# Assignment 1: Part Of Speech tagging

In [119]:
# Main framework
from tensorflow import keras

# Data packages
import numpy as np
import pandas as pd

# System packages
import os
import glob
import random

# File management
import requests
import zipfile
import io

# Types and type-annotations
from typing import List, Dict, Tuple
from collections import OrderedDict

# To store vocabulary as .json
!pip install simplejson
import simplejson

# Notebook visualization
from IPython.core.display import display, HTML

# Seed initialization
random.seed(0)

# For GloVe wrapper
import gensim

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')



## Data Pipeline

### Data Loading
First we load the data (downloading it if not present), and store it into a dataframe.

In [None]:
# Folder holding the corpus' *.dp files; load_dataset downloads the archive when this folder is missing
DATASET_PATH = "./dependency_treebank"  # Change if dataset already present locally
# Zip archive of the corpus, requested only when DATASET_PATH does not exist
DATASET_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load the treebank into a token-level DataFrame, downloading it if needed.

    Every ``*.dp`` file found in ``ds_path`` is treated as one document. Inside
    a document each non-empty row is a tab-separated record of which only the
    first two fields (token and tag) are kept; an empty row ends a sentence.

    Args:
        ds_path: folder containing the ``*.dp`` files. When it does not exist,
            the archive at ``ds_url`` is downloaded and extracted next to it
            (assumes the archive's top-level folder is named like the last
            component of ``ds_path``).
        ds_url: URL of the zip archive of the dataset.

    Returns:
        A DataFrame with columns ``document`` (index of the file in sorted
        order), ``sentence`` (running sentence index, unique across
        documents), ``token`` and ``tag``.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
    """
    # Check if dataset is already present, otherwise download it
    if not os.path.isdir(ds_path):
        response = requests.get(ds_url)
        # Surface HTTP failures directly instead of an obscure BadZipFile
        response.raise_for_status()
        # Extract beside ds_path (the default path still resolves to the
        # current directory) so that a non-default location is honoured
        extract_root = os.path.dirname(os.path.normpath(ds_path)) or '.'
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(extract_root)

    # Load each file into a list
    documents = []
    for file_name in sorted(glob.glob(f"{ds_path}/*.dp")):
        with open(file_name, encoding='utf-8') as f:
            documents.append(f.read())

    # Convert each row of the documents into a list
    raw_df = []
    sentence_idx = 0
    for doc_idx, doc in enumerate(documents):
        sentence_open = False
        for row in doc.split('\n'):
            cols = row.split('\t')[:2]  # Ignore the last column
            if cols == ['']:
                sentence_idx += 1
                sentence_open = False
            else:
                raw_df.append([doc_idx, sentence_idx, *cols])
                sentence_open = True
        # A document that does not end with an empty row would otherwise share
        # its last sentence index with the first sentence of the next document
        if sentence_open:
            sentence_idx += 1

    # Finally, convert the nested list into a pandas dataframe
    df = pd.DataFrame(raw_df, columns=['document', 'sentence', 'token', 'tag'])
    return df


# Build the token-level dataframe (the corpus is downloaded first if DATASET_PATH is missing)
dataset = load_dataset(DATASET_PATH, DATASET_URL)
dataset[dataset['document'].lt(1)]  # Display the first document

### Data Visualization
To gain deeper insight into the dataset, we define a function that displays it in a human-readable way:

In [110]:
# Define a mapping between POS tags, their meaning and some colors.
# Each value is a (meaning, CSS color) pair; tags that are missing from the map
# are handled by the caller's `.get()` defaults.
tag_map = {
    'CC': ('Coordin. Conjunction', '#c18401'),
    'TO': ('“to”', '#c18401'),
    'DT': ('Determiner', '#c18401'),
    'UH': ('Interjection', '#c18401'),
    'EX': ('Existential ‘there', '#c18401'),
    'MD': ('Modal can', '#c18401'),
    'LS': ('List item marker', '#c18401'),
    'IN': ('Preposition/sub-conj', '#c18401'),
    'CD': ('Cardinal number', '#282828'),
    'FW': ('Foreign word', '#282828'),
    'NN': ('Noun, singular/mass', '#282828'),
    'NNS': ('Noun, plural', '#282828'),
    'NNP': ('Proper noun, singul.', '#282828'),
    'NNPS': ('Proper noun, plural', '#282828'),
    'JJ': ('Adjective', '#50a14f'),
    'JJR': ('Adj. comparative ', '#50a14f'),
    'JJS': ('Adj. superlative ', '#50a14f'),
    'VB': ('Verb, base form', '#e45649'),
    'VBD': ('Verb, past tense ', '#e45649'),
    'VBG': ('Verb, gerund ', '#e45649'),
    'VBN': ('Verb, past particip. ', '#e45649'),
    'VBP': ('Verb, non-3sg pres', '#e45649'),
    'VBZ': ('Verb, 3sg pres ', '#e45649'),
    'WDT': ('Wh-determiner', '#4078f2'),
    'WP': ('Wh-pronoun', '#4078f2'),
    'WP$': (' Possessive wh-', '#4078f2'),
    'WRB': ('Wh-adverb how', '#4078f2'),
    'PDT': ('Predeterminer ', '#4078f2'),
    'POS': ('Possessive ending', '#4078f2'),
    # The dataset labels pronouns as PRP / PRP$ (PRP appears among its tags),
    # so these are the keys that actually match during visualization.
    'PRP': ('Personal pronoun', '#4078f2'),
    'PRP$': (' Possessive pronoun ', '#4078f2'),
    # Older spelling of the same two tags, kept so existing lookups keep working.
    'PP': ('Personal pronoun', '#4078f2'),
    'PP$': (' Possessive pronoun ', '#4078f2'),
    'RB': ('Adverb', '#a626a4'),
    'RBR': ('Adverb, comparative', '#a626a4'),
    'RBS': ('Adverb, superlative', '#a626a4'),
    'RP': ('Particle', '#a626a4'),
}


def _pos_css_class(tag) -> str:
    """Turn a POS tag into a CSS-safe class name.

    Tags such as '$', '.', ',' or 'PRP$' cannot be used verbatim in a class
    selector, so every non-alphanumeric character is replaced by '_' followed
    by its hexadecimal code point, and the result is prefixed with 'pos-'.
    """
    return 'pos-' + ''.join(
        ch if ch.isalnum() else f'_{ord(ch):x}' for ch in str(tag))


def display_pos_tagging(tokens: pd.Series,
                        predicted_tags: pd.Series,
                        correct_tags: pd.Series = None,
                        limit=1000):
    """Render tokens colored by their POS tag in the cell's output.

    Each token shows its tag and the tag's meaning on hover. Tokens whose
    predicted tag differs from the correct one are underlined, and a vertical
    gap is inserted after every token tagged '.'.

    Args:
        tokens: the tokens to display.
        predicted_tags: the tag assigned to each token; drives the coloring.
        correct_tags: the reference tags. When omitted, no error is highlighted.
        limit: maximum number of leading tokens to display.
    """
    # Local import keeps this cell self-contained
    from html import escape

    # If no correct tags are passed, we ignore the "error highlighting"
    if correct_tags is None:
        correct_tags = predicted_tags

    # Limit the inputs
    tokens = tokens[:limit]
    predicted_tags = predicted_tags[:limit]
    correct_tags = correct_tags[:limit]

    # Iterate through tokens and tags, generating styled html based on the color
    html_sequence = []
    for token, tag, correct in zip(tokens, predicted_tags, correct_tags):
        tag_meaning = tag_map.get(tag, ('', ''))[0]
        err = 'error' if tag != correct else ''
        # Escape everything that comes from the data: tokens such as '&' or '<'
        # would otherwise be parsed as markup
        h = (f'<div class="token {_pos_css_class(tag)} {err}">'
             f'{escape(str(token))} '
             f'<span class="tag">[{escape(str(tag))}] {escape(tag_meaning)}</span>'
             f'</div>')
        if tag == '.':
            # <div> cannot be self-closed in HTML: an unclosed one would nest
            # all the following tokens inside the separator
            h += '<div class="separator"></div>'
        html_sequence.append(h)
    html_body = '<div class="pos-visualizer">'
    html_body += ' '.join(html_sequence) + '</div>'

    # Generate the style (WARNING: CSS lies ahead)
    html_style = """
	<style>
	.pos-visualizer { margin: 32px;}
	.token { position:relative; display:inline-block; font-size:16px;}
	.token .tag { 
		visibility:hidden; width: 120px; text-align:center; position:absolute;
		width: 160px; background-color: #282828; color: #fff; border-radius: 6px;
		z-index: 1; bottom: 100%; left: 50%; margin-left:-80px; font-size:12px;
	}
	.error { text-decoration: underline solid #F94144;}
	.separator { margin-top:12px }
	.token:hover .tag { visibility:visible }
	"""
    # One color rule per distinct predicted tag, using the same CSS-safe class
    # name that was put on the tokens above
    html_style += '\n'.join(
        (f'.{_pos_css_class(tag)} {{color:{tag_map.get(tag, ("", "#282828"))[1]};}}'
         for tag in predicted_tags.unique()))
    html_style += '</style>'

    # Display the html in the cell's output
    display(HTML(html_style + html_body))

In [111]:
# Display some sample POS tagging (hover on text for tag meaning)
# Work on a copy so that the injected errors below do not alter the dataset's own tags
predicted_example = dataset['tag'].copy()
predicted_example[0:8] = 'CD'  # Wrong prediction example
# Passing the real tags as `correct_tags` underlines the tokens whose prediction differs
display_pos_tagging(dataset['token'],
                    predicted_example,
                    dataset['tag'],
                    limit=120)

### Data Pre-Processing
The dataset does not need much of a cleanup: the only pre-processing we need to perform is converting tokens to lowercase, so that we can create a vocabulary without ending up with two entries for the same word.

However, we need to distinguish between words that are inherently capitalized (e.g. proper nouns) and those that are so just because they follow a period.

#### Analysis
First of all, let us check which kinds of tags produce capitalized words, and how many those words are for each tag:

In [112]:
def get_capitalized_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Given a dataset, counts the number of capitalized tokens for each tag.

    A token is considered capitalized when its first character is uppercase.
    Empty or missing tokens are never counted.

    Args:
        df: a Pandas DataFrame with 'token' and 'tag' columns

    Returns:
        A DataFrame that, for each tag, displays the number 
        of occurrences of capitalized tokens with that tag,
        sorted from the most to the least frequent tag.
    """
    # Build the mask from `df` itself (not from a notebook-level dataframe), so
    # the function works on any dataframe regardless of its index or length.
    # Slicing with [:1] maps empty tokens to '' and `.eq(True)` maps missing
    # values to False, keeping the mask strictly boolean.
    starts_with_upper = df['token'].str[:1].str.isupper().eq(True)
    return df[starts_with_upper]\
        .groupby('tag')\
        .size()\
        .sort_values(ascending=False)\
        .reset_index(name='capitalized counts')

In [113]:
# Per-tag count of capitalized tokens, computed before any lowercasing is applied
get_capitalized_tokens(dataset)

Unnamed: 0,tag,capitalized counts
0,NNP,9400
1,DT,1022
2,IN,541
3,JJ,460
4,PRP,432
5,NN,304
6,NNS,255
7,NNPS,244
8,CC,210
9,RB,205


The most meaningful tags are (see [here](https://sites.google.com/site/partofspeechhelp/)):
* `NNP`: Proper Nouns (Singular)
* `NNPS`: Proper Nouns (Plural)
* `PRP`: Personal Pronouns

The weird ones are:
* `$`: Dollar mark
* `,`: Non-full stop break punctuation marks for the sentence

Proper nouns are always capitalized, and we should probably leave them as such, since in this case capitalization and tag are tightly linked to each other.

Personal pronouns are meaningfully capitalized only in the case of "I", which is capitalized no matter where it occurs in a sentence. We should therefore keep "I" as it is and convert the other pronouns to lowercase.

When is `$` capitalized?

In [114]:
# Count how often each distinct token carries the '$' tag
(dataset[dataset['tag'].eq('$')]
    .groupby('token')
    .size()
    .reset_index(name='"$"-tag counts'))

Unnamed: 0,token,"""$""-tag counts"
0,$,718
1,C$,2
2,US$,4


The `$` tag is attached not only to the dollar symbol, but also to "C\$" (Canadian dollars) and "US$" (United States dollar), which are capitalized strings. It might make sense to leave them uppercase, as they are in some sense a label denoting a special symbol. In any case, there are just 6 of them in the whole dataset, so it probably does not matter that much.

When is `,` capitalized?

In [115]:
# Rows whose token starts with an uppercase character...
tmp = dataset[dataset['token'].str[0].str.isupper()]
# ...restricted to the ones tagged as ','
tmp[tmp['tag'] == ',']

Unnamed: 0,document,sentence,token,tag
10454,36,452,Wa,","


What does "Wa" mean? and why is it tagged as `,`?

In [116]:
# Sentence ids of every occurrence of the token "Wa"
wa_sentences = dataset[dataset['token'].eq('Wa')]['sentence']
# All the rows of those sentences, so that the token can be read in context
wa_df = dataset[dataset['sentence'].isin(wa_sentences)]
# No correct tags are passed: this is a plain visualization without error highlighting
display_pos_tagging(wa_df['token'], wa_df['tag'])

The text snippet above provides some insight into the meaning of the word "Wa". It also shows that the single instance in which "Wa" is tagged as `,` is a labeling mistake, as the other occurrences are tagged as `NNP`. Anyway, a single token–tag combination will not have an impact on our future classification.

#### Text Cleaning

Let us now define our text cleaning:

In [120]:
# Define masks selecting what to convert to lowercase.
# NOTE: despite their names, each mask is True for the rows that MAY be
# lowercased, i.e. the rows that do NOT belong to the category it is named after.
proper_noun_mask = (dataset['tag'] != 'NNP') & (dataset['tag'] != 'NNPS')
# Everything except the pronoun "I" (same truth table as
# "(PRP and not 'I') or not PRP")
personal_pronoun_mask = ~((dataset['tag'] == 'PRP') & (dataset['token'] == 'I'))
dollar_mark_mask = dataset['tag'] != '$'

# Apply selective lowercase conversion to dataset.
# The explicit copy makes `masked_dataset` an independent frame, so assigning
# its column is not a chained assignment on a slice of `dataset`
# (which triggers pandas' SettingWithCopyWarning).
masked_dataset = dataset[proper_noun_mask & personal_pronoun_mask & dollar_mark_mask].copy()
masked_dataset['token'] = masked_dataset['token'].str.lower()
# Write the lowercased tokens back, aligned on the original row labels
dataset.loc[masked_dataset.index, 'token'] = masked_dataset['token']

And check if it worked:

In [121]:
# Same census after cleaning: only the tags deliberately left capitalized should still appear
get_capitalized_tokens(dataset)

Unnamed: 0,tag,capitalized counts
0,NNP,9400
1,NNPS,244
2,PRP,113
3,$,6


### Data Splitting

In [122]:
# Split by document index: 0-99 train, 100-149 validation, 150 onwards test
train_ds = dataset[dataset['document'] < 100]
validation_ds = dataset[(dataset['document'] >= 100) & (dataset['document'] <= 149)]
test_ds = dataset[dataset['document'] > 149]


def print_split(df):
    """Return a one-line summary (number of documents and of samples) of a split."""
    n_documents = df['document'].nunique()
    n_samples = len(df)
    return f"{n_documents} documents, {n_samples} samples"


split_report = [
    "Dataset split: ",
    f"    TRAIN: {print_split(train_ds)}",
    f"    VALIDATION: {print_split(validation_ds)}",
    f"    TEST: {print_split(test_ds)}",
    "",
]
print("\n".join(split_report))

Dataset split: 
    TRAIN: 100 documents, 47356 samples
    VALIDATION: 50 documents, 31183 samples
    TEST: 49 documents, 15545 samples



### Vocabulary Creation
> TODO: it would probably be better to define a ``Keras.Tokenizer`` wrapper that handles vocabulary creation, embedding and OOV tokens altogether, as done in Section 6.3 of `Tutorial2`.

In [123]:
# filters='' and lower=False: no character filtering and no lowercasing by the
# tokenizer, so tokens enter the vocabulary exactly as pre-processed above
tokenizer = keras.preprocessing.text.Tokenizer(filters='', lower=False)
# NOTE(review): the vocabulary is fitted on the whole dataset, not only on the
# training split — confirm this is intended
tokenizer.fit_on_texts(dataset['token'].values)

Save the vocabulary for a more detailed inspection:

In [124]:
# The vocabulary file is written to the current working directory
vocab_path = os.path.join(os.getcwd(), 'vocab.json')

# Dump the tokenizer's `word_index` mapping as indented (human-readable) JSON
with open(vocab_path, 'w') as f:
    simplejson.dump(tokenizer.word_index, f, indent=4)

### GloVe embedding
After the dataset has been cleaned and pre-processed, it is possible to embed the tokens by using the GloVe embedding.

In this case Stanford's `glove.6B` pre-trained embedding is used. GloVe 6B was trained on a corpus of 6 billion tokens, and its zip archive is about 800MB.

In [None]:
GLOVE_FOLDER_PATH = "./glove_6b"  # Change if embedding folder already present locally
GLOVE_FILENAME = "glove.6B.50d.txt"  # Use the 50 dimensions embedding. Can be changed
# Zip archive requested by load_glove_embedding when the embedding is not available locally
GLOVE_DL_URL = "https://nlp.stanford.edu/data/glove.6B.zip"


def load_glove_embedding(gl_folder_path: str, gl_url: str,
                         gl_filename: str) -> Dict[str, np.ndarray]:
    """Load a GloVe text file into a dictionary, downloading it if needed.

    Args:
        gl_folder_path: folder that contains (or will contain) the GloVe files.
        gl_url: URL of the zip archive, extracted into ``gl_folder_path`` when
            the requested file is not found there.
        gl_filename: name of the text file to load from ``gl_folder_path``.
            Each non-empty line is expected to hold a word followed by the
            whitespace-separated components of its vector.

    Returns:
        A dictionary mapping each word to its embedding, stored as a
        one-dimensional float32 numpy array.

    Raises:
        requests.HTTPError: if the download answers with an HTTP error status.
    """
    embedding_path = os.path.join(gl_folder_path, gl_filename)

    # Download if the requested file is missing. Checking the file rather than
    # the folder also recovers from an existing but incomplete folder.
    if not os.path.isfile(embedding_path):
        print(
            "Downloading GloVe. This may take a while depending on internet speed."
        )
        response = requests.get(gl_url)
        # Surface HTTP failures directly instead of an obscure BadZipFile
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            print("Download complete! Unzipping file...")
            archive.extractall(gl_folder_path)
        print(f"GloVe downloaded successfully in {gl_folder_path}")

    # Load the txt file into a map (word -> embedding)
    embedding_dict = dict()
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0]
                continue
            word = values[0]
            vect = values[1:]
            embedding_dict[word] = np.array(
                vect, dtype='f4')  # TODO: check if we can use less bits
    return embedding_dict


# Parse the GloVe file into a word -> vector dictionary, downloading the
# archive first when it is not available locally
embedding_dict = load_glove_embedding(GLOVE_FOLDER_PATH, GLOVE_DL_URL,
                                      GLOVE_FILENAME)
embedding_dict['the']  # Display an example

Downloading GloVe. This may take a while depending on internet speed.
Download complete! Unzipping file...
GloVe downloaded successfully in ./glove_6b


array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)

What about using a gensim [`KeyedVector`](https://radimrehurek.com/gensim/models/keyedvectors.html) object? It is lighter and provides a built-in API to query and manipulate it. Also—and maybe most importantly—it is what is used in `Tutorial2` (Section 4.1), so we will be able to take inspiration from it if needed.

In [125]:
# `import gensim` alone is not guaranteed to load the `downloader` submodule;
# import it explicitly so that this cell also works on a freshly restarted
# kernel instead of failing with "module 'gensim' has no attribute 'downloader'"
import gensim.downloader

# Fetch (or reuse gensim's local copy of) the 50-dimensional GloVe vectors
word_vectors = gensim.downloader.load("glove-wiki-gigaword-50")

In [126]:
# Same lookup as embedding_dict['the'], to compare the two sources of the embedding
word_vectors['the'] 

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)