# Assignment 1: Part-of-Speech Tagging

## Data loading
First, we load the data (downloading it if it is not already present) and store it in a pandas DataFrame.

In [None]:
import pandas as pd

# Local directory that load_dataset scans for the corpus "*.dp" files; when the
# directory does not exist, load_dataset downloads the archive from DATASET_URL.
DATASET_PATH = "./dependency_treebank"
# Location of the zipped dependency treebank corpus in the NLTK data repository.
DATASET_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"


def load_dataset(ds_path: str, ds_url: str) -> pd.DataFrame:
    """Load the treebank files found in ``ds_path`` into a token-level dataframe.

    If ``ds_path`` is not an existing directory, the zip archive at ``ds_url``
    is downloaded and extracted into the parent directory of ``ds_path``.
    NOTE(review): this assumes the archive's top-level folder is named like
    the last component of ``ds_path`` -- confirm when changing either value.

    Every ``*.dp`` file inside ``ds_path`` is one document; documents are
    numbered in sorted file-name order. Each non-blank row contributes one
    dataframe row built from its first two tab-separated columns. Blank (or
    whitespace-only) rows separate sentences.

    Args:
        ds_path: Directory holding (or that will hold) the ``*.dp`` files.
        ds_url: URL of the zip archive to download when ``ds_path`` is absent.

    Returns:
        A dataframe with the columns ``document``, ``sentence``, ``token`` and
        ``label``. ``sentence`` is a running index over the whole corpus, so a
        value never refers to sentences of two different documents.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    import glob
    import os

    # Check if dataset is already present, otherwise download it
    if not os.path.isdir(ds_path):
        import io
        import zipfile

        import requests

        response = requests.get(ds_url, timeout=60)
        # Fail loudly here instead of handing an HTML error page to ZipFile.
        response.raise_for_status()
        # Extract next to ds_path so the requested location is honoured
        # ("./dependency_treebank" still resolves to the working directory).
        target_dir = os.path.dirname(os.path.normpath(ds_path)) or "."
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(target_dir)

    # glob.escape keeps special characters in ds_path from acting as wildcards.
    pattern = os.path.join(glob.escape(ds_path), "*.dp")

    # Convert each row of every document into a flat list of records
    raw_df = []
    sentence_idx = 0
    for doc_idx, file_name in enumerate(sorted(glob.glob(pattern))):
        with open(file_name, encoding="utf-8") as f:
            doc = f.read()

        sentence_open = False  # True while the current sentence has tokens
        for row in doc.split('\n'):
            if not row.strip():
                # A blank row closes the current sentence.
                sentence_idx += 1
                sentence_open = False
            else:
                cols = row.split('\t')[:2]  # Ignore the last column
                raw_df.append([doc_idx, sentence_idx, *cols])
                sentence_open = True

        # A file that does not end with a blank row would otherwise share its
        # last sentence index with the first sentence of the next document.
        if sentence_open:
            sentence_idx += 1

    # Finally, convert the nested list into a pandas dataframe
    df = pd.DataFrame(raw_df, columns=['document', 'sentence', 'token', 'label'])
    return df


# Build the token-level dataframe, then preview the rows whose document index is 0 or 1
dataset = load_dataset(ds_path=DATASET_PATH, ds_url=DATASET_URL)
dataset.loc[dataset['document'] < 2]

### Splitting the dataset

In [None]:
# Partition by document index: below 100 -> train, 100 to 149 -> validation, above 149 -> test
train_ds = dataset[dataset['document'] < 100]
validation_ds = dataset[(dataset['document'] >= 100) & (dataset['document'] <= 149)]
test_ds = dataset[dataset['document'] > 149]


def print_split(df):
    """Return a one-line summary of a split: distinct documents and row count."""
    return f"{df['document'].nunique()} documents, {len(df)} samples"


print(
    "Dataset split: \n"
    f"    TRAIN: {print_split(train_ds)}\n"
    f"    VALIDATION: {print_split(validation_ds)}\n"
    f"    TEST: {print_split(test_ds)}\n"
)