## Convert the datasets from CoNLL format to a Hugging Face DatasetDict

In [None]:
import pandas as pd
import os
from datasets import Dataset, DatasetDict
from pathlib import Path


def parse_conll(raw: str, sep: str = "\t") -> list:
    """Parse norec-fine CoNLL text into one dict per sentence.

    Sentences are separated by one or more blank lines. Within a sentence, a
    line that starts with "#" and contains "=" is a metadata line; the text
    after its last "=" becomes the sentence id. Every other line must hold
    exactly two ``sep``-separated fields: the token and its tag.

    Args:
        raw: Full contents of a CoNLL file.
        sep: Separator between token and tag on a line (tab by default).

    Returns:
        A list of dicts with the keys "idx" (sentence id), "tokens" and
        "tsa_tags" (parallel lists). Empty or whitespace-only input yields
        an empty list.

    Raises:
        ValueError: If a token line does not have exactly two fields, or if
            a sentence has no non-empty sentence id.
    """
    doc_parsed = []  # One dict per sentence: idx, tokens and tsa_tags
    # Normalise Windows line endings so that blank-line splitting works.
    text = raw.replace("\r\n", "\n").strip()
    if not text:
        return doc_parsed

    for chunk in text.split("\n\n"):
        # Runs of 3+ newlines leave empty chunks or chunks with a leading
        # newline; neither is a real sentence line.
        sent = chunk.strip("\n")
        if not sent:
            continue
        sent_no = len(doc_parsed) + 1  # 1-based, for error messages only
        meta = ""
        tokens, tags = [], []
        for line in sent.split("\n"):
            if line.startswith("#") and "=" in line:
                meta = line.split("=")[-1]
                continue
            elems = line.strip().split(sep)
            if len(elems) != 2:
                raise ValueError(
                    f"Sentence {sent_no}: expected 2 fields separated by "
                    f"{sep!r}, got {len(elems)} in line {line!r}"
                )
            tokens.append(elems[0])
            tags.append(elems[1])
        if not meta:
            raise ValueError(f"Sentence {sent_no}: missing sentence id line")
        doc_parsed.append({"idx": meta, "tokens": tokens, "tsa_tags": tags})
    return doc_parsed


# Source folders to convert; each one is read from relative to the current
# working directory.
conll_folders = [
    "tsa_conll",
    "tsa-conll-intensity",
]
# The output folder name is derived by replacing "conll" in the source name,
# so every source name must contain it.
# If you remove this, change rule for naming arrow folder.
assert all("conll" in name for name in conll_folders)
assert all(
    Path(name).is_dir() for name in conll_folders
), "Not all source folders exist"

# Maps the split name used in the .conll file names to the split name used as
# key in the saved DatasetDict ("validation" for HF naming convention).
splits = {"train": "train", "dev": "validation", "test": "test"}
for c_folder in conll_folders:
    arrow_folder = c_folder.replace("conll", "arrow")
    d_sets = {}
    for split, hf_split in splits.items():
        # Explicit encoding so the result does not depend on the platform's
        # locale default (assumes the .conll files are UTF-8 encoded).
        conll_txt = Path(c_folder, split + ".conll").read_text(encoding="utf-8")
        print("\n", c_folder, split, len(conll_txt.split("\n\n")))
        sents = parse_conll(conll_txt)
        # for sent in sents:
            # sent["labels"] = [label_mapping[tag] for tag in sent["tsa_tags"]]
        d_sets[hf_split] = Dataset.from_pandas(pd.DataFrame(sents))
        # Print one example as a sanity check. The index is clamped so that
        # splits with fewer than 103 sentences do not raise IndexError.
        n_rows = len(d_sets[hf_split])
        if n_rows > 0:
            print(d_sets[hf_split][min(102, n_rows - 1)])

    # Save once per folder, after all splits are collected, instead of
    # rewriting a partially filled DatasetDict after every split.
    DatasetDict(d_sets).save_to_disk(arrow_folder)


In [None]:
# Inspect the datasets of the folder converted last, then collect every tag
# in its test split to see which distinct labels occur.
d_sets
labels = [
    tag
    for sentence_tags in d_sets["test"]['tsa_tags']
    for tag in sentence_tags
]
set(labels)