## Convert the datasets from conll format to HuggingFace DatasetDict 

In [19]:
# Convert the TSA conll data to DatasetDict
import pandas as pd
import os
from datasets import Dataset, DatasetDict
from pathlib import Path


def parse_conll(raw:str, sep="\t"):
    """Parses the norec-fine conll files with tab separator and sentence id"""
    doc_parsed = [] # One dict per sentence. meta, tokens and tags
    for sent in raw.strip().split("\n\n"):
        meta = ""
        tokens, tags = [], []
        for line in sent.split("\n"):
            if line.startswith("#") and "=" in line:
                meta = line.split("=")[-1]
            else:
                elems = line.strip().split(sep)
                assert len(elems) == 2
                tokens.append(elems[0])
                tags.append(elems[1])
        assert len(meta) > 0
        doc_parsed.append({"idx": meta, "tokens":tokens, "tsa_tags":tags})
    return doc_parsed


conll_folders = ["tsa_conll", 
                "tsa-conll-intensity"
                ] 
assert all( ["conll" in s for s in conll_folders]) # If you remove this, change rule for naming arrow folder.
assert all([Path(s).is_dir() for s in conll_folders ]), "Not all source folders exist"

splits = {"train": "train", "dev": "validation", "test": "test"} # "validation" for HF naming convention
for c_folder in conll_folders:
    arrow_folder = c_folder.replace("conll", "arrow")
    d_sets = {}
    for split in splits:
        conll_txt = Path(c_folder, split+".conll").read_text()
        print("\n",c_folder, split, len(conll_txt.split("\n\n")))
        sents = parse_conll(conll_txt)
        # for sent in sents:
            # sent["labels"] = [label_mapping[tag] for tag in sent["tsa_tags"]]
        d_sets[splits[split]] = Dataset.from_pandas(pd.DataFrame(sents))
        print(d_sets[splits[split]][102])

        DatasetDict(d_sets).save_to_disk(arrow_folder)



 tsa_conll train 8634
{'idx': '701363-07-05', 'tokens': ['Allerede', 'tidlig', 'i', 'filmen', 'gjennomfører', 'Moore', 'et', 'demolition', 'derby', 'gjennom', 'Moskvas', 'gater', 'der', 'Willis', 'på', 'sitt', 'beste', 'overkjører', 'rushtrafikken', 'i', 'sin', 'robuste', 'Mercedes', ',', 'mens', 'bilmerker', 'fra', 'flere', 'verdenshjørner', 'kaster', 'seg', 'rundt', 'i', 'sanseløse', 'turnoppvisninger', 'og', 'blir', 'knust', 'til', 'resirkulasjonsmaterale', 'som', 'tyggegummi-Ladaer', '.'], 'tsa_tags': ['O', 'B-targ-Positive', 'I-targ-Positive', 'I-targ-Positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-targ-Positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


Saving the dataset (0/1 shards):   0%|          | 0/8634 [00:00<?, ? examples/s]


 tsa_conll dev 1531
{'idx': '202259-20-04', 'tokens': ['Utvendig', 'har', 'den', 'svært', 'forsiktig', 'styling', '.'], 'tsa_tags': ['B-targ-Negative', 'O', 'O', 'O', 'O', 'O', 'O']}


Saving the dataset (0/1 shards):   0%|          | 0/8634 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1531 [00:00<?, ? examples/s]


 tsa_conll test 1272
{'idx': '000298-18-02', 'tokens': ['Det', 'er', 'også', 'interessant', 'å', 'se', 'en', 'serie', 'der', 'kvinnen', 'er', 'den', 'sterkeste', 'parten', 'uten', 'at', 'dette', 'plager', 'mannen', 'nevneverdig', '.'], 'tsa_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-targ-Positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


Saving the dataset (0/1 shards):   0%|          | 0/8634 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1531 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1272 [00:00<?, ? examples/s]


 tsa-conll-intensity train 8634
{'idx': '701363-07-05', 'tokens': ['Allerede', 'tidlig', 'i', 'filmen', 'gjennomfører', 'Moore', 'et', 'demolition', 'derby', 'gjennom', 'Moskvas', 'gater', 'der', 'Willis', 'på', 'sitt', 'beste', 'overkjører', 'rushtrafikken', 'i', 'sin', 'robuste', 'Mercedes', ',', 'mens', 'bilmerker', 'fra', 'flere', 'verdenshjørner', 'kaster', 'seg', 'rundt', 'i', 'sanseløse', 'turnoppvisninger', 'og', 'blir', 'knust', 'til', 'resirkulasjonsmaterale', 'som', 'tyggegummi-Ladaer', '.'], 'tsa_tags': ['O', 'B-targ-Positive-3', 'I-targ-Positive-3', 'I-targ-Positive-3', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-targ-Positive-2', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


Saving the dataset (0/1 shards):   0%|          | 0/8634 [00:00<?, ? examples/s]


 tsa-conll-intensity dev 1531
{'idx': '202259-20-04', 'tokens': ['Utvendig', 'har', 'den', 'svært', 'forsiktig', 'styling', '.'], 'tsa_tags': ['B-targ-Negative-2', 'O', 'O', 'O', 'O', 'O', 'O']}


Saving the dataset (0/1 shards):   0%|          | 0/8634 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1531 [00:00<?, ? examples/s]


 tsa-conll-intensity test 1272
{'idx': '000298-18-02', 'tokens': ['Det', 'er', 'også', 'interessant', 'å', 'se', 'en', 'serie', 'der', 'kvinnen', 'er', 'den', 'sterkeste', 'parten', 'uten', 'at', 'dette', 'plager', 'mannen', 'nevneverdig', '.'], 'tsa_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-targ-Positive-2', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}


Saving the dataset (0/1 shards):   0%|          | 0/8634 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1531 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1272 [00:00<?, ? examples/s]