To run this script, download the [mocha.tar.gz](https://github.com/anthonywchen/MOCHA/blob/main/data/mocha.tar.gz) archive and extract it to `data/mocha`. This script uses the following libraries:
- `spaCy` for tokenization;
- HuggingFace `datasets` for evaluation.

In [3]:
# Data loading
import json

# Tokenization
import spacy
!python3 -m spacy download en_core_web_sm

# Evaluation
import datasets

In [20]:
# Name (lowercase) of the dataset to run the analysis for. It is used as a
# top-level key into the loaded JSON; set it to None to keep the whole file.
DATASET = "narrativeqa"

# Data directory: where the extracted MOCHA data files are looked up
DATA_DIR = "../data/mocha/"

# Full filepath of the file to load. DATA_DIR already ends with "/", so the
# resulting path contains a double slash (normally treated as one separator).
FILEPATH = f"{DATA_DIR}/dev.json"

# Evaluation tokenizer: the spaCy "en_core_web_sm" pipeline loaded with the
# parser and NER components disabled, as it is only used for tokenization here
EVALUATION_TOKENIZER = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [22]:
# Read the JSON file inside a context manager so the handle is closed right
# away, and decode explicitly as UTF-8 instead of the locale's default encoding.
with open(FILEPATH, encoding="utf-8") as fin:
    data = json.load(fin)

# Keep only the selected dataset's entries; DATASET = None keeps everything
if DATASET is not None:
    data = data[DATASET]

print("Number of examples:", len(data))
# Show the first (key, value) pair as the cell output
next(iter(data.items()))

Number of examples: 890


('0005c7718ff653683df879622efb02d1',
 {'candidate': 'his distant relative pascal rougon',
  'context': "The plot centres on the neurotic young priest Serge Mouret, first seen in La ConquĂŞte de Plassans, as he takes his orders and becomes the parish priest for the uninterested village of Artauds. The inbred villagers have no interest in religion and Serge is portrayed giving several wildly enthusiastic Masses to his completely empty, near-derelict church. Serge not only seems unperturbed by this state of affairs but actually appears to have positively sought it out especially, for it gives him time to contemplate religious affairs and to fully experience the fervour of his faith. Eventually he has a complete nervous breakdown and collapses into a near-comatose state, whereupon his distant relative, the unconventional doctor Pascal Rougon (the central character of the last novel in the series, 1893's Le Docteur Pascal), places him in the care of the inhabitants of a nearby derelict stat

In [25]:
def tokenize(texts, tokens=True, sep: str = " ", tokenizer=EVALUATION_TOKENIZER) -> list:
    """Tokenize one text or a collection of texts.

    Args:
        texts: a single string, or an iterable of strings.
        tokens: when true, each text is mapped to its list of token strings;
            otherwise each text is mapped to a one-element list holding the
            token strings joined by ``sep``.
        sep: separator used to join the tokens when ``tokens`` is false.
        tokenizer: callable that turns a string into an iterable of tokens;
            every token is converted with ``str``.

    Returns:
        The result for that text alone when ``texts`` is a string, otherwise
        a list with one result per input text, in input order.
    """
    single = isinstance(texts, str)
    batch = [texts] if single else texts

    def _convert(text):
        # One text -> token strings, optionally re-joined into a single string
        pieces = [str(tok) for tok in tokenizer(text)]
        return pieces if tokens else [sep.join(pieces)]

    output = [_convert(text) for text in batch]
    # A lone string comes back unwrapped, mirroring how it was passed in
    return output[0] if single else output

# Sanity check: a single string yields a flat token list, while a list of
# strings yields one token list per input text
assert all(
    tokenize(given) == expected
    for given, expected in [
        ("Hello, world!", ['Hello', ',', 'world', '!']),
        (["Hello, world!"], [['Hello', ',', 'world', '!']]),
        (
            ["Hello, world!", "Ola, mundo!"],
            [['Hello', ',', 'world', '!'], ["Ola", ",", "mundo", "!"]],
        ),
    ]
)

In [None]:
# We will now proceed to the tokenization
