In [1]:
import os

# Language codes to process: comma-separated list read from the LANGS
# environment variable (default "en,ro,de"); blank entries are dropped.
LANGS = [x.strip() for x in os.getenv("LANGS", "en,ro,de").split(",") if x.strip()]
# Output directory, read from the OUT_DIR environment variable.
OUT_DIR = os.getenv("OUT_DIR", "data/dump").strip()

# Echo the effective parameters so the run is self-describing.
print("params:", LANGS, OUT_DIR)
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(OUT_DIR, exist_ok=True)

params: ['en', 'ro', 'de'] data/dump


## Get spaCy models

In [2]:
# Multilingual sentence-segmentation model; load_spacy_model loads it for
# languages that have neither a trained pipeline nor a blank spaCy class.
!python -m spacy download xx_sent_ud_sm

# Trained (large) pipelines to download, keyed by language code.
language_models = {
    "en": "en_core_web_lg",
    "ro": "ro_core_news_lg",
    "de": "de_core_news_lg",
}

# Download a pipeline only for the requested languages that have an entry
# above; for any other requested language nothing is downloaded here.
for lm in LANGS:
    if lm in language_models:
        print(f"downloading {lm}: {language_models[lm]}")
        # {…} is IPython interpolation of the Python value into the shell command.
        !python -m spacy download {language_models[lm]}
    else:
        print(f"use blank")

Collecting xx-sent-ud-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xx-sent-ud-sm
Successfully installed xx-sent-ud-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_sent_ud_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
downloading en: en_core_web_lg
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Get the MultiplEYE json data

In [3]:
# Remove leftovers from a previous run. The glob matches every entry in the
# working directory whose name starts with "languages" (archive and folders).
! rm -rf languages*
# Fetch the release archive with the per-language stimulus JSON files.
! wget https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
# Unpack it; later cells read from 'languages_json' (presumably the folder
# this archive extracts to — confirm against the archive contents).
! unzip languages_json_all.zip

--2026-01-21 21:29:48--  https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/930203766/79211e09-0821-4137-a97f-9395779aa594?sp=r&sv=2018-11-09&sr=b&spr=https&se=2026-01-21T22%3A24%3A02Z&rscd=attachment%3B+filename%3Dlanguages_json_all.zip&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2026-01-21T21%3A23%3A34Z&ske=2026-01-21T22%3A24%3A02Z&sks=b&skv=2018-11-09&sig=Zx2PKJ8OzHuShDFxhcRTq0kYEutl60vL4kTE4PRB3N4%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2OTAzMTI4OCwibmJmIjoxNzY5MDMwOTg4LCJwYXRoIjoicmVsZWFzZWF

In [4]:
# Language codes for which load_spacy_model loads a trained pipeline named
# "<code>_core_<genre>_<sm|lg>".
SPACY_LANGUAGES = ["ca", "de", "el", "en", "es", "fr", "hr", "it", "lt", "mk", "nl", "pl", "pt", "ro", "ru", "sl", "sv", "uk", "zh"]

# Language code -> English language name. The name is written to the
# "language" column by stimuli2csv, and the keys define LANGUAGES below.
# Commented-out entries are excluded from LANGUAGES, so load_all_json skips
# their JSON files even if the code is listed in SPACY_LANGUAGES.
CODE2LANG = {
    "ar": "Arabic",
    "ca": "Catalan",
    "cs": "Czech",
    "de": "German",
    "gsw": "Swiss German",
    "el": "Greek",
    "en": "English",
    #"es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    #"fr": "French",
    #"he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "it": "Italian",
    "kl": "Kalaallisut",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "rm": "Romansh",
    "ro": "Romanian",
    "ru": "Russian",
    "sl": "Slovenian",
    "sq": "Albanian",
    "sv": "Swedish",
    "tr": "Turkish",
    "uk": "Ukrainian",
    #"yue": "Cantonese",
    "zh": "Chinese"
}

# All supported language codes (the keys of CODE2LANG, in declaration order).
LANGUAGES = list(CODE2LANG.keys())

## Load data

In [5]:
import os
import json
import spacy

def load_all_json(lang_folder):
    """Load the stimulus JSON files of the selected languages.

    Every ``*.json`` entry of ``lang_folder`` is considered. The language
    code is what remains of the file name after removing ``.json`` and the
    ``multipleye_stimuli_experiment_`` prefix; the code ``zd`` is mapped to
    ``gsw``. A file is loaded only when its code appears in both the
    module-level ``LANGUAGES`` and ``LANGS`` lists.

    Args:
        lang_folder: Directory that contains the JSON files.

    Returns:
        dict mapping language code to the parsed JSON content.
    """
    loaded = {}
    for fname in os.listdir(lang_folder):
        if not fname.endswith('.json'):
            continue
        code = fname.replace('.json', '').replace('multipleye_stimuli_experiment_', '')
        # Files named with 'zd' are stored under the 'gsw' code.
        code = 'gsw' if code == 'zd' else code
        if code in LANGUAGES and code in LANGS:
            path = os.path.join(lang_folder, fname)
            with open(path, 'r', encoding='utf-8') as handle:
                loaded[code] = json.load(handle)
    return loaded

In [6]:
# Load the stimuli of the selected languages, keyed by language code.
all_data = load_all_json('languages_json')
# Sanity check: print the first stimulus of each language.
# NOTE(review): this prints the complete first stimulus (all page texts),
# which makes for a very large cell output.
for k,v in all_data.items():
  print(k, v[0])

en {'stimulus_id': 1, 'stimulus_name': 'PopSci_MultiplEYE', 'stimulus_type': 'experiment', 'pages': ['The MultiplEYE Project\n\nThe name "MultiplEYE" is a wordplay combining "multilingualism" or "multiple languages" with "eye" from "eye-tracking". MultiplEYE is a COST Action funded by the European Union. COST Actions are research networks supported by the European Cooperation in Science and Technology or COST for short. As a funding organisation, COST supports our growing network of researchers across Europe and beyond by providing financial assistance for conducting different networking activities.', 'These activities include working group meetings, training schools to share skills with younger researchers, and scientific research visits. The project title of the MultiplEYE COST Action is: Enabling multilingual eye-tracking data collection for human and machine language processing research. This means that the MultiplEYE COST Action aims to foster an interdisciplinary network of resea

## Prepare spaCy code to generate template csv files

In [8]:
# NOTE(review): LANG_FOLDER and IN_DIR are not referenced by the code visible
# in this notebook — confirm whether they are still needed.
LANG_FOLDER = "languages_json"
# Single-slot cache managed by get_nlp: the currently loaded spaCy pipeline...
NLP_MODEL = None
# ...and the language code it was loaded for ('' means nothing loaded yet).
CURRENT_LANG = ''
IN_DIR = 'languages_json/'

from spacy.util import get_lang_class


def exists_spacy_blank(lang_code):
    """Tell whether spaCy provides a language class for ``lang_code``.

    Args:
        lang_code: Language code to look up with ``get_lang_class``.

    Returns:
        True when ``get_lang_class(lang_code)`` succeeds, False when it
        raises.
    """
    try:
        get_lang_class(lang_code)
    except Exception:
        # Only ordinary errors mean "no such language". A bare `except:`
        # would also swallow KeyboardInterrupt / SystemExit.
        return False
    return True

def load_spacy_model(lang_code, small=True):
    """Load the spaCy pipeline used to annotate texts in ``lang_code``.

    Selection order:

    1. ``rm``: the Italian ``it_core_news_lg`` pipeline with every listed
       component disabled (only the morphologizer stays enabled), plus a
       rule-based sentencizer.
    2. ``gsw``: the German ``de_core_news_lg`` pipeline, unchanged.
    3. Codes in ``SPACY_LANGUAGES``: ``<code>_core_<genre>_<sm|lg>`` where
       genre is ``web`` for ``zh``/``en`` and ``news`` otherwise, plus a
       sentencizer.
    4. Codes with a spaCy language class: a blank pipeline plus sentencizer.
    5. Anything else: ``xx_sent_ud_sm`` plus sentencizer.

    Args:
        lang_code: Language code of the texts to process.
        small: When True load the ``sm`` variant, otherwise ``lg``. Only
            affects case 3.

    Returns:
        The loaded spaCy ``Language`` object.
    """
    # Special cases come first so they cannot be shadowed by SPACY_LANGUAGES.
    # Previously an 'rm' check sat inside the SPACY_LANGUAGES branch, where it
    # was unreachable and would have returned '' instead of a pipeline.
    if lang_code == "rm":
        model = spacy.load("it_core_news_lg")
        # keep 'morphologizer' ?
        model.disable_pipes('tok2vec', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler', 'ner')
        # With the parser disabled nothing sets sentence boundaries, and the
        # callers iterate doc.sents, so a sentencizer is required here.
        model.add_pipe("sentencizer")
        return model

    if lang_code == 'gsw':
        return spacy.load('de_core_news_lg')

    if lang_code in SPACY_LANGUAGES:
        genre = 'web' if lang_code in {'zh', 'en'} else 'news'
        model_name = f'{lang_code}_core_{genre}_{"sm" if small else "lg"}'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
    elif exists_spacy_blank(lang_code):
        print(f"Loading model blank model for {lang_code}")
        model = spacy.blank(lang_code)
    else:
        model_name = 'xx_sent_ud_sm'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)

    model.add_pipe("sentencizer")
    return model


def get_nlp(lang_code, small=False):
    """Return the pipeline for ``lang_code``, keeping one model in memory.

    The module-level ``NLP_MODEL`` / ``CURRENT_LANG`` pair acts as a
    single-slot cache: a request for a different language releases the
    current model and loads the new one through ``load_spacy_model``.

    Args:
        lang_code: Language code of the pipeline to return.
        small: Forwarded to ``load_spacy_model`` when a load happens. It is
            not part of the cache key, so it has no effect while
            ``lang_code`` is already the loaded language.

    Returns:
        The cached or freshly loaded pipeline.
    """
    global NLP_MODEL, CURRENT_LANG
    if lang_code != CURRENT_LANG:
        print(f"Deleting model for {CURRENT_LANG}")
        # Release the old model by rebinding rather than `del`: if the load
        # below raises, the globals stay defined and consistent (no model, no
        # language). With `del`, a failed load left NLP_MODEL unbound while
        # CURRENT_LANG still named the old language, so a later call for that
        # language raised NameError.
        NLP_MODEL = None
        CURRENT_LANG = ''
        print(f"Loading model for {lang_code}")
        NLP_MODEL = load_spacy_model(lang_code, small=small)
        CURRENT_LANG = lang_code
    return NLP_MODEL


In [9]:
def feats_str(token):
    """Render a token's morphological features as one string.

    Features come from ``token.morph.to_dict()`` and are written as
    ``Name=Value`` pairs, sorted by name and joined with ``|``. A list or
    tuple value is joined with commas.

    Args:
        token: Object exposing ``morph`` with a ``to_dict()`` method.

    Returns:
        The feature string, or ``"_"`` when the token has no features.
    """
    morph = token.morph
    feats = morph.to_dict() if morph else None
    if not feats:
        return "_"

    def render(name):
        value = feats[name]
        if isinstance(value, (list, tuple)):
            value = ','.join(value)
        return f"{name}={value}"

    return "|".join(render(name) for name in sorted(feats))


def get_head(token, sent):
    """Compute the head index and relation label of ``token``.

    A token is treated as the sentence root when it is its own head or its
    dependency label is ``"ROOT"``; the result is then ``(0, "root")``.
    Otherwise the head is the 1-based position of ``token.head`` inside
    ``sent`` and the label is the lower-cased ``token.dep_`` (``"_"`` when
    the label is empty).

    Args:
        token: Token exposing ``head``, ``dep_`` and, on its head, ``i``.
        sent: Sentence span exposing ``start``.

    Returns:
        Tuple ``(head, deprel)``.
    """
    is_root = token.head == token or token.dep_ == "ROOT"
    if is_root:
        return 0, "root"

    # Document-level index -> position inside the sentence, counted from 1.
    position = token.head.i - sent.start + 1
    label = token.dep_
    return position, (label.lower() if label else "_")


def get_misc(token, include_ner=True):
    """Build the MISC field for ``token``.

    Args:
        token: Token exposing ``whitespace_``, ``ent_iob_`` and
            ``ent_type_``.
        include_ner: When True, add a ``NER=<iob>-<type>`` entry for tokens
            that are part of a named entity.

    Returns:
        The entries joined with ``|``, or ``"_"`` when there are none.
    """
    misc_parts = []
    if not token.whitespace_:
        misc_parts.append("SpaceAfter=No")
    # ent_iob_ is "O" outside an entity but "" when no entity annotation was
    # set at all (blank pipelines, NER disabled). Testing only `!= "O"`
    # produced a meaningless "NER=-" entry for every such token.
    if include_ner and token.ent_iob_ not in ("", "O"):
        misc_parts.append(f"NER={token.ent_iob_}-{token.ent_type_}")
    return "|".join(misc_parts) if misc_parts else "_"


def iter_pages(stimuli, nlp):
    """Lazily run ``nlp`` over every page of every stimulus.

    Args:
        stimuli: Iterable of dicts with the keys ``stimulus_id``,
            ``stimulus_name`` and ``pages`` (an iterable of page texts).
        nlp: Callable applied to each page text.

    Yields:
        Tuples ``(stimulus_id, stimulus_name, page_number, nlp(page_text))``
        with page numbers starting at 1 within each stimulus.
    """
    for stimulus in stimuli:
        stim_id = stimulus["stimulus_id"]
        stim_name = stimulus["stimulus_name"]
        page_no = 0
        for text in stimulus["pages"]:
            page_no += 1
            yield stim_id, stim_name, page_no, nlp(text)

def stimuli2csv(stimuli, lang_code, level="page", small=False):
    """Build the token-level template table for one language.

    Each page of each stimulus is processed with the pipeline returned by
    ``get_nlp``. One row is produced per token, in document order, and one
    ``<eos>`` marker row is appended after the last token of each sentence.

    Args:
        stimuli: Stimulus dicts as consumed by ``iter_pages``.
        lang_code: Language code; must be a key of ``CODE2LANG``.
        level: Currently unused; kept so existing calls keep working.
        small: Forwarded to ``get_nlp``.

    Returns:
        pandas DataFrame with the columns language, language_code,
        stimulus_name, page, token, is_alpha, is_stop, is_punct, lemma,
        upos, xpos, feats, head, deprel, deps, misc. With no input rows the
        frame is empty.

    Note:
        ``pd`` must be imported before this function is called.
    """
    nlp = get_nlp(lang_code, small=small)
    rows = []
    # iter_pages already returns the parsed Doc for each page, so it is used
    # directly. Re-running nlp on doc.text doubled the processing time.
    for _sid, sname, page, doc in iter_pages(stimuli, nlp):
        for sentence in doc.sents:
            for token in sentence:
                head, deprel = get_head(token, sentence)
                rows.append(
                    {
                        "language": CODE2LANG[lang_code],
                        "language_code": lang_code,
                        "stimulus_name": sname,
                        "page": page,
                        "token": token.text,
                        "is_alpha": token.is_alpha,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "lemma": token.lemma_,
                        "upos": token.pos_,
                        "xpos": token.tag_,
                        "feats": feats_str(token),
                        "head": head,
                        "deprel": deprel,
                        "deps": "_",
                        "misc": get_misc(token, include_ner=True),
                    }
                )
            # Sentence boundary marker: same columns, empty annotations.
            rows.append(
                {
                    "language": CODE2LANG[lang_code],
                    "language_code": lang_code,
                    "stimulus_name": sname,
                    "page": page,
                    "token": "<eos>",
                    "is_alpha": False,
                    "is_stop": False,
                    "is_punct": False,
                    "lemma": "",
                    "upos": "",
                    "xpos": "",
                    "feats": "",
                    "head": "",
                    "deprel": "",
                    "deps": "",
                    "misc": "",
                }
            )

    # Rows stay in document order. The earlier sort_values(...) result was
    # discarded on the next line, and it raised KeyError for an empty row list.
    return pd.DataFrame(rows)

## Generate csv templates

In [10]:
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

# Token-level template table per language code, filled by the loop below.
preproc = defaultdict(dict)
for lang_code, data in tqdm(all_data.items()):
    # Defensive filter; load_all_json already keeps only codes in LANGS.
    if lang_code not in LANGS:
        continue
    # small=False selects the large ("lg") pipelines in load_spacy_model.
    preproc[lang_code] = stimuli2csv(data, lang_code, small=False)

  0%|          | 0/3 [00:00<?, ?it/s]

Deleting model for 
Loading model for en
Loading model en_core_web_lg for en


 33%|███▎      | 1/3 [00:09<00:19,  9.83s/it]

Deleting model for en
Loading model for ro
Loading model ro_core_news_lg for ro


 67%|██████▋   | 2/3 [00:18<00:08,  8.97s/it]

Deleting model for ro
Loading model for de
Loading model de_core_news_lg for de


100%|██████████| 3/3 [00:31<00:00, 10.41s/it]


In [11]:
import os
import pandas as pd
import shutil
from google.colab import files

def stimuli2sentences(stimuli, lang_code, small=False):
    """Split the stimuli of one language into a sentence table.

    Pages are segmented with the pipeline returned by ``get_nlp``. The text
    of each detected sentence is further split at newlines; every piece is
    stripped and empty pieces are dropped. Each remaining piece becomes one
    row, numbered consecutively from 0 across all stimuli and pages.

    Args:
        stimuli: Stimulus dicts as consumed by ``iter_pages``.
        lang_code: Language code, also written to the ``language`` column.
        small: Forwarded to ``get_nlp``.

    Returns:
        pandas DataFrame with the columns stimulus_id, stimulus_name,
        language, screen_id, global_sent_index and text.
    """
    nlp = get_nlp(lang_code, small=small)
    records = []

    for sid, sname, page, doc in iter_pages(stimuli, nlp):
        for sent in doc.sents:
            # Splitting on '\n' also covers text without a newline: the
            # result is then the single stripped sentence.
            for line in sent.text.split('\n'):
                piece = line.strip()
                if not piece:
                    continue
                records.append({
                    "stimulus_id": sid,
                    "stimulus_name": sname,
                    "language": lang_code,
                    "screen_id": page,
                    # Running index == number of rows emitted so far.
                    "global_sent_index": len(records),
                    "text": piece
                })

    return pd.DataFrame(records)

def download_processed_data(all_data, languages=['en', 'ro', 'de']):
    """Export one sentence CSV per language, zip them and start a download.

    The ``processed_sentences`` directory is recreated from scratch, one
    ``<lang>_sentences.csv`` is written for every requested language that is
    present in ``all_data``, the directory is archived as
    ``eyetracking_sentences.zip`` and the archive is handed to
    ``files.download``.

    Args:
        all_data: Mapping of language code to stimuli.
        languages: Language codes to export; codes missing from
            ``all_data`` are skipped silently.
    """
    out_dir = "processed_sentences"

    # Start from an empty output directory on every call.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    available = (code for code in languages if code in all_data)
    for code in available:
        print(f"Processing {code}...")
        sentences = stimuli2sentences(all_data[code], code)

        target = f"{out_dir}/{code}_sentences.csv"
        sentences.to_csv(target, index=False)
        print(f"Saved: {target} ({len(sentences)} sentences)")

    print("Zipping files...")
    shutil.make_archive("eyetracking_sentences", 'zip', out_dir)
    files.download("eyetracking_sentences.zip")

# NOTE(review): the language list is hard-coded here instead of using LANGS —
# confirm this is intentional when LANGS is overridden via the environment.
download_processed_data(all_data, languages=['en', 'ro', 'de'])

Processing en...
Deleting model for de
Loading model for en
Loading model en_core_web_lg for en
Saved: processed_sentences/en_sentences.csv (312 sentences)
Processing ro...
Deleting model for en
Loading model for ro
Loading model ro_core_news_lg for ro
Saved: processed_sentences/ro_sentences.csv (327 sentences)
Processing de...
Deleting model for ro
Loading model for de
Loading model de_core_news_lg for de
Saved: processed_sentences/de_sentences.csv (342 sentences)
Zipping files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>