In [3]:
import os

# Run parameters, overridable through environment variables:
#   LANGS   - comma-separated language codes (blank entries are dropped)
#   OUT_DIR - directory that receives the generated files
LANGS = [code for code in map(str.strip, os.getenv("LANGS", "sq").split(",")) if code]
OUT_DIR = os.getenv("OUT_DIR", "data/dump").strip()

print("params:", LANGS, OUT_DIR)
# Create the output directory up front so later cells can write into it.
os.makedirs(OUT_DIR, exist_ok=True)

params: ['sq'] data/dump


## Get spaCy models

In [None]:
# for Turkish, we'd have to run a separate pipeline with a different spacy version
#! pip install "tr_core_news_lg @ https://huggingface.co/turkish-nlp-suite/tr_core_news_lg/resolve/main/tr_core_news_lg-1.0-py3-none-any.whl"

In [4]:
# Always fetch the multi-language sentence model: load_spacy_model() loads it
# for language codes that have neither a dedicated model nor a blank pipeline.
!python -m spacy download xx_sent_ud_sm

# Maps a language code to the spaCy package to download for it.
# Currently empty, so the loop below never downloads anything extra.
language_models = {
}

for lm in LANGS:
    if lm in language_models:
        print(f"downloading {lm}: {language_models[lm]}")
        # IPython expands {language_models[lm]} before running the shell command.
        !python -m spacy download {language_models[lm]}
    else:
        # No dedicated package is listed for this language code.
        print(f"use blank")

Collecting xx-sent-ud-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xx-sent-ud-sm
Successfully installed xx-sent-ud-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_sent_ud_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
use blank


## Get the MultiplEYE json data

In [5]:
# Remove everything in the working directory whose name starts with
# "languages" (previous archive and extracted files) so the cell can be re-run.
! rm -rf languages*
# Download the stimuli archive from the GitHub release.
! wget https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
# Extract it; later cells read the JSON files from "languages_json"
# (presumably the folder contained in the archive - verify after unzipping).
! unzip languages_json_all.zip

--2025-10-11 11:12:02--  https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/930203766/79211e09-0821-4137-a97f-9395779aa594?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-10-11T12%3A09%3A10Z&rscd=attachment%3B+filename%3Dlanguages_json_all.zip&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-10-11T11%3A08%3A11Z&ske=2025-10-11T12%3A09%3A10Z&sks=b&skv=2018-11-09&sig=4G4rdchKGCWjcdhaIkRShK%2BH4gmOTDutAxVzq5hZ958%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2MDE4MTQyMiwibmJmIjoxNzYwMTgxMTIyLCJwYXRoIjoicmVsZWFzZWF

In [12]:
# Language codes for which load_spacy_model() loads a pretrained
# "<code>_core_<genre>_<size>" pipeline. Empty: none are used in this run.
SPACY_LANGUAGES = []

# Language code -> human-readable language name written to the output files.
CODE2LANG = {"sq": "Albanian"}

# All language codes this notebook knows how to process.
LANGUAGES = list(CODE2LANG)

## Load data

In [13]:
import os
import json
import spacy

def load_all_json(lang_folder):
    """Load the stimulus JSON files found directly inside ``lang_folder``.

    The language code is taken from the file name by removing the ``.json``
    extension and the ``multipleye_stimuli_experiment_`` prefix; the code
    ``zd`` is renamed to ``gsw``. Files whose code is missing from either
    ``LANGUAGES`` or ``LANGS`` are skipped.

    Args:
        lang_folder: path of the directory containing the ``*.json`` files.

    Returns:
        dict mapping language code to the parsed JSON content of its file.
    """
    loaded = {}
    for fname in os.listdir(lang_folder):
        if not fname.endswith('.json'):
            continue
        code = fname.replace('.json', '').replace('multipleye_stimuli_experiment_', '')
        code = 'gsw' if code == 'zd' else code
        # Keep only languages that are both known and requested.
        if code in LANGUAGES and code in LANGS:
            json_path = os.path.join(lang_folder, fname)
            with open(json_path, 'r', encoding='utf-8') as handle:
                loaded[code] = json.load(handle)
    return loaded

In [14]:
# Load the requested languages and print the first stimulus of each one
# as a quick sanity check.
all_data = load_all_json('languages_json')
for k in all_data:
    v = all_data[k]
    print(k, v[0])

sq {'stimulus_id': 1, 'stimulus_name': 'PopSci_MultiplEYE', 'stimulus_type': 'experiment', 'pages': ['Projekti MultiplEYE\n\nEmri “MultiplEYE” është një lojë fjalësh që kombinon “multilingualism”\n(shumëgjuhësinë) apo “multiple languages”  (gjuhët e shumta) me “sy”, duke iu\nreferuar “eye-tracking” (gjurmimit të syve). MultiplEYE është një COST Action i\nfinancuar nga Bashkimi Evropian. COST Actions janë rrjete kërkimore të mbështetura\nnga Bashkëpunimi Evropian në Shkencë dhe Teknologji ose shkurt COST. Si organizatë\nfinancuese, COST mbështet rrjetin tonë kërkimor në rritje në Evropë dhe më gjerë,\nduke ofruar ndihmë financiare për aktivitete të ndryshme.', 'Këto aktivitete përfshijnë takimet e grupeve të punës, shkollat e trajnimit për të\ntrajnuar për aftësi të reja studiuesit e rinj si edhe vizita kërkimore shkencore.\nTitulli i projektit COST MultiplEYE nënkupton: Mundësimi i mbledhjes së të dhënave\nshumëgjuhëshe përmes gjurmimit të syve për hulumtim rreth përpunimit të gjuhës\n

## Prepare spaCy code to generate template csv files

In [65]:

# Input locations of the extracted stimulus JSON files.
LANG_FOLDER = "languages_json"
IN_DIR = "languages_json/"

# Single-slot pipeline cache managed by get_nlp(): the pipeline currently
# held in memory and the language code it was loaded for.
NLP_MODEL = None
CURRENT_LANG = ""

from spacy.util import get_lang_class


def exists_spacy_blank(lang_code):
    """Tell whether spaCy can resolve a language class for ``lang_code``.

    Args:
        lang_code: language code handed to ``spacy.util.get_lang_class``.

    Returns:
        True when ``get_lang_class`` succeeds, False when it raises.
    """
    try:
        get_lang_class(lang_code)
    except Exception:
        # Stay best-effort for any lookup failure, but (unlike a bare
        # ``except:``) let KeyboardInterrupt / SystemExit propagate.
        return False
    return True

def load_spacy_model(lang_code, small=True):
    """Load the spaCy pipeline to use for ``lang_code``.

    Selection order:
      1. codes listed in ``SPACY_LANGUAGES``: the pretrained
         ``<code>_core_<genre>_<sm|lg>`` package plus a sentencizer;
      2. ``rm``: the Italian large model with its statistical components
         disabled, plus a sentencizer;
      3. ``gsw``: the German large model as shipped;
      4. any code spaCy has a language class for: a blank pipeline plus a
         sentencizer;
      5. everything else: ``xx_sent_ud_sm`` plus a sentencizer.

    Args:
        lang_code: language code to load a pipeline for.
        small: only used in case 1; picks the ``sm`` package when true,
            ``lg`` otherwise.

    Returns:
        The loaded pipeline. (The empty string is returned when ``rm`` is
        listed in ``SPACY_LANGUAGES``; see the note in the code.)
    """
    model = None
    if lang_code in SPACY_LANGUAGES:
        genre = 'news'
        if lang_code in {'zh', 'en'}:
            genre = 'web'
        if lang_code == 'rm':
            # NOTE(review): returns '' instead of a pipeline; callers that use
            # the result as a pipeline would fail. Intent unclear - confirm.
            return ''
        model_name = f'{lang_code}_core_{genre}_{"sm" if small else "lg"}'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    elif lang_code == "rm":
        model = spacy.load("it_core_news_lg")
        # keep 'morphologizer' ?
        model.disable_pipes('tok2vec', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler', 'ner')
        # The parser was the only component setting sentence boundaries;
        # with it disabled, iterating doc.sents would raise. Add a rule-based
        # sentencizer, as the other branches do.
        model.add_pipe("sentencizer")
    elif lang_code == 'gsw':
        model = spacy.load('de_core_news_lg')
    elif exists_spacy_blank(lang_code):
        print(f"Loading model blank model for {lang_code}")
        model = spacy.blank(lang_code)
        model.add_pipe("sentencizer")
    else:
        # Multi-language fallback for codes spaCy has no language class for.
        model_name = 'xx_sent_ud_sm'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    return model


def get_nlp(lang_code, small=False):
    """Return the pipeline for ``lang_code``, keeping only one in memory.

    The pipeline is cached in the module globals ``NLP_MODEL`` /
    ``CURRENT_LANG``. When a different language is requested, the cached
    pipeline is released before the new one is loaded, to avoid holding
    several models at the same time.

    Args:
        lang_code: language code passed to ``load_spacy_model``.
        small: forwarded to ``load_spacy_model``. Note that the cache is keyed
            on ``lang_code`` only, so a different ``small`` value for the
            already-loaded language returns the cached pipeline.

    Returns:
        Whatever ``load_spacy_model`` returned for ``lang_code``.
    """
    global NLP_MODEL, CURRENT_LANG
    if lang_code != CURRENT_LANG:
        if globals().get("NLP_MODEL") is not None:
            print(f"Deleting model for {CURRENT_LANG}")
        else:
            print("No model to delete")
        # Rebind to None instead of `del`: this drops the reference just the
        # same, but leaves the globals defined and consistent if the load
        # below raises (a later call then simply retries the load).
        NLP_MODEL = None
        CURRENT_LANG = ''
        print(f"Loading model for {lang_code}")
        NLP_MODEL = load_spacy_model(lang_code, small=small)
        CURRENT_LANG = lang_code
    return NLP_MODEL


In [66]:
from spacy.symbols import ORTH
import re

def feats_str(token):
    """Render a token's morphological features as ``Key=Value|Key=Value``.

    Features come from ``token.morph.to_dict()`` and are emitted in sorted key
    order; a list or tuple value is joined with commas.

    Args:
        token: object exposing a ``morph`` attribute with ``to_dict()``.

    Returns:
        The feature string, or ``"_"`` when there are no features.
    """
    morph = token.morph
    features = morph.to_dict() if morph else {}
    if not features:
        return "_"

    def render(name):
        value = features[name]
        if isinstance(value, (list, tuple)):
            value = ",".join(value)
        return f"{name}={value}"

    return "|".join(render(name) for name in sorted(features))


def get_head(token, sent):
    """Compute the head index and dependency label of ``token``.

    Args:
        token: token with ``head``, ``dep_`` and ``i`` attributes.
        sent: the sentence containing the token; its ``start`` offset is used
            to convert the head's index into a sentence-relative one.

    Returns:
        ``(0, "root")`` when the token is its own head or is labelled
        ``ROOT``; otherwise ``(head, deprel)`` where ``head`` is the 1-based
        position of the head within the sentence and ``deprel`` is the
        lower-cased label, or ``"_"`` when the label is empty.
    """
    if token.head == token or token.dep_ == "ROOT":
        return 0, "root"

    position = token.head.i - sent.start + 1  # 1-based in sentence
    label = token.dep_
    return position, (label.lower() if label else "_")


def get_misc(token, include_ner=True):
    """Build the MISC column value for ``token``.

    Args:
        token: token with ``whitespace_``, ``ent_iob_`` and ``ent_type_``.
        include_ner: when true, add an ``NER=<iob>-<type>`` entry for tokens
            that carry an entity tag.

    Returns:
        The entries joined with ``|`` (``SpaceAfter=No`` first, when the token
        has no trailing whitespace), or ``"_"`` when there are none.
    """
    misc_parts = []
    if not token.whitespace_:
        misc_parts.append("SpaceAfter=No")
    # ent_iob_ is "" when no entity tag has been set (pipelines without an
    # NER component) and "O" outside entities; neither is an annotation.
    # Testing only against "O" produced a bogus "NER=-" on every token.
    if include_ner and token.ent_iob_ not in ("", "O"):
        misc_parts.append(f"NER={token.ent_iob_}-{token.ent_type_}")
    return "|".join(misc_parts) if misc_parts else "_"


def iter_pages(stimuli, nlp):
    """Lazily process every page of every stimulus with ``nlp``.

    Args:
        stimuli: iterable of mappings with the keys ``"stimulus_id"``,
            ``"stimulus_name"`` and ``"pages"`` (an iterable of page texts).
        nlp: callable applied to each page text.

    Yields:
        ``(stimulus_id, stimulus_name, page_number, nlp(page_text))`` tuples,
        with page numbers starting at 1 within each stimulus.
    """
    for stimulus in stimuli:
        stim_id = stimulus["stimulus_id"]
        stim_name = stimulus["stimulus_name"]
        page_no = 0
        for text in stimulus["pages"]:
            page_no += 1
            yield stim_id, stim_name, page_no, nlp(text)

def stimuli2csv(stimuli, lang_code, level="page", small=False):
    """Tokenise all pages of ``stimuli`` and return one DataFrame row per token.

    Each page is processed with the pipeline returned by ``get_nlp``, extended
    with two retokenising components and a few tokenizer special cases. Every
    token becomes a row with CoNLL-U-like columns, and every sentence is
    followed by an ``<eos>`` marker row.

    Args:
        stimuli: iterable of dicts with the keys ``"stimulus_id"``,
            ``"stimulus_name"`` and ``"pages"`` (list of page texts).
        lang_code: language code; must be a key of ``CODE2LANG``.
        level: currently unused; kept so existing callers keep working.
        small: forwarded to ``get_nlp``.

    Returns:
        pandas.DataFrame with the columns language, language_code,
        stimulus_name, page, token, is_alpha, is_stop, is_punct, lemma, upos,
        xpos, feats, head, deprel, deps and misc, in processing order
        (stimulus, page, sentence, token). Empty when there is nothing to
        process.
    """
    rows = []
    nlp = get_nlp(lang_code, small=small)

    # Tokens made of "t'" or "s'" immediately followed by word characters.
    pat = re.compile(r"^([ts])'(\w+)$", re.UNICODE)

    # NOTE(review): the two components and the special cases below look
    # Albanian-specific but are installed for every language code - confirm
    # before processing other languages.
    @spacy.Language.component("split_t_s_apostrophe")
    def split_t_s_apostrophe(doc):
        """Split "t'X" / "s'X" tokens into the clitic ("t'", "s'") and "X"."""
        with doc.retokenize() as retok:
            for tok in doc:
                if pat.match(tok.text):
                    clitic, rest = tok.text[:2], tok.text[2:]
                    retok.split(tok, [clitic, rest], [tok.head, tok.head])
        return doc

    @spacy.Language.component("split_enclitics")
    def split_enclitics(doc):
        """Split the fused forms "ia", "iu" and "ta" into two tokens each."""
        #mapping = {"ia": ["i","a"], "iu": ["i","u"], "ta": ["të","a"]}
        mapping = {"ia": ["i","a"], "iu": ["i","u"], "ta": ["t","a"]}

        with doc.retokenize() as retok:
            for tok in doc:
                parts = mapping.get(tok.text)
                if parts:
                    retok.split(tok, parts, [tok.head] * len(parts))
        return doc

    # get_nlp() hands back the same cached pipeline on repeated calls for one
    # language, and add_pipe() refuses a name that is already present, so only
    # install the components once. Each is inserted first, which leaves
    # "split_enclitics" running before "split_t_s_apostrophe".
    for pipe_name in ("split_t_s_apostrophe", "split_enclitics"):
        if pipe_name not in nlp.pipe_names:
            nlp.add_pipe(pipe_name, first=True)

    # Keep these hyphenated forms as single tokens.
    special_cases = {"eye-tracking" : [{ORTH: "eye-tracking"}],
                     "Ottenskjold-it" : [{ORTH: "Ottenskjold-it"}],
                     "Ottenskjold-i" : [{ORTH: "Ottenskjold-i"}],
                     "BE-së" : [{ORTH: "BE-së"}]}

    for token, special_case in special_cases.items():
        nlp.tokenizer.add_special_case(token, special_case)

    # iter_pages() is lazy, so each page is parsed here, after the pipeline
    # has been fully configured; the yielded doc is used directly instead of
    # parsing its text a second time.
    for _sid, sname, page, document in iter_pages(stimuli, nlp):
        for sentence in document.sents:
            # Columns shared by all rows of this sentence.
            shared = {
                "language": CODE2LANG[lang_code],
                "language_code": lang_code,
                "stimulus_name": sname,
                "page": page,
            }
            for token in sentence:
                head, deprel = get_head(token, sentence)
                rows.append(
                    {
                        **shared,
                        "token": token.text,
                        "is_alpha": token.is_alpha,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "lemma": token.lemma_,
                        "upos": token.pos_,
                        "xpos": token.tag_,
                        "feats": feats_str(token),
                        "head": head,
                        "deprel": deprel,
                        "deps": "_",
                        "misc": get_misc(token, include_ner=True),
                    }
                )
            # Sentence terminator row: same identifiers, empty annotations.
            rows.append(
                {
                    **shared,
                    "token": "<eos>",
                    "is_alpha": False,
                    "is_stop": False,
                    "is_punct": False,
                    "lemma": "",
                    "upos": "",
                    "xpos": "",
                    "feats": "",
                    "head": "",
                    "deprel": "",
                    "deps": "",
                    "misc": "",
                }
            )

    # Rows are returned in processing order. The previous sort_values() call
    # was discarded by an immediate reassignment and raised KeyError when
    # there were no rows at all.
    return pd.DataFrame(rows)

## Generate csv templates

In [67]:
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

# Language code -> DataFrame of token rows produced by stimuli2csv().
preproc = defaultdict(dict)
for lang_code, data in tqdm(all_data.items()):
    # Only process the languages requested through LANGS.
    if lang_code in LANGS:
        preproc[lang_code] = stimuli2csv(data, lang_code, small=False)

  0%|          | 0/1 [00:00<?, ?it/s]

Deleting model for 
Loading model for sq
Loading model blank model for sq


100%|██████████| 1/1 [00:00<00:00,  2.94it/s]


## Save

In [68]:
import os
from tqdm import tqdm

# Write one CSV per stimulus into <OUT_DIR>/<language code>/.
for lang_code, df in tqdm(preproc.items()):
    # The code 'zd' is written out as 'gsw'.
    lang_out = lang_code if lang_code != 'zd' else 'gsw'
    out_dir = os.path.join(OUT_DIR, lang_out)
    os.makedirs(out_dir, exist_ok=True)

    for stim_name, group in df.groupby('stimulus_name'):
        out_fis = os.path.join(out_dir, f"{stim_name}.csv")
        # Work on a copy so the frame stored in `preproc` is left untouched.
        g = group.assign(language_code=lang_out)
        g.to_csv(out_fis, index=False)
        print(out_fis)

100%|██████████| 1/1 [00:00<00:00, 15.85it/s]

data/dump/sq/Arg_PISACowsMilk.csv
data/dump/sq/Arg_PISARapaNui.csv
data/dump/sq/Enc_WikiMoon.csv
data/dump/sq/Ins_HumanRights.csv
data/dump/sq/Ins_LearningMobility.csv
data/dump/sq/Lit_Alchemist.csv
data/dump/sq/Lit_BrokenApril.csv
data/dump/sq/Lit_MagicMountain.csv
data/dump/sq/Lit_NorthWind.csv
data/dump/sq/Lit_Solaris.csv
data/dump/sq/PopSci_Caveman.csv
data/dump/sq/PopSci_MultiplEYE.csv





In [69]:
import shutil
from google.colab import files

# Zip each processed language's output folder and offer it for download
# (Colab only). The folder is derived from OUT_DIR and the languages actually
# processed instead of a hard-coded "/content/data/dump/sq"; with the default
# settings on Colab (working directory /content) this yields the same
# /content/sq_dump.zip as before.
for lang_code in preproc:
    archive_path = shutil.make_archive(
        f"/content/{lang_code}_dump", "zip", os.path.join(OUT_DIR, lang_code)
    )
    files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [70]:
import os
# Export the bare token column of every requested language,
# one token per line, without index or header.
os.makedirs("/content", exist_ok=True)

for lang_code, df in preproc.items():
    if lang_code in LANGS:
        out_path = f"/content/tokens_{lang_code}.csv"
        df["token"].to_csv(out_path, index=False, header=False)
        print(out_path)

/content/tokens_sq.csv
