In [70]:
import os

# Run parameters, overridable through environment variables:
#   LANGS   - comma-separated language codes (blank entries are dropped)
#   OUT_DIR - directory that receives the generated csv files
LANGS = list(filter(None, (code.strip() for code in os.getenv("LANGS", "yue").split(","))))
OUT_DIR = os.getenv("OUT_DIR", "data/dump").strip()

print("params:", LANGS, OUT_DIR)
# Create the output directory up front so later cells can write into it.
os.makedirs(OUT_DIR, exist_ok=True)

params: ['yue'] data/dump


## Get spaCy models

In [71]:
!python -m spacy download xx_sent_ud_sm

Collecting xx-sent-ud-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_sent_ud_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
use blank


## Get the MultiplEYE json data

In [None]:
! rm -rf languages*
! wget https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
! unzip languages_json_all.zip

In [76]:
# Language codes for which load_spacy_model() loads a "<code>_core_<genre>_<size>" pipeline.
SPACY_LANGUAGES = [
    "ca", "de", "el", "en", "es", "fr", "hr", "it", "lt", "mk",
    "nl", "pl", "pt", "ro", "ru", "sl", "sv", "uk", "zh",
]

# Language code -> human-readable language name written to the csv "language" column.
CODE2LANG = {"yue": "Cantonese"}

# Codes this notebook knows how to process (the keys of CODE2LANG).
LANGUAGES = list(CODE2LANG)

## Load data

In [77]:
import os
import json
import spacy

def load_all_json(lang_folder):
    """Load the per-language stimulus JSON files found in ``lang_folder``.

    The language code is derived from the file name by removing ``.json`` and
    the ``multipleye_stimuli_experiment_`` prefix; the code ``zd`` is renamed
    to ``gsw``. Only codes present in both ``LANGUAGES`` and ``LANGS`` are
    loaded.

    Returns:
        dict mapping language code -> parsed JSON content of that file.
    """
    loaded = {}
    json_names = (name for name in os.listdir(lang_folder) if name.endswith('.json'))
    for name in json_names:
        code = name.replace('.json', '').replace('multipleye_stimuli_experiment_', '')
        code = 'gsw' if code == 'zd' else code
        # Keep only languages that are both known and requested for this run.
        if code in LANGUAGES and code in LANGS:
            path = os.path.join(lang_folder, name)
            with open(path, 'r', encoding='utf-8') as handle:
                loaded[code] = json.load(handle)
    return loaded

In [78]:
# Load every requested language and preview the first stimulus of each.
all_data = load_all_json('languages_json')
for lang_code, stimuli in all_data.items():
    print(lang_code, stimuli[0])

yue {'stimulus_id': 1, 'stimulus_name': 'PopSci_MultiplEYE', 'stimulus_type': 'experiment', 'pages': ['MultiplEYE项目\n\n「MultiplEYE」這個名字玩了一些文字遊戲，巧妙結合了「多語言性（multilingualism / multiple languages）」與「眼動追蹤（eye-tracking)」中的「眼睛（eye）」。MultiplEYE 是由歐盟資助的 COST 行動項目。COST行動項目是由COST（歐洲科學與技術合作組織）專門支持的研究網絡。作為資助機構，COST為歐洲及其他地區的研究人員提供財務支持，以推動不同形式的合作活動。', 'MultiplEYE 行動的正式項目名稱為：促進多語言眼動數據收集，以支持人類與機器語言處理研究。簡言之，MultiplEYE 致力於建立一個跨學科研究網絡，推動各語言研究小組合作，收集閱讀過程中的眼動數據。', '其目標是建立一個大型的多語言眼動語料庫，並透過各學科（如語言學、心理學、言語及語言治療、計算機科學等）之間的知識交流，助力研究員收集數據。這些數據既可以用於從心裡語言學角度分析人類語言處理，也可以用於從機器學習角度，評估與提高計算機語言處理能力。', '那麼，什麼是「眼動追蹤」？\n眼動追蹤（eye-tracking）是一个測量眼睛注視點（也就是你在看哪裡），以及眼睛在注視點間如何移動的過程。用來測量眼睛註視位置和移動軌跡的裝置被稱為「眼動儀」。它包含一個紅外線攝像頭，以不對人眼造成幹擾及傷害的光頻率進行追蹤。', '藉助圖像識別算法，眼動儀能根據被試頭和眼睛的位置、參與者與屏幕的距離，以及眼動儀本身的位置，精準預測眼睛的註視點。眼動追蹤技術可應用於多個領域。比如，它可以幫助檢測疲勞駕駛、還可以為醫療領域診斷與訓練的應用提供支持。它還可以用於遊戲設計、市場營銷、以及人機互動等。', '為什麼我們的項目特別關註閱讀時的眼動追蹤？\n當你閱讀這段文字時，眼動儀會追蹤你的眼睛在文字上的移動。這也提供了相關的信息，包括你花了多長時間來看這段文本，在每個詞上花多長時間，跳過了哪些詞，專注於哪些詞，以及是否需要回頭重新閱讀某些部分，以更好地理解文本。', '當你的大腦在處理文本內容時，你的眼球運動就

## Prepare spaCy code to generate template csv files

In [79]:
# Folder holding the extracted stimulus JSON files.
# NOTE(review): LANG_FOLDER and IN_DIR are not referenced by any cell visible here — confirm before removing.
LANG_FOLDER = "languages_json"
# Single-slot cache used by get_nlp(): the currently loaded pipeline ...
NLP_MODEL = None
# ... and the language code it was loaded for ('' means nothing loaded yet).
CURRENT_LANG = ''
IN_DIR = 'languages_json/'

from spacy.util import get_lang_class


def exists_spacy_blank(lang_code):
    """Return True if spaCy can resolve a language class for ``lang_code``.

    Used to decide whether ``spacy.blank(lang_code)`` is an option. Any
    ordinary exception raised by ``get_lang_class`` is treated as "not
    available".
    """
    try:
        get_lang_class(lang_code)
    except Exception:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as "no model".
        return False
    return True

def load_spacy_model(lang_code, small=True):
    """Load a spaCy pipeline suitable for ``lang_code``.

    Selection order:
      1. codes in ``SPACY_LANGUAGES``: ``<code>_core_<genre>_<sm|lg>`` where
         genre is ``web`` for zh/en and ``news`` otherwise;
      2. ``rm``: the Italian large model with most components disabled;
      3. ``gsw``: the German large model;
      4. any code spaCy has a language class for: a blank pipeline;
      5. otherwise the multilingual ``xx_sent_ud_sm`` model.

    Args:
        lang_code: language code to load a pipeline for.
        small: pick the ``sm`` (True) or ``lg`` (False) variant in case 1.

    Returns:
        The loaded pipeline. Every branch except ``gsw`` (which keeps its
        parser enabled) gets a ``sentencizer`` appended.
    """
    model = None
    if lang_code in SPACY_LANGUAGES:
        genre = 'news'
        if lang_code in {'zh', 'en'}:
            genre = 'web'
        # NOTE(review): only reachable if 'rm' is ever added to SPACY_LANGUAGES;
        # kept as-is to preserve behaviour.
        if lang_code == 'rm':
            return ''
        model_name = f'{lang_code}_core_{genre}_{"sm" if small else "lg"}'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    elif lang_code == "rm":
        model = spacy.load("it_core_news_lg")
        # keep 'morphologizer' ?
        model.disable_pipes('tok2vec', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler', 'ner')
        # The parser is disabled above, so nothing would set sentence
        # boundaries and iterating doc.sents downstream would fail; add the
        # rule-based sentencizer like the other branches do.
        model.add_pipe("sentencizer")
    elif lang_code == 'gsw':
        model = spacy.load('de_core_news_lg')
    elif exists_spacy_blank(lang_code):
        print(f"Loading model blank model for {lang_code}")
        model = spacy.blank(lang_code)
        model.add_pipe("sentencizer")
    else:
        model_name = 'xx_sent_ud_sm'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    return model


def get_nlp(lang_code, small=False):
    """Return the pipeline for ``lang_code``, keeping only one model loaded.

    The module-level ``NLP_MODEL`` / ``CURRENT_LANG`` pair is a single-slot
    cache: asking for a different language releases the previous model
    before the new one is loaded, to avoid holding all models at once.

    Args:
        lang_code: language code to get a pipeline for.
        small: forwarded to ``load_spacy_model``.

    Returns:
        The cached or freshly loaded pipeline.
    """
    global NLP_MODEL, CURRENT_LANG
    if lang_code != CURRENT_LANG:
        if globals().get("NLP_MODEL") is None:
            print("No model to delete")
        else:
            print(f"Deleting model for {CURRENT_LANG}")
        # Release the old model by rebinding instead of `del`, and reset the
        # language marker too: if the load below raises, the cache stays
        # consistent (empty) rather than leaving CURRENT_LANG pointing at a
        # model whose global name no longer exists.
        NLP_MODEL = None
        CURRENT_LANG = ''
        print(f"Loading model for {lang_code}")
        NLP_MODEL = load_spacy_model(lang_code, small=small)
        CURRENT_LANG = lang_code
    return NLP_MODEL

In [None]:
!unzip /content/multipleye_stimuli_yu_segmentation_pages.zip

In [81]:
# preprocess custom tokens
# format should be dict[sname] = [str1, str2, str3, ...] where each index is a different page

import re
from pathlib import Path

root = Path("/content/multipleye_stimuli_yu_segmentation_pages")

def read_text_safely(fp):
    """Read ``fp`` as text, trying several encodings in turn.

    Args:
        fp: object exposing ``read_bytes()`` (e.g. a ``pathlib.Path``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is stripped.
    """
    b = fp.read_bytes()
    # "utf-8-sig" decodes plain UTF-8 as well and additionally drops a leading
    # BOM. Trying plain "utf-8" first would always win on BOM-prefixed files
    # and leave U+FEFF glued to the first line, which then no longer matches
    # the page-header pattern.
    for enc in ("utf-8-sig", "cp1252"):
        try:
            return b.decode(enc)
        except UnicodeDecodeError:
            pass
    # latin-1 maps every byte to a code point, so this cannot fail.
    return b.decode("latin-1")

def key_name(p):
    """Derive the dictionary key for a segmentation file.

    The key is the first two underscore-separated parts of the file stem,
    or the whole stem when it contains no underscore.
    """
    stem = p.stem
    pieces = stem.split("_")
    if len(pieces) < 2:
        return stem
    return "_".join(pieces[:2])

# Matches a page delimiter line such as "=== PAGE 3 ===" (case-insensitive).
page_header = re.compile(
    r"^===\s*PAGE\s*\d+\s*===\s*$",
    flags=re.IGNORECASE | re.MULTILINE,
)


def clean_page_text(block):
    """Flatten one page block into a single whitespace-normalised string.

    Page-header lines are dropped, remaining lines are joined with spaces,
    every "***" marker becomes a space, and whitespace runs collapse to one
    space.
    """
    kept = []
    for line in block.splitlines():
        if page_header.match(line.strip()):
            continue
        kept.append(line)
    joined = " ".join(kept).replace("***", " ")
    return re.sub(r"\s+", " ", joined).strip()

# Maps key_name(file) -> list of cleaned page strings, one entry per page.
custom_tokens = {}

# Prefer the "*_seg_pages.txt" files; fall back to any .txt under root.
files = list(root.rglob("*_seg_pages.txt"))
if not files:
    files = list(root.rglob("*.txt"))

for fp in sorted(files):
    # Skip hidden files.
    if fp.name.startswith("."):
        continue

    # NOTE: two files that share a key overwrite each other; the last one in sorted order wins.
    k = key_name(fp)
    # Normalise all line endings to "\n".
    raw = read_text_safely(fp).replace("\r\n", "\n").replace("\r", "\n")

    # If the file does not open with a page header, treat its leading text as page 1.
    first_nonempty = next((ln for ln in raw.splitlines() if ln.strip()), "")
    if not page_header.match(first_nonempty.strip()):
        raw = "=== PAGE 1 ===\n" + raw

    # Split on header lines. Blank blocks are dropped, so an empty page in the
    # file shifts the list index of every page after it.
    blocks = [b for b in re.split(page_header, raw) if b.strip()]
    pages = [clean_page_text(b) for b in blocks]

    custom_tokens[k] = pages

In [94]:
# for item, value in custom_tokens.items():
#     for page in value:
#         print(item, page)

In [95]:
from spacy.tokens import Doc
# outputs content from custom_tokens dict by pages
class CustomTokenizer:
    """Tokenizer that replays pre-segmented pages instead of tokenizing text.

    Each call returns the next page of the currently selected key, split on
    whitespace. The ``text`` passed to the call is ignored, so calls must be
    made in page order after ``set_key``.
    """

    def __init__(self, vocab, pages_by_key):
        """Store the vocab and the ``key -> [page string, ...]`` mapping."""
        self.vocab = vocab
        self.pages_by_key = pages_by_key
        self.key = None
        self.page_idx = 0

    def set_key(self, key):
        """Select which page list to replay; a new key restarts at page 0."""
        if key != self.key:
            self.key = key
            self.page_idx = 0

    def __call__(self, text):
        """Return a Doc built from the next stored page of the current key.

        When the key is unknown or its pages are exhausted, an empty Doc is
        returned and a warning is emitted.
        """
        import warnings

        pages = self.pages_by_key.get(self.key, [])
        if self.page_idx < len(pages):
            words = pages[self.page_idx].split()
        else:
            # Still return an empty Doc (best effort), but make the mismatch
            # visible: otherwise the page silently contributes no rows.
            warnings.warn(
                f"No custom tokens for key {self.key!r}, page index {self.page_idx} "
                f"({len(pages)} page(s) available); returning an empty Doc",
                stacklevel=2,
            )
            words = []
        self.page_idx += 1
        # Every token is followed by a space except the last one.
        spaces = [True] * len(words)
        if spaces:
            spaces[-1] = False
        return Doc(self.vocab, words=words, spaces=spaces)

In [99]:
def feats_str(token):
    """Render a token's morphological features as a CoNLL-U-style string.

    Features are sorted by name and joined as ``Name=Value|Name=Value``;
    list/tuple values are comma-joined. Returns "_" when there are none.
    """
    morph = token.morph
    feats = morph.to_dict() if morph else None
    if not feats:
        return "_"

    def _render(value):
        if isinstance(value, (list, tuple)):
            return ",".join(value)
        return f"{value}"

    pairs = [f"{name}={_render(feats[name])}" for name in sorted(feats)]
    return "|".join(pairs) if pairs else "_"


def get_head(token, sent):
    """Return ``(head, deprel)`` for ``token`` relative to its sentence.

    A token that is its own head, or whose dependency label is "ROOT", maps
    to ``(0, "root")``. Otherwise ``head`` is the 1-based position of the
    head token inside ``sent`` and ``deprel`` is the lower-cased dependency
    label ("_" when the label is empty).
    """
    is_root = token.head == token or token.dep_ == "ROOT"
    if is_root:
        return 0, "root"
    offset = token.head.i - sent.start
    label = token.dep_.lower() if token.dep_ else "_"
    return offset + 1, label


def get_misc(token, include_ner=True):
    """Build the MISC column value for a token.

    Args:
        token: object exposing ``whitespace_``, ``ent_iob_`` and ``ent_type_``.
        include_ner: when True, add ``NER=<iob>-<type>`` for entity tokens.

    Returns:
        "|"-joined parts (``SpaceAfter=No`` and/or the NER tag), or "_" when
        there is nothing to report.
    """
    misc_parts = []
    if not token.whitespace_:
        misc_parts.append("SpaceAfter=No")
    # Only "B"/"I" mark a token as part of an entity. Comparing against "O"
    # alone also let through the empty string that is reported when no entity
    # annotation was set at all (pipelines without NER), which produced a
    # meaningless "NER=-" on every token.
    if include_ner and token.ent_iob_ in ("B", "I"):
        misc_parts.append(f"NER={token.ent_iob_}-{token.ent_type_}")
    return "|".join(misc_parts) if misc_parts else "_"


def iter_pages(stimuli, nlp, set_key=None):
    """Lazily run ``nlp`` over every page of every stimulus.

    Before the pages of a stimulus are processed, ``set_key`` (if given) is
    called with the stimulus name.

    Yields:
        ``(stimulus_id, stimulus_name, page_number, nlp(page_text))`` with
        page numbers starting at 1.
    """
    for stimulus in stimuli:
        stim_id = stimulus["stimulus_id"]
        stim_name = stimulus["stimulus_name"]
        if set_key:
            set_key(stim_name)

        page_no = 0
        for text in stimulus["pages"]:
            page_no += 1
            yield stim_id, stim_name, page_no, nlp(text)

def stimuli2csv(stimuli, lang_code, level="page", small=False):
    """Turn the stimuli of one language into a token-level DataFrame.

    Every page is run through the language's pipeline; each token becomes a
    row, and an "<eos>" marker row is appended after every sentence. Rows
    keep their processing order (stimulus, page, sentence, token).

    Args:
        stimuli: list of dicts with "stimulus_id", "stimulus_name", "pages".
        lang_code: language code; must be a key of ``CODE2LANG``.
        level: currently unused; kept for interface compatibility.
        small: forwarded to ``get_nlp``.

    Returns:
        pandas DataFrame with the columns listed in ``columns`` below. The
        columns are present even when no rows were produced.
    """
    # Local import so the function does not depend on a later cell having
    # imported pandas first.
    import pandas as pd

    columns = [
        "language", "language_code", "stimulus_name", "page", "token",
        "is_alpha", "is_stop", "is_punct", "lemma", "upos", "xpos",
        "feats", "head", "deprel", "deps", "misc",
    ]
    nlp = get_nlp(lang_code, small=small)

    if lang_code == "yue":
        # Cantonese uses the pre-segmented pages instead of spaCy's tokenizer.
        # Note this replaces the tokenizer on the cached pipeline object.
        yue_tok = CustomTokenizer(nlp.vocab, custom_tokens)
        nlp.tokenizer = yue_tok
        set_key = yue_tok.set_key
    else:
        set_key = None

    rows = []
    for _sid, sname, page, doc in iter_pages(stimuli, nlp, set_key=set_key):
        for sentence in doc.sents:
            # Fields shared by every row of this sentence.
            shared = {
                "language": CODE2LANG[lang_code],
                "language_code": lang_code,
                "stimulus_name": sname,
                "page": page,
            }
            for token in sentence:
                head, deprel = get_head(token, sentence)
                rows.append(
                    {
                        **shared,
                        "token": token.text,
                        "is_alpha": token.is_alpha,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "lemma": token.lemma_,
                        "upos": token.pos_,
                        "xpos": token.tag_,
                        "feats": feats_str(token),
                        "head": head,
                        "deprel": deprel,
                        "deps": "_",
                        "misc": get_misc(token, include_ner=True),
                    }
                )
            # Sentence boundary marker row.
            rows.append(
                {
                    **shared,
                    "token": "<eos>",
                    "is_alpha": False,
                    "is_stop": False,
                    "is_punct": False,
                    "lemma": "",
                    "upos": "",
                    "xpos": "",
                    "feats": "",
                    "head": "",
                    "deprel": "",
                    "deps": "",
                    "misc": "",
                }
            )

    # The former sort_values() call was dead code (its result was overwritten
    # on the next line) and raised KeyError when `rows` was empty. Explicit
    # columns keep the frame usable (e.g. for groupby) even with zero rows.
    return pd.DataFrame(rows, columns=columns)

## Generate csv templates

In [100]:
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

# language code -> token-level DataFrame produced by stimuli2csv().
preproc = defaultdict(dict)
for lang_code, data in tqdm(all_data.items()):
    # Only process the languages requested for this run.
    if lang_code in LANGS:
        preproc[lang_code] = stimuli2csv(data, lang_code, small=False)

100%|██████████| 1/1 [00:00<00:00,  4.42it/s]


## Save

In [101]:
import os
from tqdm import tqdm

# Write one csv per stimulus into OUT_DIR/<language code>/.
for lang_code, df in tqdm(preproc.items()):
    lang_out = lang_code if lang_code != 'zd' else 'gsw'
    out_dir = os.path.join(OUT_DIR, lang_out)
    os.makedirs(out_dir, exist_ok=True)

    for stim_name, group in df.groupby('stimulus_name'):
        out_fis = os.path.join(out_dir, f"{stim_name}.csv")
        # assign() works on a copy, so the grouped frame itself is untouched.
        group.assign(language_code=lang_out).to_csv(out_fis, index=False)
        print(out_fis)

100%|██████████| 1/1 [00:00<00:00, 18.83it/s]

data/dump/yue/Arg_PISACowsMilk.csv
data/dump/yue/Arg_PISARapaNui.csv
data/dump/yue/Enc_WikiMoon.csv
data/dump/yue/Ins_HumanRights.csv
data/dump/yue/Ins_LearningMobility.csv
data/dump/yue/Lit_Alchemist.csv
data/dump/yue/Lit_BrokenApril.csv
data/dump/yue/Lit_MagicMountain.csv
data/dump/yue/Lit_Solaris.csv
data/dump/yue/PopSci_Caveman.csv
data/dump/yue/PopSci_MultiplEYE.csv





In [102]:
# Archive the dump directory and offer it as a browser download (Colab only).
# NOTE(review): the path is hard-coded to /content/data/dump rather than derived
# from OUT_DIR, and the whole tree is zipped, including output left by earlier runs.
!zip -r /content/data_dump.zip /content/data/dump
from google.colab import files
files.download('/content/data_dump.zip')

updating: content/data/dump/ (stored 0%)
updating: content/data/dump/zh/ (stored 0%)
updating: content/data/dump/zh/Arg_PISACowsMilk.csv (deflated 87%)
updating: content/data/dump/zh/Lit_MagicMountain.csv (deflated 87%)
updating: content/data/dump/zh/Lit_BrokenApril.csv (deflated 87%)
updating: content/data/dump/zh/Lit_Alchemist.csv (deflated 86%)
updating: content/data/dump/zh/Ins_HumanRights.csv (deflated 86%)
updating: content/data/dump/zh/Enc_WikiMoon.csv (deflated 82%)
updating: content/data/dump/zh/PopSci_Caveman.csv (deflated 86%)
updating: content/data/dump/zh/Lit_Solaris.csv (deflated 87%)
updating: content/data/dump/zh/PopSci_MultiplEYE.csv (deflated 91%)
updating: content/data/dump/zh/Lit_NorthWind.csv (deflated 82%)
updating: content/data/dump/zh/Ins_LearningMobility.csv (deflated 87%)
updating: content/data/dump/zh/Arg_PISARapaNui.csv (deflated 87%)
updating: content/data/dump/en/ (stored 0%)
updating: content/data/dump/en/Arg_PISACowsMilk.csv (deflated 89%)
updating: cont

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>