In [None]:
import os

LANGS = [x.strip() for x in os.getenv("LANGS", "en,zh").split(",") if x.strip()]
OUT_DIR = os.getenv("OUT_DIR", "data/dump").strip()

print("params:", LANGS, OUT_DIR)
os.makedirs(OUT_DIR, exist_ok=True)

params: ['en', 'zh'] data/dump


## Get spacy models

In [None]:
!python -m spacy download xx_sent_ud_sm

language_models = {
    "en": "en_core_web_lg",
    "zh": "zh_core_web_lg",
}

for lm in LANGS:
    if lm in language_models:
        print(f"downloading {lm}: {language_models[lm]}")
        !python -m spacy download {language_models[lm]}
    else:
        print(f"use blank")

## Get the MultiplEYE json data

In [None]:
! rm -rf languages*
! wget https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
! unzip languages_json_all.zip

In [None]:
SPACY_LANGUAGES = ["ca", "de", "el", "en", "es", "fr", "hr", "it", "lt", "mk", "nl", "pl", "pt", "ro", "ru", "sl", "sv", "uk", "zh"]

CODE2LANG = {
    "ar": "Arabic",
    "ca": "Catalan",
    "cs": "Czech",
    "de": "German",
    "gsw": "Swiss German",
    "el": "Greek",
    "en": "English",
    #"es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    #"fr": "French",
    #"he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "it": "Italian",
    "kl": "Kalaallisut",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "rm": "Romansh",
    "ro": "Romanian",
    "ru": "Russian",
    "sl": "Slovenian",
    "sq": "Albanian",
    "sv": "Swedish",
    "tr": "Turkish",
    "uk": "Ukrainian",
    #"yue": "Cantonese",
    "zh": "Chinese"
}

LANGUAGES = list(CODE2LANG.keys())

## Load data

In [None]:
import os
import json
import spacy

def load_all_json(lang_folder):
    all_data = {}
    for file in os.listdir(lang_folder):
        if file.endswith('.json'):
            lang_code = file.replace('.json', '').replace('multipleye_stimuli_experiment_', '')
            if lang_code == 'zd':
                lang_code = 'gsw'
            if (lang_code not in LANGUAGES) or (lang_code not in LANGS):
                continue
            with open(os.path.join(lang_folder, file), 'r', encoding='utf-8') as f:
                all_data[lang_code] = json.load(f)
    return all_data

In [None]:
all_data = load_all_json('languages_json')
for k,v in all_data.items():
  print(k, v[0])

zh {'stimulus_id': 1, 'stimulus_name': 'PopSci_MultiplEYE', 'stimulus_type': 'experiment', 'pages': ['MultiplEYE项目\n\n“MultiplEYE”这个名字玩了一些文字游戏，把“multilingualism”（多语言）或者“multiple languages”（多种语言）和“eye-tracking”（眼动追踪）中的“eye”（眼睛）巧妙地结合在一起。MultiplEYE是由欧盟资助的COST行动项目。COST行动项目是欧洲科学与技术合作组织（简称COST）支持的研究网络。作为资助机构，COST在欧洲和其他地方提供财务援助，用于扩大研究人员网络，及支持我们的研究人员进行各种合作活动。', '这些活动包括工作组会议、培训年轻研究人员学习新技能的培训学校以及科学研究访问。MultiplEYE COST行动的项目名称是：为人类和机器语言处理研究收集多语言眼动追踪数据收集。这意味着MultiplEYE COST行动旨在建立一个跨学科研究小组网络，收集多语言阅读的眼动追踪数据。', '行动的目标是支持大型多语言眼动追踪语料库的开发，并使研究人员能够在语言学、心理学、言语病理学以及计算机科学等各个领域共享知识，由此来收集数据。这些数据可以在之后用于从心理语言学的角度研究人类语言处理，也可以有助于从机器学习角度改进和评估计算机的语言处理能力。', '那么，什么是“眼动追踪”呢？\n简单说，眼动追踪就是测量人们看向哪里，眼睛是如何在不同点之间移动的过程。我们用一种叫做“眼动追踪仪”的设备来测量眼睛的位置和运动。“眼动追踪仪”有一个红外线摄像头，但是使用的光频率对眼睛是无害的。', '借助图像识别算法，“眼动追踪仪”可以通过了解头部和眼睛的位置、参与者所看屏幕的距离以及眼动追踪仪的位置很准确地估算我们的凝视点。眼动追踪技术有许多用途。比如，它可以帮助检测疲劳驾驶，支持医疗领域以筛查和培训为目的的应用。它也可以用于游戏、市场营销以及人机交互。', '为什么我们的项目特别关注阅读时的眼动追踪呢？\n当您阅读这些文字时，眼动追踪仪会跟随您的眼睛在文本上移动。这提供了一些信息，比如您花了多长时间来看这段文字，或者更具体地说，您每个词花了多长时间，您跳过了哪些词，您专注在哪些词上

## Prepare spaCy code to generate template csv files

In [None]:
LANG_FOLDER = "languages_json"
NLP_MODEL = None
CURRENT_LANG = ''
IN_DIR = 'languages_json/'

from spacy.util import get_lang_class


def exists_spacy_blank(lang_code):
    try:
        get_lang_class(lang_code)
        return True
    except:
        return False

def load_spacy_model(lang_code, small=True):
    model = None
    if lang_code in SPACY_LANGUAGES:
        genre = 'news'
        if lang_code in {'zh', 'en'}:
            genre = 'web'
        if lang_code == 'rm':
            return ''
        model_name = f'{lang_code}_core_{genre}_{"sm" if small else "lg"}'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    elif lang_code == "rm":
        model = spacy.load("it_core_news_lg")
        # keep 'morphologizer' ?
        model.disable_pipes('tok2vec', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler', 'ner')
    elif lang_code == 'gsw':
        model = spacy.load('de_core_news_lg')
    elif exists_spacy_blank(lang_code):
        print(f"Loading model blank model for {lang_code}")
        model = spacy.blank(lang_code)
        model.add_pipe("sentencizer")
    else:
        model_name = f'xx_sent_ud_sm'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    return model


def get_nlp(lang_code, small=False):
    """To avoid loading all models at the same time
    """
    global NLP_MODEL, CURRENT_LANG
    if lang_code != CURRENT_LANG:
        try:
            print(f"Deleting model for {CURRENT_LANG}")
            del NLP_MODEL
        except:
            print("No model to delete")
        print(f"Loading model for {lang_code}")
        NLP_MODEL = load_spacy_model(lang_code, small=small)
        CURRENT_LANG = lang_code
    return NLP_MODEL

In [None]:
!unzip /content/multipleye_stimuli_zh_segmentation_pages.zip

In [None]:
# preprocess custom tokens
# format should be dict[sname] = [str1, str2, str3, ...] where each index is a different page

import re
from pathlib import Path

root = Path("/content/multipleye_stimuli_zh_segmentation_pages")

def read_text_safely(fp):
    b = fp.read_bytes()
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        try:
            return b.decode(enc)
        except UnicodeDecodeError:
            pass
    return b.decode("utf-8", errors="replace")

def key_name(p):
    parts = p.stem.split("_")
    return "_".join(parts[:2]) if len(parts) >= 2 else p.stem

page_header = re.compile(r"^===\s*PAGE\s*\d+\s*===\s*$",
                         re.IGNORECASE | re.MULTILINE)

def clean_page_text(block):
    lines = [ln for ln in block.splitlines() if not page_header.match(ln.strip())]
    txt = " ".join(lines)
    txt = txt.replace("***", " ")
    txt = re.sub(r"\s+", " ", txt).strip()
    return txt

custom_tokens = {}

files = list(root.rglob("*_seg_pages.txt"))
if not files:
    files = list(root.rglob("*.txt"))

for fp in sorted(files):
    if fp.name.startswith("."):
        continue

    k = key_name(fp)
    raw = read_text_safely(fp).replace("\r\n", "\n").replace("\r", "\n")

    first_nonempty = next((ln for ln in raw.splitlines() if ln.strip()), "")
    if not page_header.match(first_nonempty.strip()):
        raw = "=== PAGE 1 ===\n" + raw

    blocks = [b for b in re.split(page_header, raw) if b.strip()]
    pages = [clean_page_text(b) for b in blocks]

    custom_tokens[k] = pages

In [None]:
# for item, value in custom_tokens.items():
#     for page in value:
#         print(item, page)

In [None]:
from spacy.tokens import Doc
# outputs content from custom_tokens dict by pages
class CustomTokenizer:
    def __init__(self, vocab, pages_by_key):
        self.vocab = vocab
        self.pages_by_key = pages_by_key
        self.key = None
        self.page_idx = 0

    def set_key(self, key):
        if key != self.key:
            self.key = key
            self.page_idx = 0

    def __call__(self, text):
        pages = self.pages_by_key.get(self.key, [])
        words = pages[self.page_idx].split() if self.page_idx < len(pages) else []
        self.page_idx += 1
        spaces = [True] * len(words)
        if spaces: spaces[-1] = False
        return Doc(self.vocab, words=words, spaces=spaces)

In [50]:
def feats_str(token):
    if not token.morph:
        return "_"
    md = token.morph.to_dict()
    if not md:
        return "_"
    bits = []
    for k in sorted(md):
        v = md[k]
        if isinstance(v, (list, tuple)):
            bits.append(f"{k}={','.join(v)}")
        else:
            bits.append(f"{k}={v}")
    return "|".join(bits) if bits else "_"


def get_head(token, sent):
    if token.head == token or token.dep_ == "ROOT":
        head = 0
        deprel = "root"
    else:
        head = (token.head.i - sent.start) + 1  # 1-based in sentence
        deprel = token.dep_.lower() if token.dep_ else "_"
    return head, deprel


def get_misc(token, include_ner=True):
    misc_parts = []
    if not token.whitespace_:
        misc_parts.append("SpaceAfter=No")
    if include_ner and token.ent_iob_ != "O":
        misc_parts.append(f"NER={token.ent_iob_}-{token.ent_type_}")
    misc = "|".join(misc_parts) if misc_parts else "_"
    return misc


def iter_pages(stimuli, nlp, set_key=None):
    for stim in stimuli:
        sid, sname = stim["stimulus_id"], stim["stimulus_name"]
        if set_key: set_key(sname)

        for pnum, page_text in enumerate(stim["pages"], start=1):
            yield sid, sname, pnum, nlp(page_text)

def stimuli2csv(stimuli, lang_code, level="page", small=False):
    rows = []
    nlp = get_nlp(lang_code, small=small)

    if lang_code == "zh":
        zh_tok = CustomTokenizer(nlp.vocab, custom_tokens)
        nlp.tokenizer = zh_tok
        set_key = zh_tok.set_key
    else:
        set_key = None

    for sid, sname, page, doc in iter_pages(stimuli, nlp, set_key=set_key):
        for sent_idx, sentence in enumerate(doc.sents):
            eos = {
              "language": CODE2LANG[lang_code],
              "language_code": lang_code,
              "stimulus_name": sname,
              "page": page,
              #"sent_idx": sent_idx+1,
              "token": "<eos>",
              "is_alpha": False,
              "is_stop": False,
              "is_punct": False,
              "lemma": "",
              "upos": "",
              "xpos": "",
              "feats": "",
              "head": "",
              "deprel": "",
              "deps": "",
              "misc": ""
              }
            for token in sentence:
                head, deprel = get_head(token, sentence)
                rows.append(
                    {
                        #"stimulus_id": sid,
                        "language": CODE2LANG[lang_code],
                        "language_code": lang_code,
                        "stimulus_name": sname,
                        "page": page,
                        #"sent_idx": sent_idx+1,
                        "token": token.text,
                        "is_alpha": token.is_alpha,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "lemma": token.lemma_,
                        "upos": token.pos_,
                        "xpos": token.tag_,
                        "feats": feats_str(token),
                        "head": head,
                        "deprel": deprel,
                        "deps": "_",
                        "misc": get_misc(token, include_ner=True)
                    }
                )
            rows.append(eos)


    df = pd.DataFrame(rows).sort_values(by=["stimulus_name", "page"])
    df = pd.DataFrame(rows)
    return df

## Generate csv templates

In [51]:
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

preproc = defaultdict(dict)
for lang_code, data in tqdm(all_data.items()):
    if lang_code not in LANGS:
        continue
    preproc[lang_code] = stimuli2csv(data, lang_code, small=False)

 50%|█████     | 1/2 [00:01<00:01,  1.84s/it]

Deleting model for zh
Loading model for en
Loading model en_core_web_lg for en


100%|██████████| 2/2 [00:06<00:00,  3.41s/it]


## Save

In [52]:
import os
from tqdm import tqdm

for lang_code, df in tqdm(preproc.items()):
    lang_out = 'gsw' if lang_code == 'zd' else lang_code
    out_dir = os.path.join(OUT_DIR, lang_out)
    os.makedirs(out_dir, exist_ok=True)

    for stim_name, group in df.groupby('stimulus_name'):
        out_fis = os.path.join(out_dir, f"{stim_name}.csv")
        g = group.copy()
        g['language_code'] = lang_out
        g.to_csv(out_fis, index=False)
        print(out_fis)

100%|██████████| 2/2 [00:00<00:00, 14.92it/s]

data/dump/zh/Arg_PISACowsMilk.csv
data/dump/zh/Arg_PISARapaNui.csv
data/dump/zh/Enc_WikiMoon.csv
data/dump/zh/Ins_HumanRights.csv
data/dump/zh/Ins_LearningMobility.csv
data/dump/zh/Lit_Alchemist.csv
data/dump/zh/Lit_BrokenApril.csv
data/dump/zh/Lit_MagicMountain.csv
data/dump/zh/Lit_NorthWind.csv
data/dump/zh/Lit_Solaris.csv
data/dump/zh/PopSci_Caveman.csv
data/dump/en/Arg_PISACowsMilk.csv
data/dump/en/Arg_PISARapaNui.csv
data/dump/en/Enc_WikiMoon.csv
data/dump/en/Ins_HumanRights.csv
data/dump/en/Ins_LearningMobility.csv
data/dump/en/Lit_Alchemist.csv
data/dump/en/Lit_BrokenApril.csv
data/dump/en/Lit_MagicMountain.csv
data/dump/en/Lit_NorthWind.csv
data/dump/en/Lit_Solaris.csv
data/dump/en/PopSci_Caveman.csv
data/dump/en/PopSci_MultiplEYE.csv





In [None]:
# !zip -r /content/data_dump.zip /content/data/dump
# from google.colab import files
# files.download('/content/data_dump.zip')