In [3]:
import os

LANGS = [x.strip() for x in os.getenv("LANGS", "tr").split(",") if x.strip()]
OUT_DIR = os.getenv("OUT_DIR", "data/dump").strip()

print("params:", LANGS, OUT_DIR)
os.makedirs(OUT_DIR, exist_ok=True)

params: ['tr'] data/dump


## Get spacy models

In [2]:
import sys, subprocess
!python -m spacy download xx_sent_ud_sm
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "spacy==3.4.4", "numpy<1.23"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
    "tr_core_news_lg @ https://huggingface.co/turkish-nlp-suite/tr_core_news_lg/resolve/main/tr_core_news_lg-1.0-py3-none-any.whl"])
language_models = {
    "tr": "tr_core_news_lg",
}

Collecting xx-sent-ud-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_sent_ud_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

## Get the MultiplEYE json data

In [None]:
! rm -rf languages*
! wget https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
! unzip languages_json_all.zip

In [11]:
SPACY_LANGUAGES = ["tr"]

CODE2LANG = {
    "tr": "Turkish",
}

LANGUAGES = list(CODE2LANG.keys())

## Load data

In [12]:
import os
import json
import spacy

def load_all_json(lang_folder):
    all_data = {}
    for file in os.listdir(lang_folder):
        if file.endswith('.json'):
            lang_code = file.replace('.json', '').replace('multipleye_stimuli_experiment_', '')
            if lang_code == 'zd':
                lang_code = 'gsw'
            if (lang_code not in LANGUAGES) or (lang_code not in LANGS):
                continue
            with open(os.path.join(lang_folder, file), 'r', encoding='utf-8') as f:
                all_data[lang_code] = json.load(f)
    return all_data

In [13]:
all_data = load_all_json('languages_json')
for k,v in all_data.items():
  print(k, v[0])

tr {'stimulus_id': 1, 'stimulus_name': 'PopSci_MultiplEYE', 'stimulus_type': 'experiment', 'pages': ['MultiplEYE Projesi\n\n“MultiplEYE” adı, “çok dillilik” veya “çoklu dil” ile “göz izleme”den gelen “göz”ü\nbirleştiren bir kelime oyunudur. MultiplEYE, Avrupa Birliği tarafından finanse\nedilen bir COST Aksiyonudur. COST Aksiyonları, Bilim ve Teknolojide Avrupa\nİşbirliği, kısaca COST, tarafından desteklenen araştırma ağlarıdır. Bir finansman\nkuruluşu olarak COST, farklı ağ oluşturma faaliyetlerinin yürütülmesi için mali\nyardım sağlayarak Avrupa ve ötesinde büyüyen araştırmacı ağımızı desteklemektedir.', 'Bu faaliyetler arasında çalışma grubu toplantıları, genç araştırmacılarla beceri\npaylaşımı için eğitim okulları ve bilimsel araştırma ziyaretleri yer almaktadır.\nMultiplEYE COST Action’ın proje başlığı: İnsan ve makine dili işleme araştırmaları\niçin çok dilli göz izleme verilerinin toplanmasının sağlanmasıdır. Bu, MultiplEYE\nCOST Aksiyonunun çok sayıda dilde okuma göz izleme veri

## Prepare spaCy code to generate template csv files

In [14]:
LANG_FOLDER = "languages_json"
NLP_MODEL = None
CURRENT_LANG = ''
IN_DIR = 'languages_json/'

from spacy.util import get_lang_class


def exists_spacy_blank(lang_code):
    try:
        get_lang_class(lang_code)
        return True
    except:
        return False

def load_spacy_model(lang_code, small=True):
    model = None
    if lang_code in SPACY_LANGUAGES:
        genre = 'news'
        if lang_code in {'zh', 'en'}:
            genre = 'web'
        if lang_code == 'rm':
            return ''
        model_name = f'{lang_code}_core_{genre}_{"sm" if small else "lg"}'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    elif lang_code == "rm":
        model = spacy.load("it_core_news_lg")
        # keep 'morphologizer' ?
        model.disable_pipes('tok2vec', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler', 'ner')
    elif lang_code == 'gsw':
        model = spacy.load('de_core_news_lg')
    elif exists_spacy_blank(lang_code):
        print(f"Loading model blank model for {lang_code}")
        model = spacy.blank(lang_code)
        model.add_pipe("sentencizer")
    else:
        model_name = f'xx_sent_ud_sm'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    return model


def get_nlp(lang_code, small=False):
    """To avoid loading all models at the same time
    """
    global NLP_MODEL, CURRENT_LANG
    if lang_code != CURRENT_LANG:
        try:
            print(f"Deleting model for {CURRENT_LANG}")
            del NLP_MODEL
        except:
            print("No model to delete")
        print(f"Loading model for {lang_code}")
        NLP_MODEL = load_spacy_model(lang_code, small=small)
        CURRENT_LANG = lang_code
    return NLP_MODEL


In [15]:
def feats_str(token):
    if not token.morph:
        return "_"
    md = token.morph.to_dict()
    if not md:
        return "_"
    bits = []
    for k in sorted(md):
        v = md[k]
        if isinstance(v, (list, tuple)):
            bits.append(f"{k}={','.join(v)}")
        else:
            bits.append(f"{k}={v}")
    return "|".join(bits) if bits else "_"


def get_head(token, sent):
    if token.head == token or token.dep_ == "ROOT":
        head = 0
        deprel = "root"
    else:
        head = (token.head.i - sent.start) + 1  # 1-based in sentence
        deprel = token.dep_.lower() if token.dep_ else "_"
    return head, deprel


def get_misc(token, include_ner=True):
    misc_parts = []
    if not token.whitespace_:
        misc_parts.append("SpaceAfter=No")
    if include_ner and token.ent_iob_ != "O":
        misc_parts.append(f"NER={token.ent_iob_}-{token.ent_type_}")
    misc = "|".join(misc_parts) if misc_parts else "_"
    return misc


def iter_pages(stimuli, nlp):
    for stim in stimuli:
        sid, sname = stim["stimulus_id"], stim["stimulus_name"]
        for pnum, page_text in enumerate(stim["pages"], start=1):
            yield sid, sname, pnum, nlp(page_text)

def stimuli2csv(stimuli, lang_code, level="page", small=False):
    rows = []
    nlp = get_nlp(lang_code, small=small)
    for sid, sname, page, doc in iter_pages(stimuli, nlp):
        ptext = doc.text
        document = nlp(ptext)
        for sent_idx, sentence in enumerate(document.sents):
            eos = {
              "language": CODE2LANG[lang_code],
              "language_code": lang_code,
              "stimulus_name": sname,
              "page": page,
              #"sent_idx": sent_idx+1,
              "token": "<eos>",
              "is_alpha": False,
              "is_stop": False,
              "is_punct": False,
              "lemma": "",
              "upos": "",
              "xpos": "",
              "feats": "",
              "head": "",
              "deprel": "",
              "deps": "",
              "misc": ""
              }
            for token in sentence:
                head, deprel = get_head(token, sentence)
                rows.append(
                    {
                        #"stimulus_id": sid,
                        "language": CODE2LANG[lang_code],
                        "language_code": lang_code,
                        "stimulus_name": sname,
                        "page": page,
                        #"sent_idx": sent_idx+1,
                        "token": token.text,
                        "is_alpha": token.is_alpha,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "lemma": token.lemma_,
                        "upos": token.pos_,
                        "xpos": token.tag_,
                        "feats": feats_str(token),
                        "head": head,
                        "deprel": deprel,
                        "deps": "_",
                        "misc": get_misc(token, include_ner=True)
                    }
                )
            rows.append(eos)


    df = pd.DataFrame(rows).sort_values(by=["stimulus_name", "page"])
    df = pd.DataFrame(rows)
    return df

In [None]:
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

preproc = defaultdict(dict)
for lang_code, data in tqdm(all_data.items()):
    if lang_code not in LANGS:
        continue
    preproc[lang_code] = stimuli2csv(data, lang_code, small=False)

## Generate csv templates

## Save

In [None]:
import os
from tqdm import tqdm

for lang_code, df in tqdm(preproc.items()):
    lang_out = 'gsw' if lang_code == 'zd' else lang_code
    out_dir = os.path.join(OUT_DIR, lang_out)
    os.makedirs(out_dir, exist_ok=True)

    for stim_name, group in df.groupby('stimulus_name'):
        out_fis = os.path.join(out_dir, f"{stim_name}.csv")
        g = group.copy()
        g['language_code'] = lang_out
        g.to_csv(out_fis, index=False)
        print(out_fis)

100%|██████████| 2/2 [00:00<00:00, 13.38it/s]

data/dump/ro/Arg_PISACowsMilk.csv
data/dump/ro/Arg_PISARapaNui.csv
data/dump/ro/Enc_WikiMoon.csv
data/dump/ro/Ins_HumanRights.csv
data/dump/ro/Ins_LearningMobility.csv
data/dump/ro/Lit_Alchemist.csv
data/dump/ro/Lit_BrokenApril.csv
data/dump/ro/Lit_MagicMountain.csv
data/dump/ro/Lit_NorthWind.csv
data/dump/ro/Lit_Solaris.csv
data/dump/ro/PopSci_Caveman.csv
data/dump/ro/PopSci_MultiplEYE.csv
data/dump/en/Arg_PISACowsMilk.csv
data/dump/en/Arg_PISARapaNui.csv
data/dump/en/Enc_WikiMoon.csv
data/dump/en/Ins_HumanRights.csv
data/dump/en/Ins_LearningMobility.csv
data/dump/en/Lit_Alchemist.csv
data/dump/en/Lit_BrokenApril.csv
data/dump/en/Lit_MagicMountain.csv
data/dump/en/Lit_NorthWind.csv
data/dump/en/Lit_Solaris.csv
data/dump/en/PopSci_Caveman.csv
data/dump/en/PopSci_MultiplEYE.csv



