In [17]:
import pandas as pd
import json
import re
import spacy
import os

# Name of the stimulus workbook; the JSON and CSV output names are derived from it.
file = "multipleye_stimuli_experiment_en.xlsx"
# Full path of the workbook inside the /content/ directory.
excel_path = os.path.join("/content/", file)

In [18]:
# Read the stimulus workbook (pandas loads the first sheet unless told otherwise).
df = pd.read_excel(excel_path)

# The set of page columns is the same for every row, so select it once instead of
# re-scanning all headers per row. The isinstance guard skips non-string headers
# (e.g. a numeric header cell), on which `.startswith` would raise AttributeError.
page_cols = [
    col for col in df.columns
    if isinstance(col, str) and col.startswith("page_")
]

# One dict per workbook row: id, name, type and the non-empty page texts
# in the order the page_* columns appear in the sheet.
stimuli = []
for _, row in df.iterrows():
    stimuli.append({
        "stimulus_id": int(row["stimulus_id"]),
        "stimulus_name": row["stimulus_name"],
        "stimulus_type": row["stimulus_type"],
        # Empty cells (NaN) are skipped; everything else is stringified and trimmed.
        "pages": [
            str(row[col]).strip()
            for col in page_cols
            if pd.notna(row[col])
        ],
    })

len(stimuli)

12

In [22]:
# Same base name as the workbook, with a .json extension.
json_file = "{}.json".format(os.path.splitext(file)[0])

# Serialise the stimuli as indented UTF-8 JSON; ensure_ascii=False keeps
# non-ASCII characters readable instead of \u-escaping them.
serialized = json.dumps(stimuli, indent=2, ensure_ascii=False)
with open(os.path.join("/content/", json_file), "w", encoding="utf-8") as f:
    f.write(serialized)

print("saved", json_file)

saved multipleye_stimuli_experiment_en.json


In [23]:
# Load spaCy's `en_core_web_sm` pipeline once at module level; `ttr_from_text`
# below calls it to tokenise text. The model package must be installed in the
# environment for this call to succeed.
nlp = spacy.load("en_core_web_sm")

def preprocess(text: str) -> str:
    """Normalise raw stimulus text before tokenisation.

    Lower-cases the text, replaces every punctuation/symbol character
    (including apostrophes and underscores) with a space, collapses runs of
    whitespace to a single space and trims the ends.

    Letters and digits are recognised through the Unicode-aware ``\\w`` class,
    so accented letters survive ("café" stays one word). The previous
    ASCII-only class ``[^a-z0-9\\s]`` treated them as punctuation and split or
    truncated such words, which distorted token and type counts. For pure
    ASCII input the result is unchanged.

    Note that contractions are split because the apostrophe becomes a space
    ("don't" -> "don t").

    Args:
        text: Raw text.

    Returns:
        The normalised, single-spaced, lower-case text.
    """
    text = text.lower()
    # Replace anything that is neither a word character nor whitespace.
    # `_` counts as a word character for \w, so it is listed explicitly to
    # keep treating it as a separator.
    text = re.sub(r"[^\w\s]|_", " ", text)
    # Collapse the whitespace runs left behind by the substitution above.
    text = re.sub(r"\s+", " ", text).strip()
    return text

def ttr_from_text(text):
    """Compute the type-token ratio (TTR) of ``text``.

    The text is tokenised with the module-level ``nlp`` pipeline and only
    tokens whose ``is_alpha`` flag is set are counted. Types are the distinct
    token strings, compared exactly as they appear.

    Returns:
        A ``(ttr, num_tokens, num_types)`` tuple, or ``(0.0, 0, 0)`` when the
        text contains no alphabetic token.
    """
    words = [token.text for token in nlp(text) if token.is_alpha]
    n_tokens = len(words)
    if n_tokens == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0, 0, 0
    n_types = len(set(words))
    return n_types / n_tokens, n_tokens, n_types

In [24]:
def _ttr_record(stimulus, scope, page_no, raw_text):
    """Build one result row: preprocess ``raw_text`` and attach its TTR statistics."""
    ratio, token_count, type_count = ttr_from_text(preprocess(raw_text))
    return {
        "stimulus_id": stimulus["stimulus_id"],
        "stimulus_name": stimulus["stimulus_name"],
        "scope": scope,
        "page": page_no,
        "num_tokens": token_count,
        "num_types": type_count,
        "ttr": ratio,
    }


rows = []

for stimulus in stimuli:
    # One row for the whole stimulus: all pages joined by a single space.
    rows.append(
        _ttr_record(stimulus, "full_text", None, " ".join(stimulus["pages"]))
    )

    # Followed by one row per page, numbered from 1.
    rows.extend(
        _ttr_record(stimulus, "page", page_no, page_text)
        for page_no, page_text in enumerate(stimulus["pages"], start=1)
    )

In [25]:
# Same base name as the workbook, with a _ttr.csv suffix.
csv_file = "{}_ttr.csv".format(os.path.splitext(file)[0])

# Order the rows by stimulus, then scope ("full_text" sorts before "page"),
# then page number, and renumber the index to match.
ttr_table = pd.DataFrame(rows)
out = ttr_table.sort_values(by=["stimulus_id", "scope", "page"], ignore_index=True)
out.to_csv(os.path.join("/content/", csv_file), index=False, encoding="utf-8")

print("saved", csv_file)
out.head(12)

saved multipleye_stimuli_experiment_en_ttr.csv


Unnamed: 0,stimulus_id,stimulus_name,scope,page,num_tokens,num_types,ttr
0,1,PopSci_MultiplEYE,full_text,,814,334,0.410319
1,1,PopSci_MultiplEYE,page,1.0,71,53,0.746479
2,1,PopSci_MultiplEYE,page,2.0,67,53,0.791045
3,1,PopSci_MultiplEYE,page,3.0,68,52,0.764706
4,1,PopSci_MultiplEYE,page,4.0,65,43,0.661538
5,1,PopSci_MultiplEYE,page,5.0,85,57,0.670588
6,1,PopSci_MultiplEYE,page,6.0,75,56,0.746667
7,1,PopSci_MultiplEYE,page,7.0,84,55,0.654762
8,1,PopSci_MultiplEYE,page,8.0,86,61,0.709302
9,1,PopSci_MultiplEYE,page,9.0,91,70,0.769231
