In [3]:
import json
import spacy
from transformers import AutoTokenizer

import metrics

In [23]:
def _read_stimuli(path):
    """Parse one UTF-8 encoded JSON file and return the decoded content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# One stimuli file per language; the file names differ only in their
# language suffix (note that the Romanian suffix is upper-case on disk).
en_data = _read_stimuli("/content/multipleye_stimuli_experiment_en.json")
ro_data = _read_stimuli("/content/multipleye_stimuli_experiment_RO.json")
de_data = _read_stimuli("/content/multipleye_stimuli_experiment_de.json")
el_data = _read_stimuli("/content/multipleye_stimuli_experiment_el.json")
fr_data = _read_stimuli("/content/multipleye_stimuli_experiment_fr.json")
zh_data = _read_stimuli("/content/multipleye_stimuli_experiment_zh.json")

In [None]:
# Download the Romanian, German, Greek, French and Chinese spaCy pipelines
# that the next cell loads with spacy.load.
# NOTE(review): "en_core_web_sm" is loaded in the next cell as well but is not
# downloaded here - presumably it is preinstalled in this runtime; confirm.
!python -m spacy download ro_core_news_sm
!python -m spacy download de_core_news_sm
!python -m spacy download el_core_news_sm
!python -m spacy download fr_core_news_sm
!python -m spacy download zh_core_web_sm

In [24]:
# Tokenizer fetched under the "gpt2" identifier; it is not referenced again
# in the cells visible in this notebook.
tok = AutoTokenizer.from_pretrained("gpt2")

# One spaCy pipeline per language, loaded in the order listed.
nlp_en, nlp_ro, nlp_de, nlp_el, nlp_fr, nlp_zh = map(
    spacy.load,
    (
        "en_core_web_sm",
        "ro_core_news_sm",
        "de_core_news_sm",
        "el_core_news_sm",
        "fr_core_news_sm",
        "zh_core_web_sm",
    ),
)

In [25]:
def collect_pronouns(data, nlp, label=None, verbose=False):
    """Run ``metrics.pronouns`` at every aggregation level for one language.

    Args:
        data: Stimuli content, passed through to ``metrics.pronouns``.
        nlp: spaCy pipeline, passed through to ``metrics.pronouns``.
        label: Optional heading printed before the results in verbose mode.
        verbose: When true, print the "lang" result followed by ``.head()``
            of the "doc", "page" and "sentence" results.

    Returns:
        A 4-tuple with the results for the levels
        ("lang", "doc", "page", "sentence"), in that order.
    """
    levels = ("lang", "doc", "page", "sentence")
    by_level = {}
    for level in levels:
        by_level[level] = metrics.pronouns(data, nlp, level=level)

    if verbose:
        if label:
            print(label)
        # The "lang" result is printed whole; the others only via .head().
        print(by_level["lang"], "\n")
        for level in levels[1:]:
            print(by_level[level].head(), "\n")

    return tuple(by_level[level] for level in levels)

# Pronoun results for every language. Each call returns the
# (lang, doc, page, sentence) results in that order; verbose stays off,
# so the label is not printed.
(pronouns_lang_en, pronouns_doc_en,
 pronouns_page_en, pronouns_sentence_en) = collect_pronouns(en_data, nlp_en, label="ENGLISH")
(pronouns_lang_ro, pronouns_doc_ro,
 pronouns_page_ro, pronouns_sentence_ro) = collect_pronouns(ro_data, nlp_ro, label="ROMANIAN")
(pronouns_lang_de, pronouns_doc_de,
 pronouns_page_de, pronouns_sentence_de) = collect_pronouns(de_data, nlp_de, label="GERMAN")
(pronouns_lang_el, pronouns_doc_el,
 pronouns_page_el, pronouns_sentence_el) = collect_pronouns(el_data, nlp_el, label="GREEK")
(pronouns_lang_fr, pronouns_doc_fr,
 pronouns_page_fr, pronouns_sentence_fr) = collect_pronouns(fr_data, nlp_fr, label="FRENCH")
(pronouns_lang_zh, pronouns_doc_zh,
 pronouns_page_zh, pronouns_sentence_zh) = collect_pronouns(zh_data, nlp_zh, label="CHINESE")

In [26]:
import pandas as pd

# Sentence-level pronoun tables keyed by the display name of each language.
sent_by_lang = {
    "English": pronouns_sentence_en,
    "Romanian": pronouns_sentence_ro,
    "German": pronouns_sentence_de,
    "Greek": pronouns_sentence_el,
    "French": pronouns_sentence_fr,
    "Chinese": pronouns_sentence_zh,
}

# One summary row per language: mean and standard deviation of the
# "pronouns" column over all rows of that language's table.
lang_rows = [
    {
        "lang": lang_name,
        "mean": sentences["pronouns"].mean(),
        "std": sentences["pronouns"].std(),
    }
    for lang_name, sentences in sent_by_lang.items()
]
lang_df = pd.DataFrame(lang_rows)

# The same two statistics per language, grouped by "stimulus_name".
doc_stats = {
    lang_name: (
        sentences.groupby("stimulus_name")
        .agg(mean=("pronouns", "mean"), std=("pronouns", "std"))
        .reset_index()
    )
    for lang_name, sentences in sent_by_lang.items()
}

# Sorted union of the stimulus names that occur in any language.
all_docs = sorted({
    name
    for stats in doc_stats.values()
    for name in stats["stimulus_name"]
})
lang_order = lang_df["lang"].tolist()

In [27]:
import plotly.graph_objects as go

# Fixed hex colour per language so that every figure shares one palette.
LANG_COLORS = dict(
    English="#ef4444",
    Romanian="#f59e0b",
    German="#10b981",
    Greek="#3b82f6",
    French="#ec4899",
    Chinese="#22d3ee",
)
# Colour returned for any language that has no entry in LANG_COLORS.
DEFAULT_COLOR = "#f97316"

# Order in which the languages are iterated when building the figures.
lang_order = [
    "English",
    "Romanian",
    "German",
    "Greek",
    "French",
    "Chinese",
]


def color_for(lang):
    """Return the hex colour for ``lang``, or DEFAULT_COLOR when it has none."""
    if lang in LANG_COLORS:
        return LANG_COLORS[lang]
    return DEFAULT_COLOR

# Bar chart with one trace per language: mean pronouns per sentence,
# with the standard deviation drawn as the error bar.
fig_lang = go.Figure()
for lang in lang_order:
    # First row of lang_df whose "lang" matches this language.
    summary = lang_df.loc[lang_df["lang"] == lang].iloc[0]
    bar = go.Bar(
        name=lang,
        x=[lang],
        y=[summary["mean"]],
        error_y={"type": "data", "array": [summary["std"]]},
        marker_color=color_for(lang),
    )
    fig_lang.add_trace(bar)

fig_lang.update_layout(
    title="Pronouns per sentence by language",
    yaxis_title="Avg pronouns per sentence",
    barmode="group",
    template="plotly_dark",
)
# Pin the x-axis categories to the order given by lang_order.
fig_lang.update_xaxes(categoryarray=lang_order, categoryorder="array")
fig_lang.show()

# Grouped bar chart: for every document, one bar per language showing the
# mean pronouns per sentence with the standard deviation as the error bar.
fig_all_docs = go.Figure()

for lang in lang_order:
    df = doc_stats[lang]
    fig_all_docs.add_trace(go.Bar(
        x=df["stimulus_name"],
        y=df["mean"],
        error_y=dict(type="data", array=df["std"]),
        name=lang,
        marker_color=color_for(lang)
    ))

fig_all_docs.update_layout(
    barmode="group",
    yaxis_title="Avg pronouns per sentence",
    title="All documents pronouns per sentence by language",
    template="plotly_dark",
)
# The x-axis of this figure holds document names, so pin its category order
# to all_docs. (This line previously re-applied the language ordering to
# fig_lang, which had already been displayed and is unrelated to this chart.)
fig_all_docs.update_xaxes(categoryorder="array", categoryarray=all_docs)
fig_all_docs.show()

def doc_arrays(doc_name):
    """Collect the per-language mean and std for one document.

    Walks ``lang_order`` and looks ``doc_name`` up in each language's
    ``doc_stats`` table.

    Returns:
        ``(means, stds)``: two lists aligned with ``lang_order``. A language
        whose table has no row for ``doc_name`` contributes ``None`` to both.
    """
    means, stds = [], []
    for lang in lang_order:
        stats = doc_stats[lang]
        match = stats.loc[stats["stimulus_name"] == doc_name]
        found = not match.empty
        # Only the first matching row is used.
        means.append(match["mean"].values[0] if found else None)
        stds.append(match["std"].values[0] if found else None)
    return means, stds

# Per-document bar chart with a dropdown to switch between documents.
# The figure starts on the first entry of all_docs.
initial_doc = all_docs[0]
y0, e0 = doc_arrays(initial_doc)

fig_doc = go.Figure()

# One single-bar trace per language, in lang_order, so that the i-th entry of
# every button payload below lines up with the i-th trace.
for i, lang in enumerate(lang_order):
    fig_doc.add_trace(go.Bar(
        x=[lang],
        y=[y0[i]],
        error_y=dict(type="data", array=[e0[i]]),
        name=lang,
        marker_color=color_for(lang),
    ))

buttons = []
for doc_name in all_docs:
    ys, es = doc_arrays(doc_name)
    buttons.append(dict(
        label=doc_name,
        # "update" takes [trace updates, layout updates]. The previous
        # "restyle" only changed the bars, so the title kept naming
        # initial_doc after another document was selected.
        method="update",
        args=[
            {
                "y": [[v] for v in ys],
                # A language without this document gets a zero-length error bar.
                "error_y.array": [[s if (s is not None) else 0] for s in es],
            },
            {
                "title.text": f"Document pronouns per sentence by language: {doc_name}",
            },
        ],
    ))

fig_doc.update_layout(
    updatemenus=[dict(
        buttons=buttons,
        direction="down",
        x=0, y=1.15, xanchor="left", yanchor="top"
    )],
    yaxis_title="Avg pronouns per sentence",
    title=f"Document pronouns per sentence by language: {initial_doc}",
    template="plotly_dark",
)
# Pin the language order on this figure's x-axis. (This line previously
# targeted fig_lang, leaving fig_doc with the default category order.)
fig_doc.update_xaxes(categoryorder="array", categoryarray=lang_order)
fig_doc.show()