In [1]:
import json
import spacy
from transformers import AutoTokenizer

import metrics

In [3]:
# Read the two stimulus JSON exports (the `en` and `RO` files) into memory.
def _read_json(path):
    """Return the parsed contents of the UTF-8 encoded JSON file at `path`."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


en_data = _read_json("/content/multipleye_stimuli_experiment_en.json")
ro_data = _read_json("/content/multipleye_stimuli_experiment_RO.json")

In [None]:
!python -m spacy download ro_core_news_sm

In [None]:
# spaCy pipelines: `nlp_en` is used with the English data, `nlp_ro` with the
# Romanian data. Both are loaded in the same order as the names are listed.
nlp_en, nlp_ro = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "ro_core_news_sm")
)

# GPT-2 tokenizer; it is passed to metrics.fertility in a later cell.
tok = AutoTokenizer.from_pretrained("gpt2")

In [8]:
# Each metrics helper returns a pair that is unpacked as (values, average);
# only the average is reported in this cell.

# --- Pronouns per sentence ---
en_vals, en_avg = metrics.pronouns_per_sentence(en_data, nlp_en)
ro_vals, ro_avg = metrics.pronouns_per_sentence(ro_data, nlp_ro)
print("English average pronouns per sentence:", en_avg)
print("Romanian average pronouns per sentence:", ro_avg)

# --- Punctuation per sentence ---
# NOTE: en_vals/en_avg/ro_vals/ro_avg are rebound here, so the pronoun results
# above are no longer available once these lines have run.
en_vals, en_avg = metrics.punctuation_per_sentence(en_data, nlp_en)
ro_vals, ro_avg = metrics.punctuation_per_sentence(ro_data, nlp_ro)
# These labels previously said "pronouns", although the values come from
# punctuation_per_sentence.
print("English average punctuation per sentence:", en_avg)
print("Romanian average punctuation per sentence:", ro_avg)

English average pronouns per sentence: 1.6079136690647482
Romanian average pronouns per sentence: 1.4631578947368422
English average pronouns per sentence: 3.129496402877698
Romanian average pronouns per sentence: 2.9298245614035086


In [12]:
# Fertility for both languages, computed with the shared tokenizer `tok`.
# metrics.fertility returns a pair unpacked as (values, average).
# NOTE(review): presumably "fertility" is the number of tokenizer pieces per
# word -- confirm against metrics.fertility.
en_vals, en_avg = metrics.fertility(en_data, nlp_en, tok)
ro_vals, ro_avg = metrics.fertility(ro_data, nlp_ro, tok)

# Report the first ten values and the average, English first.
for values_label, average_label, values, average in (
    ("Fertility values:", "Average fertility:", en_vals, en_avg),
    ("Romanian fertility values:", "Romanian average fertility:", ro_vals, ro_avg),
):
    print(values_label, values[:10])
    print(average_label, average)

Fertility values: [1, 4, 1, 1, 1, 4, 1, 1, 2, 2]
Average fertility: 1.2954688483322845
Romanian fertility values: [4, 4, 3, 4, 1, 1, 2, 1, 4, 1]
Romanian average fertility: 2.8054406617881007


In [14]:
# Type-token ratio (TTR). metrics.compute_ttr returns three values, unpacked
# here as (table, values, average); the table supports .head() and is shown
# below, and the average is printed under the "pages" label.
# NOTE(review): the exact contents of vals_en / vals_ro are not shown in this
# notebook -- confirm against metrics.compute_ttr.
df_en, vals_en, avg_en = metrics.compute_ttr(en_data, nlp_en)
print("Average English TTR - pages:", avg_en)
# display() (provided by the IPython kernel) renders the frame as a rich table;
# print() would emit line-wrapped plain text for wide frames.
display(df_en.head(10))

df_ro, vals_ro, avg_ro = metrics.compute_ttr(ro_data, nlp_ro)
print("\nAverage Romanian TTR - pages:", avg_ro)
display(df_ro.head(10))

Average English TTR - pages: 0.7625672295427283
   stimulus_id      stimulus_name      scope  page  num_tokens  num_types  \
0            1  PopSci_MultiplEYE  full_text   NaN         814        334   
1            1  PopSci_MultiplEYE       page   1.0          71         53   
2            1  PopSci_MultiplEYE       page   2.0          67         53   
3            1  PopSci_MultiplEYE       page   3.0          68         52   
4            1  PopSci_MultiplEYE       page   4.0          65         43   
5            1  PopSci_MultiplEYE       page   5.0          85         57   
6            1  PopSci_MultiplEYE       page   6.0          75         56   
7            1  PopSci_MultiplEYE       page   7.0          84         55   
8            1  PopSci_MultiplEYE       page   8.0          86         61   
9            1  PopSci_MultiplEYE       page   9.0          91         70   

        ttr  
0  0.410319  
1  0.746479  
2  0.791045  
3  0.764706  
4  0.661538  
5  0.670588  
6  0.7