In [131]:
import json
import spacy
from transformers import AutoTokenizer
import metrics

In [132]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return its content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


# Stimulus files, one per language.
en_data = _read_json("/content/multipleye_stimuli_experiment_en.json")
ro_data = _read_json("/content/multipleye_stimuli_experiment_RO.json")

In [None]:
# Fetch the Romanian spaCy pipeline that spacy.load("ro_core_news_sm") needs below.
# NOTE(review): "en_core_web_sm" is loaded below as well but is never downloaded
# in this notebook -- presumably it is pre-installed in this environment; confirm
# before running on a fresh setup.
!python -m spacy download ro_core_news_sm

In [134]:
# spaCy pipelines for English and Romanian (in that order), plus the GPT-2
# tokenizer that is handed to metrics.fertility further down.
nlp_en, nlp_ro = [
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "ro_core_news_sm")
]
tok = AutoTokenizer.from_pretrained("gpt2")

In [129]:
# Pronoun counts for both languages at every aggregation level.
# The lang-level result is printed whole; finer levels are previewed via .head().
print('ENGLISH')

pronouns_en = {}
for lvl in ("lang", "doc", "page", "sentence"):
    pronouns_en[lvl] = metrics.pronouns(en_data, nlp_en, level=lvl)
    shown = pronouns_en[lvl] if lvl == "lang" else pronouns_en[lvl].head()
    print(shown, '\n')

# Bind the per-level results to the names the later cells rely on.
pronouns_lang_en = pronouns_en["lang"]
pronouns_doc_en = pronouns_en["doc"]
pronouns_page_en = pronouns_en["page"]
pronouns_sentence_en = pronouns_en["sentence"]

print('ROMANIAN')

pronouns_ro = {}
for lvl in ("lang", "doc", "page", "sentence"):
    pronouns_ro[lvl] = metrics.pronouns(ro_data, nlp_ro, level=lvl)
    shown = pronouns_ro[lvl] if lvl == "lang" else pronouns_ro[lvl].head()
    print(shown, '\n')

pronouns_lang_ro = pronouns_ro["lang"]
pronouns_doc_ro = pronouns_ro["doc"]
pronouns_page_ro = pronouns_ro["page"]
pronouns_sentence_ro = pronouns_ro["sentence"]

ENGLISH
  level  total_pronouns  n_sentences  total_words
0  lang             447          288         6356 

   stimulus_id         stimulus_name  total_pronouns  n_sentences  total_words
0            1     PopSci_MultiplEYE              43           37          811
1            2       Ins_HumanRights               9            9          327
2            3  Ins_LearningMobility              10           20          531
3            4         Lit_Alchemist              77           25          494
4            6     Lit_MagicMountain              54           17          508 

   stimulus_id      stimulus_name  page  total_pronouns  n_sentences  \
0            1  PopSci_MultiplEYE     1               1            4   
1            1  PopSci_MultiplEYE     2               1            3   
2            1  PopSci_MultiplEYE     3               1            2   
3            1  PopSci_MultiplEYE     4               4            3   
4            1  PopSci_MultiplEYE     5               

In [136]:
# Punctuation counts for both languages at lang, doc and page level.
# The lang-level result is printed whole; finer levels are previewed via .head().
print('ENGLISH')

punctuation_en = {}
for lvl in ("lang", "doc", "page"):
    punctuation_en[lvl] = metrics.punctuation(en_data, nlp_en, level=lvl)
    shown = punctuation_en[lvl] if lvl == "lang" else punctuation_en[lvl].head()
    print(shown, '\n')

# Bind the per-level results to the names the later cells rely on.
punctuation_lang_en = punctuation_en["lang"]
punctuation_doc_en = punctuation_en["doc"]
punctuation_page_en = punctuation_en["page"]

print('ROMANIAN')

punctuation_ro = {}
for lvl in ("lang", "doc", "page"):
    punctuation_ro[lvl] = metrics.punctuation(ro_data, nlp_ro, level=lvl)
    shown = punctuation_ro[lvl] if lvl == "lang" else punctuation_ro[lvl].head()
    print(shown, '\n')

punctuation_lang_ro = punctuation_ro["lang"]
punctuation_doc_ro = punctuation_ro["doc"]
punctuation_page_ro = punctuation_ro["page"]

ENGLISH
  level  total_punct  n_sentences
0  lang          870          288 

   stimulus_id         stimulus_name  total_punct  n_sentences
0            1     PopSci_MultiplEYE          109           37
1            2       Ins_HumanRights           28            9
2            3  Ins_LearningMobility           76           20
3            4         Lit_Alchemist           60           25
4            6     Lit_MagicMountain           76           17 

   stimulus_id      stimulus_name  page  total_punct  n_sentences
0            1  PopSci_MultiplEYE     1           16            4
1            1  PopSci_MultiplEYE     2            8            3
2            1  PopSci_MultiplEYE     3            8            2
3            1  PopSci_MultiplEYE     4           12            3
4            1  PopSci_MultiplEYE     5           14            4 

ROMANIAN
  level  total_punct  n_sentences
0  lang          835          301 

   stimulus_id         stimulus_name  total_punct  n_sentences
0 

In [138]:
# Tokenizer fertility inputs (word counts and `tok` token counts) for both
# languages at lang, doc and page level.
# The lang-level result is printed whole; finer levels are previewed via .head().
print('ENGLISH')

fertility_en = {}
for lvl in ("lang", "doc", "page"):
    fertility_en[lvl] = metrics.fertility(en_data, nlp_en, tok, level=lvl)
    shown = fertility_en[lvl] if lvl == "lang" else fertility_en[lvl].head()
    print(shown, '\n')

# Bind the per-level results to the names the later cells rely on.
fertility_lang_en = fertility_en["lang"]
fertility_doc_en = fertility_en["doc"]
fertility_page_en = fertility_en["page"]

print('ROMANIAN')

fertility_ro = {}
for lvl in ("lang", "doc", "page"):
    fertility_ro[lvl] = metrics.fertility(ro_data, nlp_ro, tok, level=lvl)
    shown = fertility_ro[lvl] if lvl == "lang" else fertility_ro[lvl].head()
    print(shown, '\n')

fertility_lang_ro = fertility_ro["lang"]
fertility_doc_ro = fertility_ro["doc"]
fertility_page_ro = fertility_ro["page"]

ENGLISH
  level  total_words  total_llm_tokens
0  lang         6356              8234 

   stimulus_id         stimulus_name  total_words  total_llm_tokens
0            1     PopSci_MultiplEYE          811              1048
1            2       Ins_HumanRights          327               420
2            3  Ins_LearningMobility          531               679
3            4         Lit_Alchemist          494               598
4            6     Lit_MagicMountain          508               616 

   stimulus_id      stimulus_name  page  total_words  total_llm_tokens
0            1  PopSci_MultiplEYE     1           71               104
1            1  PopSci_MultiplEYE     2           67                92
2            1  PopSci_MultiplEYE     3           68                88
3            1  PopSci_MultiplEYE     4           65                77
4            1  PopSci_MultiplEYE     5           84               107 

ROMANIAN
  level  total_words  total_llm_tokens
0  lang         6286      

In [140]:
# Type-token ratio results for both languages at lang, doc and page level.
# The lang-level result is printed whole; finer levels are previewed via .head().
print('ENGLISH')

ttr_en = {}
for lvl in ("lang", "doc", "page"):
    ttr_en[lvl] = metrics.ttr(en_data, nlp_en, level=lvl)
    shown = ttr_en[lvl] if lvl == "lang" else ttr_en[lvl].head()
    print(shown, '\n')

# Bind the per-level results to the names the later cells rely on.
ttr_lang_en = ttr_en["lang"]
ttr_doc_en = ttr_en["doc"]
ttr_page_en = ttr_en["page"]

print('ROMANIAN')

ttr_ro = {}
for lvl in ("lang", "doc", "page"):
    ttr_ro[lvl] = metrics.ttr(ro_data, nlp_ro, level=lvl)
    shown = ttr_ro[lvl] if lvl == "lang" else ttr_ro[lvl].head()
    print(shown, '\n')

ttr_lang_ro = ttr_ro["lang"]
ttr_doc_ro = ttr_ro["doc"]
ttr_page_ro = ttr_ro["page"]

ENGLISH
  level  num_tokens  num_types       ttr
0  lang        6439       4748  0.737382 

   stimulus_id         stimulus_name  num_tokens  num_types       ttr
0            1     PopSci_MultiplEYE         814        596  0.732187
1            2       Ins_HumanRights         327        241  0.737003
2            3  Ins_LearningMobility         534        381  0.713483
3            4         Lit_Alchemist         495        327  0.660606
4            6     Lit_MagicMountain         509        362  0.711198 

   stimulus_id      stimulus_name  page  num_tokens  num_types       ttr
0            1  PopSci_MultiplEYE     1          71         53  0.746479
1            1  PopSci_MultiplEYE     2          67         53  0.791045
2            1  PopSci_MultiplEYE     3          68         52  0.764706
3            1  PopSci_MultiplEYE     4          65         43  0.661538
4            1  PopSci_MultiplEYE     5          85         57  0.670588 

ROMANIAN
  level  num_tokens  num_types       

In [143]:
def _pooled_ratio(frame, numerator, denominator):
    """Return the sum of column `numerator` divided by the sum of column `denominator`."""
    return frame[numerator].sum() / frame[denominator].sum()


# Pronouns per sentence, pooled: total pronouns / total sentences.
pronouns_avg_en = _pooled_ratio(pronouns_lang_en, "total_pronouns", "n_sentences")
print("English average pronouns per sentence:", pronouns_avg_en)

pronouns_avg_ro = _pooled_ratio(pronouns_lang_ro, "total_pronouns", "n_sentences")
print("Romanian average pronouns per sentence:", pronouns_avg_ro)

# Pronouns per sentence as the plain mean of the sentence-level `pronouns` column.
# NOTE(review): the recorded outputs of this mean differ from the pooled ratio
# above, which suggests the sentence-level frames do not have `n_sentences`
# rows -- confirm against metrics.pronouns.
pronouns_avg_en2 = pronouns_sentence_en["pronouns"].mean()
print("English average pronouns per sentence (equal weight):", pronouns_avg_en2)

pronouns_avg_ro2 = pronouns_sentence_ro["pronouns"].mean()
print("Romanian average pronouns per sentence (equal weight):", pronouns_avg_ro2)


# Punctuation marks per sentence: total_punct / n_sentences.
punctuation_avg_en = _pooled_ratio(punctuation_lang_en, "total_punct", "n_sentences")
print("English average punctuation per sentence:", punctuation_avg_en)

punctuation_avg_ro = _pooled_ratio(punctuation_lang_ro, "total_punct", "n_sentences")
print("Romanian average punctuation per sentence:", punctuation_avg_ro)


# Fertility: tokenizer tokens per word (total_llm_tokens / total_words).
fertility_avg_en = _pooled_ratio(fertility_lang_en, "total_llm_tokens", "total_words")
print("English average fertility per word:", fertility_avg_en)

fertility_avg_ro = _pooled_ratio(fertility_lang_ro, "total_llm_tokens", "total_words")
print("Romanian average fertility per word:", fertility_avg_ro)


# Type-token ratio: num_types / num_tokens.
ttr_avg_en = _pooled_ratio(ttr_lang_en, "num_types", "num_tokens")
print("English average ttr per word:", ttr_avg_en)

ttr_avg_ro = _pooled_ratio(ttr_lang_ro, "num_types", "num_tokens")
print("Romanian average ttr per word:", ttr_avg_ro)

English average pronouns per sentence: 1.5520833333333333
Romanian average pronouns per sentence: 1.1727574750830565
English average pronouns per sentence (equal weight): 1.6079136690647482
Romanian average pronouns per sentence (equal weight): 1.2385964912280703
English average punctuation per sentence: 3.0208333333333335
Romanian average punctuation per sentence: 2.7740863787375414
English average fertility per word: 1.2954688483322845
Romanian average fertility per word: 2.8054406617881007
English average ttr per word: 0.7373815809908371
Romanian average ttr per word: 0.7630527437400106
