# Compare Lexicons

We compare the streams generated with 

1. controlled lexicons (ours),
2. random baseline streams, and
3. streams generated based on reference lexicons from the literature

based on the repetitiveness of the phoneme features.

### Collect Results

We collect all stream generation results and their feature repetitiveness scores in a dataframe. To generate the data, you must run 
```bash
./generate_analysis_datasets.sh
```

In [6]:
import pandas as pd
from arpac import load_streams

data = {"Control": [], "Lexicon": [], "Cum_Feat_Rep": [], "Feature": [], "PRI": [], "Stream TP mode": [], "Stream": []}

mode_to_mode = {  # TP-random position-random; TP-random position-fixed and TP-structured
    "random": "TP-random position-random",
    "word_structured": "TP-structured",
    "position_controlled": "TP-random position-fixed"
}
controlled_streams_german = load_streams("results/default_german_2025-02-28_17-15-48/_arpac/streams.json")
controlled_streams_english = load_streams("results/default_english_2025-02-28_17-23-18/_arpac/streams.json")
random_streams_german = load_streams("results/random_german_2025-02-28_17-30-39/_arpac/streams.json")
random_streams_english = load_streams("results/random_english_2025-02-28_18-42-08/_arpac/streams.json")
literature_streams = load_streams("results/literature_streams_2025-02-28_19-09-40/_arpac/streams.json")

In [8]:

for control, streams in [
    ("Controlled lexicons (ARPAC, german)", controlled_streams_german), 
    ("Controlled lexicons (ARPAC, english)", controlled_streams_english), 
    ("Random lexicons (Baseline, german)", random_streams_german),
    ("Random lexicons (Baseline, english)", random_streams_english),
    ("Reference lexicons (Literature)", literature_streams), 
]:
    for stream in streams:
        for k, v in stream.info["rhythmicity_indexes"].items():
            data["Feature"].append(k)
            data["PRI"].append(v)
            data["Control"].append(control)
            data["Lexicon"].append(str(stream.info["lexicon"]))
            data["Stream TP mode"].append(mode_to_mode[stream.info["stream_tp_mode"]])
            data["Stream"].append("|".join(syll.id for syll in stream))
            data["Cum_Feat_Rep"].append(stream.info["lexicon_info"]["cumulative_feature_repetitiveness"])
        data["Feature"].append("max")
        data["PRI"].append(max(stream.info["rhythmicity_indexes"].values()))
        data["Control"].append(control)
        data["Lexicon"].append(str(stream.info["lexicon"]))
        data["Stream TP mode"].append(mode_to_mode[stream.info["stream_tp_mode"]])
        data["Stream"].append("|".join(syll.id for syll in stream))
        data["Cum_Feat_Rep"].append(stream.info["lexicon_info"]["cumulative_feature_repetitiveness"])

df = pd.DataFrame(data).sort_values(["Control", "Lexicon", "Stream TP mode"]).reset_index(drop=True)

import os
os.makedirs("results/", exist_ok=True)
df.to_csv("results/analysis_full_dataset.csv")

df

Unnamed: 0,Control,Lexicon,Cum_Feat_Rep,Feature,PRI,Stream TP mode,Stream
0,"Controlled lexicons (ARPAC, english)",biːɲɑːθɜː|tɔːhiːfɑː|mɔːsɔːkɔː|sɜːpɜːhuː,2,phon_1_son,0.052910,TP-random position-fixed,mɔː|pɜː|θɜː|sɜː|sɔː|huː|tɔː|ɲɑː|fɑː|biː|hiː|kɔ...
1,"Controlled lexicons (ARPAC, english)",biːɲɑːθɜː|tɔːhiːfɑː|mɔːsɔːkɔː|sɜːpɜːhuː,2,phon_1_back,0.007937,TP-random position-fixed,mɔː|pɜː|θɜː|sɜː|sɔː|huː|tɔː|ɲɑː|fɑː|biː|hiː|kɔ...
2,"Controlled lexicons (ARPAC, english)",biːɲɑːθɜː|tɔːhiːfɑː|mɔːsɔːkɔː|sɜːpɜːhuː,2,phon_1_hi,0.023810,TP-random position-fixed,mɔː|pɜː|θɜː|sɜː|sɔː|huː|tɔː|ɲɑː|fɑː|biː|hiː|kɔ...
3,"Controlled lexicons (ARPAC, english)",biːɲɑːθɜː|tɔːhiːfɑː|mɔːsɔːkɔː|sɜːpɜːhuː,2,phon_1_lab,0.076720,TP-random position-fixed,mɔː|pɜː|θɜː|sɜː|sɔː|huː|tɔː|ɲɑː|fɑː|biː|hiː|kɔ...
4,"Controlled lexicons (ARPAC, english)",biːɲɑːθɜː|tɔːhiːfɑː|mɔːsɔːkɔː|sɜːpɜːhuː,2,phon_1_cor,0.058201,TP-random position-fixed,mɔː|pɜː|θɜː|sɜː|sɔː|huː|tɔː|ɲɑː|fɑː|biː|hiː|kɔ...
...,...,...,...,...,...,...,...
98525,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_1_voi,0.034392,TP-structured,ɡoʊ|la|tu|da|ɹoʊ|pi|ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|...
98526,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_2_back,0.000000,TP-structured,ɡoʊ|la|tu|da|ɹoʊ|pi|ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|...
98527,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_2_hi,0.000000,TP-structured,ɡoʊ|la|tu|da|ɹoʊ|pi|ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|...
98528,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_2_lo,0.156085,TP-structured,ɡoʊ|la|tu|da|ɹoʊ|pi|ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|...


In [15]:
df[df["Feature"] == "max"].min()

Control                        Controlled lexicons (ARPAC, english)
Lexicon                     baːsuːhoː|siːmuːkɛː|huːsiːbyː|riːkuːvaː
Cum_Feat_Rep                                                      0
Feature                                                         max
PRI                                                        0.039683
Stream TP mode                             TP-random position-fixed
Stream            ba|fi|mu|noʊ|fu|ɡi|poʊ|koʊ|ka|vi|vu|ɡa|vi|koʊ|...
dtype: object

In [9]:
df_lexicons = df[["Control", "Lexicon"]].drop_duplicates().reset_index(drop=True)
df_lexicons.to_csv("results/analysis_all_lexicons.csv")
df_lexicons

Unnamed: 0,Control,Lexicon
0,"Controlled lexicons (ARPAC, english)",biːɲɑːθɜː|tɔːhiːfɑː|mɔːsɔːkɔː|sɜːpɜːhuː
1,"Controlled lexicons (ARPAC, english)",biːʃɔːhuː|dɔːfɜːɲɑː|fɔːɲiːtɜː|hiːθɜːpɜː
2,"Controlled lexicons (ARPAC, english)",biːʃɔːhuː|dɔːfɜːɲɑː|kɔːmɑːʃiː|hiːθɜːpɜː
3,"Controlled lexicons (ARPAC, english)",biːʃɔːhuː|kɔːmiːθɜː|tɔːfɜːɲiː|hiːtuːfɑː
4,"Controlled lexicons (ARPAC, english)",biːʃɔːhuː|kɜːluːfɜː|θɜːpɜːɲɑː|hiːfɔːtɔː
...,...,...
100,Reference lexicons (Literature),pukemi|rafinu|binapo|medoxi
101,Reference lexicons (Literature),tifaxu|soduxi|melubo|ɡanipe
102,Reference lexicons (Literature),tupiɹoʊ|ɡoʊlabu|padoʊti|bidaku
103,Reference lexicons (Literature),ɡonabe|mudila|ronixe|pikusa


In [14]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from pingouin import ttest

# There is always randomness in the generation of the lexicons etc., so if you want the exact data from the publication uncomment below 
# df = pd.read_csv("full_dataset.csv")

tp_modes_pretty = ["TP-random position-random", "TP-random position-fixed", "TP-structured"]
dfs = []

for i, tp_mode in enumerate(tp_modes_pretty):
    df2 = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    cat1 = df2[df2['Control']=='Controlled lexicons (ARPAC, german)']["PRI"]
    cat2 = df2[df2['Control']=='Reference lexicons (Literature)']["PRI"]
    this = ttest(list(cat1), list(cat2), alternative="two-sided")
    this.index = [f"controlled vs. reference {tp_mode}"]
    dfs.append(this)

print("")

for i, tp_mode in enumerate(tp_modes_pretty):
    df2 = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    cat1 = df2[df2['Control']=='Controlled lexicons (ARPAC, german)']["PRI"]
    cat2 = df2[df2['Control']=='Random lexicons (Baseline, german)']["PRI"]
    this = ttest(list(cat1), list(cat2), alternative="two-sided")
    this.index = [f"controlled vs. random baseline {tp_mode}"]
    dfs.append(this)

print("")

for i, tp_mode in enumerate(tp_modes_pretty):
    df2 = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    cat1 = df2[df2['Control']=='Reference lexicons (Literature)']["PRI"]
    cat2 = df2[df2['Control']=='Random lexicons (Baseline, german)']["PRI"]
    this = ttest(list(cat1), list(cat2), alternative="two-sided")
    this.index = [f"reference vs. random baseline {tp_mode}"]
    dfs.append(this)

for i, tp_mode in enumerate(tp_modes_pretty):
    df2 = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    cat1 = df2[df2['Control']=='Controlled lexicons (ARPAC, english)']["PRI"]
    cat2 = df2[df2['Control']=='Reference lexicons (Literature)']["PRI"]
    this = ttest(list(cat1), list(cat2), alternative="two-sided")
    this.index = [f"controlled vs. reference {tp_mode}"]
    dfs.append(this)

print("")

for i, tp_mode in enumerate(tp_modes_pretty):
    df2 = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    cat1 = df2[df2['Control']=='Controlled lexicons (ARPAC, english)']["PRI"]
    cat2 = df2[df2['Control']=='Random lexicons (Baseline, english)']["PRI"]
    this = ttest(list(cat1), list(cat2), alternative="two-sided")
    this.index = [f"controlled vs. random baseline {tp_mode}"]
    dfs.append(this)

print("")

for i, tp_mode in enumerate(tp_modes_pretty):
    df2 = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    cat1 = df2[df2['Control']=='Reference lexicons (Literature)']["PRI"]
    cat2 = df2[df2['Control']=='Random lexicons (Baseline, english)']["PRI"]
    this = ttest(list(cat1), list(cat2), alternative="two-sided")
    this.index = [f"reference vs. random baseline {tp_mode}"]
    dfs.append(this)

ttest_df = pd.concat(dfs).rename({"dof": "df"}, axis=1)

display(ttest_df)

ttest_df.to_csv("results/ttest_results.csv")







Unnamed: 0,T,df,alternative,p-val,CI95%,cohen-d,BF10,power
controlled vs. reference TP-random position-random,3.004819,838.0,two-sided,0.002736547,"[0.0, 0.01]",0.207352,6.388,0.851158
controlled vs. reference TP-random position-fixed,-22.303592,838.0,two-sided,7.161424999999999e-87,"[-0.12, -0.1]",1.539094,1.083e+83,1.0
controlled vs. reference TP-structured,-14.362819,798.0,two-sided,8.905879999999999e-42,"[-0.11, -0.08]",1.015605,2.046e+38,1.0
controlled vs. random baseline TP-random position-random,-2.105103,838.0,two-sided,0.03557944,"[-0.0, -0.0]",0.145266,0.678,0.556771
controlled vs. random baseline TP-random position-fixed,-26.016093,838.0,two-sided,7.592947e-110,"[-0.24, -0.21]",1.795281,7.651e+105,1.0
controlled vs. random baseline TP-structured,-44.081092,404.209421,two-sided,1.725366e-156,"[-0.33, -0.3]",3.105423,8.396e+211,1.0
reference vs. random baseline TP-random position-random,-4.941127,838.0,two-sided,9.379209e-07,"[-0.01, -0.0]",0.34097,10760.0,0.998537
reference vs. random baseline TP-random position-fixed,-11.682956,838.0,two-sided,2.541858e-29,"[-0.14, -0.1]",0.806201,1.055e+26,1.0
reference vs. random baseline TP-structured,-22.73676,794.958295,two-sided,1.5012619999999999e-88,"[-0.24, -0.2]",1.605392,6.536e+84,1.0
controlled vs. reference TP-random position-random,1.415171,838.0,two-sided,0.1573898,"[-0.0, 0.0]",0.097656,0.206,0.292762


## Example arpac Lexicon From Appendix