# Compare Lexicons

We compare the streams generated with 

1. controlled lexicons (ours),
2. random baseline streams, and
3. streams generated based on reference lexicons from the literature

in terms of the repetitiveness of their phoneme features.

### Collect Results

We collect all stream generation results and their feature repetitiveness scores in a dataframe. To generate the data, first run:
```bash
./generate_all_datasets.sh
```

In [1]:
import glob
import pandas as pd
from arpac import load_streams

def _latest_streams(run_pattern):
    """Load the streams of the last results directory matching ``run_pattern``.

    ``glob.glob`` returns its matches in an order that depends on the file
    system, so the candidates are sorted before the last one is taken;
    otherwise the selected run could differ between machines or executions.

    NOTE(review): this assumes the directory suffix matched by ``*`` sorts
    chronologically (e.g. a timestamp) -- confirm against the generator script.

    Raises:
        FileNotFoundError: if no directory matches ``run_pattern``.
    """
    run_dirs = sorted(glob.glob(run_pattern))
    if not run_dirs:
        raise FileNotFoundError(
            f"No results directory matches {run_pattern!r}; "
            "run ./generate_all_datasets.sh first."
        )
    return load_streams(run_dirs[-1] + "/_arpac/streams.json")


# (control label, streams) pairs. Both random baselines (German and English)
# intentionally share the "ALPARC-RND" label.
all_streams = [
    ("ALPARC-DEU", _latest_streams("results/default_german_*")),
    ("ALPARC-DEU-3w", _latest_streams("results/default_3words_no_rnd_*")),
    ("ALPARC-DEU-5w", _latest_streams("results/default_5words_no_rnd_*")),
    ("ALPARC-DEU-2s", _latest_streams("results/default_2syllables_no_rnd_*")),
    ("ALPARC-DEU-4s", _latest_streams("results/default_4syllables_no_rnd_*")),
    ("ALPARC-ENG", _latest_streams("results/default_english_*")),
    ("ALPARC-RND", _latest_streams("results/random_german_*")),
    ("ALPARC-RND", _latest_streams("results/random_english_*")),
    ("BENCHMARK", _latest_streams("results/literature_streams_*")),
]

In [None]:
import os

# Long-format table: one row per (stream, feature), plus one extra "max" row
# per stream that holds the largest PRI over all of that stream's features.
data = {"Control": [], "Lexicon": [], "Cumulative feature overlap": [], "Feature": [], "PRI": [], "Stream TP mode": [], "Stream": []}

# Map the stream generator's TP-mode identifiers to the labels used in the
# tables: TP-uniform position-random; TP-uniform position-fixed and TP-structured
mode_to_mode = {
    "random": "TP-uniform position-random",
    "word_structured": "TP-structured",
    "position_controlled": "TP-uniform position-fixed"
}

for control, streams in all_streams:
    for stream in streams:
        pri_by_feature = stream.info["rhythmicity_indexes"]

        # These values are the same for every row of a stream, so compute them
        # once instead of once per feature.
        lexicon = str(stream.info["lexicon"])
        tp_mode = mode_to_mode[stream.info["stream_tp_mode"]]
        stream_repr = "|".join(syll.id for syll in stream)
        feature_overlap = stream.info["lexicon_info"]["cumulative_feature_repetitiveness"]

        # Per-feature rows first, then the summary "max" row (same order as before).
        feature_rows = list(pri_by_feature.items())
        feature_rows.append(("max", max(pri_by_feature.values())))

        for feature, pri in feature_rows:
            data["Feature"].append(feature)
            data["PRI"].append(pri)
            data["Control"].append(control)
            data["Lexicon"].append(lexicon)
            data["Stream TP mode"].append(tp_mode)
            data["Stream"].append(stream_repr)
            data["Cumulative feature overlap"].append(feature_overlap)

df = pd.DataFrame(data)

# Make "Control" a categorical with an explicit category order so that sorting
# follows the presentation order below rather than alphabetical order.
control_order = [
    "ALPARC-DEU",
    "ALPARC-DEU-3w",
    "ALPARC-DEU-5w",
    "ALPARC-DEU-2s",
    "ALPARC-DEU-4s",
    "ALPARC-ENG",
    "ALPARC-RND",
    "BENCHMARK",
]
df["Control"] = df["Control"].astype("category").cat.set_categories(control_order)

df = df.sort_values(["Control", "Lexicon", "Stream TP mode"]).reset_index(drop=True)

os.makedirs("results/", exist_ok=True)
df.to_csv("results/analysis_full_dataset.csv")

df

Unnamed: 0,Control,Lexicon,Cumulative feature overlap,Feature,PRI,Stream TP mode,Stream
0,ALPARC-DEU,biːçaːroː|fyːløːkuː|ʃaːhøːpoː|huːpiːzɛː,2,phon_1_son,0.121849,TP-structured,fyː|løː|kuː|huː|piː|zɛː|ʃaː|høː|poː|biː|çaː|ro...
1,ALPARC-DEU,biːçaːroː|fyːløːkuː|ʃaːhøːpoː|huːpiːzɛː,2,phon_1_back,0.000000,TP-structured,fyː|løː|kuː|huː|piː|zɛː|ʃaː|høː|poː|biː|çaː|ro...
2,ALPARC-DEU,biːçaːroː|fyːløːkuː|ʃaːhøːpoː|huːpiːzɛː,2,phon_1_hi,0.000000,TP-structured,fyː|løː|kuː|huː|piː|zɛː|ʃaː|høː|poː|biː|çaː|ro...
3,ALPARC-DEU,biːçaːroː|fyːløːkuː|ʃaːhøːpoː|huːpiːzɛː,2,phon_1_lab,0.088235,TP-structured,fyː|løː|kuː|huː|piː|zɛː|ʃaː|høː|poː|biː|çaː|ro...
4,ALPARC-DEU,biːçaːroː|fyːløːkuː|ʃaːhøːpoː|huːpiːzɛː,2,phon_1_cor,0.096639,TP-structured,fyː|løː|kuː|huː|piː|zɛː|ʃaː|høː|poː|biː|çaː|ro...
...,...,...,...,...,...,...,...
74806,BENCHMARK,ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_1_voi,0.039216,TP-uniform position-random,la|da|pa|ɹoʊ|ɡoʊ|pi|tu|doʊ|bi|bu|ku|ti|ɡoʊ|bu|...
74807,BENCHMARK,ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_2_back,0.007003,TP-uniform position-random,la|da|pa|ɹoʊ|ɡoʊ|pi|tu|doʊ|bi|bu|ku|ti|ɡoʊ|bu|...
74808,BENCHMARK,ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_2_hi,0.029412,TP-uniform position-random,la|da|pa|ɹoʊ|ɡoʊ|pi|tu|doʊ|bi|bu|ku|ti|ɡoʊ|bu|...
74809,BENCHMARK,ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,4,phon_2_lo,0.049020,TP-uniform position-random,la|da|pa|ɹoʊ|ɡoʊ|pi|tu|doʊ|bi|bu|ku|ti|ɡoʊ|bu|...


In [11]:
# One row per distinct (Control, Lexicon) combination found in the full
# dataset, re-indexed from 0, saved to CSV and shown as the cell output.
lexicon_key_columns = ["Control", "Lexicon"]
unique_lexicons = df[lexicon_key_columns].drop_duplicates()
df_lexicons = unique_lexicons.reset_index(drop=True)
df_lexicons.to_csv("results/analysis_all_lexicons.csv")
df_lexicons

Unnamed: 0,Control,Lexicon
0,ALPARC-DEU,biːçaːroː|fyːløːkuː|ʃaːhøːpoː|huːpiːzɛː
1,ALPARC-DEU,boːhiːseː|laːfoːɡɛː|heːʃoːbøː|peːsaːhuː
2,ALPARC-DEU,hiːʃoːpeː|boːçaːluː|ʃaːhøːpoː|nøːfoːkaː
3,ALPARC-DEU,høːsuːpoː|fuːkøːraː|ɡaːmeːzɛː|ʃøːpaːhuː
4,ALPARC-DEU,kaːmyːʃiː|ryːfuːɡiː|buːzyːhoː|huːseːboː
...,...,...
182,BENCHMARK,pukemi|rafinu|binapo|medoxi
183,BENCHMARK,tifaxu|soduxi|melubo|ɡanipe
184,BENCHMARK,tupiɹoʊ|ɡoʊlabu|padoʊti|bidaku
185,BENCHMARK,ɡonabe|mudila|ronixe|pikusa


In [34]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from pingouin import ttest
import itertools

# Pairwise two-sided t-tests on the per-stream "max" PRI, run separately for
# each stream TP mode and for every pair of the controls listed below.
tp_modes = ["TP-uniform position-random", "TP-uniform position-fixed", "TP-structured"]
compared_controls = [
    "ALPARC-DEU",
    "ALPARC-ENG",
    "ALPARC-RND",
    "BENCHMARK",
]
dfs = []

for tp_mode in tp_modes:
    # The filter depends only on the TP mode, so build it once per mode
    # rather than once per control pair.
    max_pri_rows = df[(df["Stream TP mode"] == tp_mode) & (df["Feature"] == "max")]
    for one, two in itertools.combinations(compared_controls, 2):
        pri_one = max_pri_rows[max_pri_rows["Control"] == one]["PRI"]
        pri_two = max_pri_rows[max_pri_rows["Control"] == two]["PRI"]
        print(tp_mode, one, two)
        result = ttest(list(pri_one), list(pri_two), alternative="two-sided")
        # Replace the result's index with (TP mode, comparison) so that the
        # concatenated table below is labelled per comparison.
        result.index = pd.MultiIndex.from_tuples([(tp_mode, f"{one} vs. {two}")], names=["Stream TP mode", "Controls"])
        dfs.append(result)

ttest_df = pd.concat(dfs).rename({"dof": "df"}, axis=1)

display(ttest_df)

ttest_df.to_csv("results/table_1.csv", index=True)

TP-uniform position-random ALPARC-DEU ALPARC-ENG
TP-uniform position-random ALPARC-DEU ALPARC-RND
TP-uniform position-random ALPARC-DEU BENCHMARK
TP-uniform position-random ALPARC-ENG ALPARC-RND
TP-uniform position-random ALPARC-ENG BENCHMARK
TP-uniform position-random ALPARC-RND BENCHMARK
TP-uniform position-fixed ALPARC-DEU ALPARC-ENG
TP-uniform position-fixed ALPARC-DEU ALPARC-RND
TP-uniform position-fixed ALPARC-DEU BENCHMARK
TP-uniform position-fixed ALPARC-ENG ALPARC-RND
TP-uniform position-fixed ALPARC-ENG BENCHMARK
TP-uniform position-fixed ALPARC-RND BENCHMARK
TP-structured ALPARC-DEU ALPARC-ENG
TP-structured ALPARC-DEU ALPARC-RND
TP-structured ALPARC-DEU BENCHMARK
TP-structured ALPARC-ENG ALPARC-RND
TP-structured ALPARC-ENG BENCHMARK
TP-structured ALPARC-RND BENCHMARK


Unnamed: 0_level_0,Unnamed: 1_level_0,T,df,alternative,p-val,CI95%,cohen-d,BF10,power
Stream TP mode,Controls,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TP-uniform position-random,ALPARC-DEU vs. ALPARC-ENG,-2.454391,418.0,two-sided,0.01451915,"[-0.0, -0.0]",0.239524,1.959,0.687508
TP-uniform position-random,ALPARC-DEU vs. ALPARC-RND,-1.150183,449.286552,two-sided,0.2506802,"[-0.0, 0.0]",0.094636,0.179,0.200957
TP-uniform position-random,ALPARC-DEU vs. BENCHMARK,4.319465,418.0,two-sided,1.956037e-05,"[0.0, 0.01]",0.421537,763.645,0.990602
TP-uniform position-random,ALPARC-ENG vs. ALPARC-RND,1.654204,400.208618,two-sided,0.09887012,"[-0.0, 0.0]",0.142135,0.356,0.389582
TP-uniform position-random,ALPARC-ENG vs. BENCHMARK,6.39427,418.0,two-sided,4.320744e-10,"[0.01, 0.01]",0.624017,19310000.0,0.999995
TP-uniform position-random,ALPARC-RND vs. BENCHMARK,5.854505,408.217566,two-sided,9.84272e-09,"[0.0, 0.01]",0.499277,1163000.0,0.999959
TP-uniform position-fixed,ALPARC-DEU vs. ALPARC-ENG,-2.142001,418.0,two-sided,0.03277069,"[-0.01, -0.0]",0.209038,0.985,0.570311
TP-uniform position-fixed,ALPARC-DEU vs. ALPARC-RND,-22.306127,429.681158,two-sided,8.961254e-74,"[-0.36, -0.31]",1.338898,5.893e+77,1.0
TP-uniform position-fixed,ALPARC-DEU vs. BENCHMARK,-16.904015,418.0,two-sided,3.171439e-49,"[-0.13, -0.1]",1.649663,4.301e+45,1.0
TP-uniform position-fixed,ALPARC-ENG vs. ALPARC-RND,-21.931067,431.303307,two-sided,3.644631e-72,"[-0.36, -0.3]",1.31734,5.6869999999999995e+75,1.0
