# Compare Lexicons

We compare the syllable streams generated from

1. controlled lexicons (ours),
2. random (uncontrolled) baseline lexicons, and
3. reference lexicons from the literature

with respect to the repetitiveness of their phoneme features.

In [2]:
# Set to True to additionally pass the generated German and English syllables to
# arpac.io.export_speech_synthesizer in the next cell (syllables_dir = results/ssml/<language>).
EXPORT_SSML = False

In [None]:
from arpac import load_phonemes, make_syllables, make_words

import os

# All artifacts produced by this cell are saved below this directory.
results_path = "results"

os.makedirs(results_path, exist_ok=True)

# German syllables: phonemes are loaded with language_control=True.
german_syllables = make_syllables(load_phonemes(language_control=True), lang="deu")
print(f"Generate German Syllables: {german_syllables}")
german_syllables.save(os.path.join(results_path, "german_syllables.json"))

# English syllables: phonemes are loaded with language_control=False.
english_syllables = make_syllables(load_phonemes(language_control=False), lang="eng")
print(f"Generate English Syllables: {english_syllables}")
english_syllables.save(os.path.join(results_path, "english_syllables.json"))

# Optional export for speech synthesis; the import is kept local so it is only
# needed when the export is actually requested.
if EXPORT_SSML:
    from arpac.io import export_speech_synthesizer
    export_speech_synthesizer(german_syllables, syllables_dir=os.path.join(results_path, "ssml", "german"))
    export_speech_synthesizer(english_syllables, syllables_dir=os.path.join(results_path, "ssml", "english"))

# Plain string literals: these messages have no placeholders, so no f-prefix is needed.
print("Generate German Words: ...")
german_words = make_words(german_syllables)
print(f"Words: {german_words}")
german_words.save(os.path.join(results_path, "german_words.json"))

# NOTE(review): the saved output of this cell ends with an AssertionError raised while
# generating the English words ("... your phonemes need the 'word_position_prob' info
# key ..."). The message suggests intersecting the phonemes with a phoneme corpus before
# building syllables and words; confirm the intended fix against the arpac API.
print("Generate English Words: ...")
english_words = make_words(english_syllables, lang="eng")
print(f"Words: {english_words}")
english_words.save(os.path.join(results_path, "english_words.json"))

Generate German Syllables: ɡaː|ɡiː|ɡyː|ɡɛː|kaː|koː|kuː|køː|kɛː|baː|... (75 elements total)
Generate English Syllables: cʔː|cɥː|cɰː|cʋː|cʍː|cjː|cwː|cɹː|cɻː|cɑː|... (2294 elements total)
Generate German Words: ...


100%|██████████| 10000/10000 [00:11<00:00, 847.89it/s]


Words: siːmoːɡaː|ʃaːhiːbyː|nuːɡɛːfyː|køːnoːvaː|siːmɛːkoː|hiːʃaːbøː|ɡaːluːfyː|ɡaːlyːfiː|doːfaːhiː|lyːçaːbøː|... (10000 elements total)
Generate English Words: ...


100%|██████████| 10000/10000 [00:52<00:00, 191.40it/s]


AssertionError: To check for phoneme position probability, your phonemes need the 'word_position_prob' info key. Before creating syllables and words from your phonemes run `phonemes = phonemes.intersection(read_phoneme_corpus())`.

In [5]:
from copy import copy
import pandas as pd
from arpac.types.base_types import Register
import numpy as np

from arpac.core.syllable import LABELS_C, LABELS_V, syllable_from_phonemes
from arpac.core.word import Word, word_overlap_matrix

# Phoneme collection that to_syllable() passes to syllable_from_phonemes().
phonemes = load_phonemes()

# Feature labels handed to syllable_from_phonemes() and stored on each lexicon's info.
# NOTE(review): presumably consonant labels followed by vowel labels, matching
# syllable_type "cv" - confirm against arpac.core.syllable.
syll_feature_labels = [LABELS_C, LABELS_V]
syllable_type = "cv"

def to_phoneme(phoneme):
    """Return ``phoneme`` unchanged (identity conversion)."""
    return phoneme

def to_syllable(syllable):
    """Convert a syllable string into a syllable object.

    A three-character syllable that does not end in the length mark "ː" is
    built from its first two characters only; the resulting object then gets
    the full original string as its ``id``. Every other syllable string is
    passed to ``syllable_from_phonemes`` as is.
    """
    needs_truncation = len(syllable) == 3 and not syllable.endswith("ː")
    if not needs_truncation:
        return syllable_from_phonemes(phonemes, syllable, syll_feature_labels)

    # Look up features for the first two characters, but keep the full string as the id.
    truncated = syllable_from_phonemes(phonemes, syllable[:2], syll_feature_labels)
    truncated.id = syllable
    return truncated

def to_word(word):
    """Build a ``Word`` from an iterable of syllable strings.

    The word id is the concatenation of the syllable ids. The word's
    "binary_features" are the syllables' "binary_features" transposed, i.e.
    entry ``k`` collects the ``k``-th feature entry of every syllable.
    """
    syllable_objs = [to_syllable(raw_syllable) for raw_syllable in word]
    joined_id = "".join(syll.id for syll in syllable_objs)
    per_syllable = [syll.info["binary_features"] for syll in syllable_objs]
    # zip(*...) transposes from "per syllable" to "per feature position".
    transposed = [list(column) for column in zip(*per_syllable)]
    return Word(
        id=joined_id,
        info={"binary_features": transposed},
        syllables=syllable_objs,
    )

def to_lexicon(lexicon):
    """Build a lexicon ``Register`` from an iterable of words (lists of syllable strings).

    The register maps word ids to ``Word`` objects. Its ``info`` receives the
    syllable feature labels, the syllable type, and two scores derived from the
    strict upper triangle of ``word_overlap_matrix``: the sum
    ("cumulative_feature_repetitiveness") and the maximum
    ("max_pairwise_feature_repetitiveness").
    """
    words_by_id = {}
    for word_obj in map(to_word, lexicon):
        words_by_id[word_obj.id] = word_obj

    register = Register(words_by_id)
    register.info.update({"syllable_feature_labels": [LABELS_C, LABELS_V],  "syllable_type": syllable_type})

    # k=1 drops the diagonal, so only entries above it (distinct word pairs) remain.
    upper_overlap = np.triu(word_overlap_matrix(register), 1)
    register.info["cumulative_feature_repetitiveness"] = upper_overlap.sum()
    register.info["max_pairwise_feature_repetitiveness"] = upper_overlap.max()
    return register

In [6]:
N_LEXICONS = 21  # number of lexicons per TP mode (passed to make_lexicons as n_lexicons)
N_WORDS_PER_LEXICON = 4  # number of words in each lexicon (passed to make_lexicons as n_words)
N_REPS = 5  # how often the lexicon is randomized to build one stream,
            # i.e. stream length in words = N_REPS * N_WORDS_PER_LEXICON
N_STREAMS_PER_INPUT = 5  # how many times make_streams is run over each set of lexicons

In [7]:
def print_stream_info(stream):
    """Print a short summary of one stream.

    Prints the "|"-joined syllable ids, followed by the "stream_tp_mode",
    "lexicon" and "rhythmicity_indexes" entries of ``stream.info``, and a
    trailing blank line.
    """
    print("Stream:", "|".join(syll.id for syll in stream))
    labelled_keys = (
        ("TP mode:", "stream_tp_mode"),
        ("Lexicon:", "lexicon"),
        ("Feature PRIs:", "rhythmicity_indexes"),
    )
    for label, key in labelled_keys:
        print(label, stream.info[key])
    print("")

### arpac Lexicons

In [None]:
from arpac import make_streams
from arpac import make_lexicons, Register, load_phonemes, load_words

print("Load words...")
# NOTE(review): the first cell saves words to results/german_words.json and
# results/english_words.json; confirm that "words.json" is the intended input here.
words = load_words("words.json")

print("Make lexicons...")
# Lexicons generated with feature control enabled.
controlled_lexicons = make_lexicons(
    words,
    n_lexicons=N_LEXICONS,
    n_words=N_WORDS_PER_LEXICON,
    control_features=True,
)

for idx, controlled_lexicon in enumerate(controlled_lexicons):
    print(idx, controlled_lexicon)

# Show the first lexicon together with its info as an example.
print("")
print("Example Lexicon:", controlled_lexicons[0])
print("Info:", controlled_lexicons[0].info)
print("")


In [None]:
# Collect the streams of N_STREAMS_PER_INPUT independent make_streams runs
# over the controlled lexicons.
controlled_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    generated = make_streams(controlled_lexicons, max_rhythmicity=None, num_repetitions=N_REPS)
    for stream in generated:
        controlled_streams.append(stream)

print_stream_info(controlled_streams[0])

len(controlled_streams)

### Random / uncontrolled lexicons (baseline)

In [None]:
# Baseline: same lexicon generation, but with feature control disabled.
random_lexicons = make_lexicons(
    words,
    n_lexicons=N_LEXICONS,
    n_words=N_WORDS_PER_LEXICON,
    control_features=False,
)

for idx, random_lexicon in enumerate(random_lexicons):
    print(idx, random_lexicon)

# Show the first lexicon together with its info as an example.
print("")
print("Example Lexicon:", random_lexicons[0])
print("Info:", random_lexicons[0].info)
print("")

In [None]:
# Collect the streams of N_STREAMS_PER_INPUT independent make_streams runs
# over the random baseline lexicons.
random_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    generated = make_streams(random_lexicons, max_rhythmicity=None, num_repetitions=N_REPS)
    for stream in generated:
        random_streams.append(stream)

print_stream_info(random_streams[0])

len(random_streams)

### Reference lexicons from the literature

In [None]:
# Reference lexicons as raw data: a list of 21 lexicons, each a list of 4 words,
# each word a list of 3 syllable strings in IPA notation. The strings are turned
# into arpac objects by to_lexicon() below.
lexicons = [
 [['pi', 'ɾu', 'ta'],
  ['ba', 'ɡo', 'li'],
  ['to', 'ku', 'da'],
  ['ɡu', 'ki', 'bo']],
 [['pa', 'be', 'la'],
  ['di', 'ne', 'ka'],
  ['lu', 'fa', 'ri'],
  ['xi', 'so', 'du']],
 [['ma', 'xu', 'pe'],
  ['xe', 'ro', 'ɡa'],
  ['de', 'mu', 'si'],
  ['fo', 'le', 'ti']],
 [['pu', 'ke', 'mi'],
  ['ra', 'fi', 'nu'],
  ['bi', 'na', 'po'],
  ['me', 'do', 'xi']],
 [['no', 'ni', 'xe'],
  ['bu', 'lo', 'te'],
  ['re', 'mo', 'fu'],
  ['ko', 'tu', 'sa']],
 [['mi', 'lo', 'de'],
  ['da', 'le', 'bu'],
  ['no', 'ru', 'pa'],
  ['ka', 'te', 'xi']],
 [['ne', 'do', 'li'],
  ['ri', 'fo', 'nu'],
  ['ba', 'to', 'ɡu'],
  ['ki', 'ra', 'pu']],
 [['ɡo', 'na', 'be'],
  ['mu', 'di', 'la'],
  ['ro', 'ni', 'xe'],
  ['pi', 'ku', 'sa']],
 [['fu', 'bi', 're'],
  ['xe', 'tu', 'si'],
  ['ta', 'fi', 'ko'],
  ['ke', 'ma', 'po']],
 [['ti', 'fa', 'xu'],
  ['so', 'du', 'xi'],
  ['me', 'lu', 'bo'],
  ['ɡa', 'ni', 'pe']],
 [['mi', 'po', 'la'],
  ['za', 'bɛ', 'tu'],
  ['ʁo', 'ki', 'sɛ'],
  ['nu', 'ɡa', 'di']],
 [['dɛ', 'mʊ', 'ri'],
  ['sɛ', 'ni', 'ɡɛ'],
  ['ræ', 'ku', 'səʊ'],
  ['pi', 'lɛ', 'ru']],
 [['ki', 'fəʊ', 'bu'],
  ['lu', 'fɑ', 'ɡi'],
  ['pæ', 'beɪ', 'lɑ'],
  ['tɑ', 'ɡəʊ', 'fʊ']],
 [['bi', 'du', 'pɛ'],
  ['məʊ', 'bɑ', 'li'],
  ['rɛ', 'ɡæ', 'tʊ'],
  ['sæ', 'tɛ', 'kəʊ']],
 [['bəʊ', 'dɑ', 'mɛ'],
  ['fi', 'nəʊ', 'pɑ'],
  ['ɡʊ', 'rɑ', 'təʊ'],
  ['ləʊ', 'kæ', 'neɪ']],
 [['fɛ', 'si', 'nɑ'],
  ['kɛ', 'su', 'dəʊ'],
  ['mæ', 'pʊ', 'di'],
  ['ti', 'mi', 'nu']],
 [['tu', 'pi', 'ɹoʊ'],
  ['ɡoʊ', 'la', 'bu'],
  ['pa', 'doʊ', 'ti'],
  ['bi', 'da', 'ku']],
 [['meɪ', 'lu', 'ɡi'],
  ['ɹa', 'fi', 'nu'],
  ['pu', 'keɪ', 'mi'],
  ['toʊ', 'na', 'poʊ']],
 [['ɡoʊ', 'la', 'tu'],
  ['da', 'ɹoʊ', 'pi'],
  ['ti', 'bu', 'doʊ'],
  ['pa', 'bi', 'ku']],
 [['poʊ', 'fi', 'mu'],
  ['noʊ', 'vu', 'ka'],
  ['vi', 'koʊ', 'ɡa'],
  ['ba', 'fu', 'ɡi']],
 [['ma', 'nu', 'toʊ'],
  ['ni', 'moʊ', 'lu'],
  ['voʊ', 'ɹi', 'fa'],
  ['li', 'du', 'ɹa']]]

# Convert every raw reference lexicon into a lexicon Register.
ref_lexicons = [to_lexicon(raw_lexicon) for raw_lexicon in lexicons]

for idx, ref_lexicon in enumerate(ref_lexicons):
    print(idx, ref_lexicon)

# Show the first lexicon together with its info as an example.
print("")
print("Example Lexicon:", ref_lexicons[0])
print("Info:", ref_lexicons[0].info)
print("")

In [None]:
# Collect the streams of N_STREAMS_PER_INPUT independent make_streams runs
# over the reference lexicons.
ref_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    generated = make_streams(ref_lexicons, max_rhythmicity=None, num_repetitions=N_REPS)
    for stream in generated:
        ref_streams.append(stream)

print_stream_info(ref_streams[0])

len(ref_streams)

### Collect Results

We collect all stream generation results and their feature repetitiveness scores in a dataframe.

In [None]:
import pandas as pd

# Long-format table: one row per (stream, feature) plus one extra "max" row per
# stream holding the largest PRI over all of that stream's features.
data = {"Control": [], "Lexicon": [], "Feature": [], "PRI": [], "Stream TP mode": [], "Stream": []}

# Maps the stream_tp_mode stored on each stream to the label used in the dataframe.
mode_to_mode = {
    "random": "TP-random position-random",
    "word_structured": "TP-structured",
    "position_controlled": "TP-random position-fixed"
}

stream_groups = [
    ("Controlled lexicons (arpac)", controlled_streams),
    ("Reference lexicons (Literature)", ref_streams),
    ("Random lexicons (Baseline)", random_streams),
]

for control, streams in stream_groups:
    for stream in streams:
        pris = stream.info["rhythmicity_indexes"]
        # These values are the same for every row of a stream, so compute them once.
        lexicon_label = str(stream.info["lexicon"])
        tp_mode_label = mode_to_mode[stream.info["stream_tp_mode"]]
        stream_label = "|".join(syll.id for syll in stream)

        # Per-feature rows first, then the summary "max" row (same row order as before).
        for feature, pri in [*pris.items(), ("max", max(pris.values()))]:
            data["Feature"].append(feature)
            data["PRI"].append(pri)
            data["Control"].append(control)
            data["Lexicon"].append(lexicon_label)
            data["Stream TP mode"].append(tp_mode_label)
            data["Stream"].append(stream_label)

df = pd.DataFrame(data).sort_values(["Control", "Lexicon", "Stream TP mode"]).reset_index(drop=True)

import os
os.makedirs("results/", exist_ok=True)
df.to_csv("results/full_dataset.csv")

df

In [None]:
# Lexicon and stream generation is randomized, so every run of the notebook yields different data.
# To work with the exact data from the publication instead, uncomment the line below.
# NOTE(review): the previous cell writes its dataset to "results/full_dataset.csv";
# confirm that the path below points at the published dataset.
# df = pd.read_csv("full_dataset.csv")

# One row per unique (Control, Lexicon) combination, saved as an overview of all lexicons.
df_lexicons = df[["Control", "Lexicon"]].drop_duplicates().reset_index(drop=True)
df_lexicons.to_csv("results/all_lexicons.csv")
df_lexicons

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from pingouin import ttest

# There is always randomness in the generation of the lexicons etc., so if you want the exact data from the publication uncomment below 
# df = pd.read_csv("full_dataset.csv")

# Streams are compared separately for each TP mode.
tp_modes_pretty = ["TP-random position-random", "TP-random position-fixed", "TP-structured"]

# (row label prefix, first group, second group). Each test is one-sided
# (alternative="less"): the first group is tested for lower PRIs than the second.
comparisons = [
    ("controlled vs. reference", "Controlled lexicons (arpac)", "Reference lexicons (Literature)"),
    ("controlled vs. random baseline", "Controlled lexicons (arpac)", "Random lexicons (Baseline)"),
    ("reference vs. random baseline", "Reference lexicons (Literature)", "Random lexicons (Baseline)"),
]

# Only the per-stream maximum PRI (Feature == "max") enters the tests.
max_pri = df[df["Feature"] == "max"]

dfs = []
for group_idx, (label, control_a, control_b) in enumerate(comparisons):
    if group_idx > 0:
        # Blank line between comparison groups, keeping the cell's printed output unchanged.
        print("")
    for tp_mode in tp_modes_pretty:
        in_mode = max_pri[max_pri["Stream TP mode"] == tp_mode]
        cat1 = in_mode[in_mode["Control"] == control_a]["PRI"]
        cat2 = in_mode[in_mode["Control"] == control_b]["PRI"]
        this = ttest(list(cat1), list(cat2), alternative="less")
        # Replace the default row index with a descriptive label for the combined table.
        this.index = [f"{label} {tp_mode}"]
        dfs.append(this)

ttest_df = pd.concat(dfs)

display(ttest_df)

ttest_df.to_csv("results/ttest_results.csv")

## Example arpac Lexicon From Appendix

In [None]:
# The example lexicon from the appendix: 4 words of 3 syllables each.
example_arpac_lexicon = to_lexicon([
    ["heː", "doː", "faː"],
    ["riː", "foː", "ɡyː"],
    ["ʃuː", "hiː", "boː"],
    ["vaː", "kuː", "niː"],
])
print("Example Lexicon:", example_arpac_lexicon)

streams = make_streams([example_arpac_lexicon], max_rhythmicity=None, num_repetitions=N_REPS)

# Print a summary of every stream generated from the example lexicon.
for stream in streams:
    print_stream_info(stream)