# Compare Lexicons

We compare the syllable streams generated from

1. controlled lexicons (ours),
2. random (uncontrolled) baseline lexicons, and
3. reference lexicons from the literature

in terms of the repetitiveness of their phoneme features.

In [4]:
from arc import load_phonemes, make_syllables, make_words
from arc import make_lexicons, load_words

import numpy as np 
import random

# Load the words stored in the submission data; they are the input to make_lexicons() below.
words = load_words("data_submission/words.json")

In [5]:
from copy import copy
import pandas as pd
from arc.types.base_types import Register
import numpy as np

from arc.core.syllable import LABELS_C, LABELS_V, syllable_from_phonemes
from arc.core.word import Word, word_overlap_matrix

# Phoneme inventory that to_syllable() passes to syllable_from_phonemes().
phonemes = load_phonemes()

# Feature labels handed to syllable_from_phonemes(); presumably consonant labels
# first, vowel labels second, matching the "cv" syllable type — TODO confirm.
syll_feature_labels = [LABELS_C, LABELS_V]
syllable_type = "cv"

def to_phoneme(phoneme):
    """Return ``phoneme`` unchanged (identity conversion)."""
    return phoneme

def to_syllable(syllable):
    """Build a syllable object from a syllable string.

    Strings of exactly three characters that do not end in the length mark
    "ː" (presumably diphthong syllables such as "səʊ" — TODO confirm) are
    built from their first two characters only; the resulting object's ``id``
    is then overwritten with the full original string. Every other string is
    passed to ``syllable_from_phonemes`` as is.
    """
    needs_truncation = len(syllable) == 3 and not syllable.endswith("ː")
    if not needs_truncation:
        return syllable_from_phonemes(phonemes, syllable, syll_feature_labels)

    # Features come from the first two characters; keep the full string as id.
    truncated = syllable_from_phonemes(phonemes, syllable[:2], syll_feature_labels)
    truncated.id = syllable
    return truncated

def to_word(word):
    """Build a ``Word`` from an iterable of syllable strings.

    Each syllable string is converted with ``to_syllable``. The word id is the
    concatenation of the syllable ids, and the word's "binary_features" are
    the syllables' "binary_features" transposed: entry ``k`` collects the
    ``k``-th feature entry of every syllable, in syllable order.
    """
    syllable_objs = [to_syllable(text) for text in word]
    per_syllable = [syll.info["binary_features"] for syll in syllable_objs]
    transposed = [list(column) for column in zip(*per_syllable)]
    return Word(
        id="".join(syll.id for syll in syllable_objs),
        info={"binary_features": transposed},
        syllables=syllable_objs,
    )

def to_lexicon(lexicon):
    """Build a lexicon ``Register`` from a list of words.

    Each word (an iterable of syllable strings) is converted with ``to_word``
    and stored under its id. The register's ``info`` receives the syllable
    feature labels, the syllable type, and two scores derived from the strict
    upper triangle of ``word_overlap_matrix``: its sum
    ("cumulative_feature_repetitiveness") and its maximum
    ("max_pairwise_feature_repetitiveness").
    """
    words_by_id = {}
    for raw_word in lexicon:
        word_obj = to_word(raw_word)
        words_by_id[word_obj.id] = word_obj

    register = Register(words_by_id)
    register.info.update({
        "syllable_feature_labels": [LABELS_C, LABELS_V],
        "syllable_type": syllable_type,
    })

    # k=1 drops the diagonal, so each word pair is counted exactly once.
    upper = np.triu(word_overlap_matrix(register), 1)
    register.info["cumulative_feature_repetitiveness"] = upper.sum()
    register.info["max_pairwise_feature_repetitiveness"] = upper.max()
    return register

In [6]:
N_LEXICONS = 21  # number of lexicons requested from make_lexicons (n_lexicons), for the controlled and the random set alike
N_WORDS_PER_LEXICON = 4  # number of words per generated lexicon (n_words)
N_REPS = 5  # passed to make_streams as num_repetitions: how often to randomize the lexicon to build the total stream
            # NOTE(review): previously described as "stream length = N_REPS*N_WORDS_PER_LEXICON words", but the streams
            # printed below look longer than that — confirm the exact meaning against make_streams
N_STREAMS_PER_INPUT = 5  # how many times make_streams is run on each set of lexicons

In [7]:
def print_stream_info(stream):
    """Print a stream's syllables and selected ``info`` entries.

    Prints the "|"-joined syllable ids, then the TP mode, the lexicon and the
    feature PRIs read from ``stream.info``, followed by an empty line.
    """
    syllable_ids = (syll.id for syll in stream)
    print("Stream:", "|".join(syllable_ids))

    info = stream.info
    labelled_keys = (
        ("TP mode:", "stream_tp_mode"),
        ("Lexicon:", "lexicon"),
        ("Feature PRIs:", "rhythmicity_indexes"),
    )
    for label, key in labelled_keys:
        print(label, info[key])
    print("")

### ARC Lexicons

In [8]:
from arc import make_streams
from arc import make_lexicons, Register, load_phonemes, load_words

print("Load words...")
words = load_words("data_submission/words.json")

print("Make lexicons...")
# control_features=True requests the feature-controlled lexicons (the random
# baseline further below uses control_features=False).
controlled_lexicons = make_lexicons(words, n_lexicons=N_LEXICONS, n_words=N_WORDS_PER_LEXICON, control_features=True)

# List every generated lexicon with its index.
for i, lex in enumerate(controlled_lexicons):
    print(i, lex)

# Show the first lexicon together with its info dictionary as an example.
print("")
print("Example Lexicon:", controlled_lexicons[0])
print("Info:", controlled_lexicons[0].info)
print("")


Load words...
Make lexicons...


Increasing allowed overlaps: MAX_PAIRWISE_OVERLAP=1, MAX_CUMULATIVE_OVERLAP=1
Increasing allowed overlaps: MAX_PAIRWISE_OVERLAP=1, MAX_CUMULATIVE_OVERLAP=2


0 hiːboːzyː|kuːnyːfoː|vaːtyːhøː|ʃoːkaːmɛː
1 peːhoːʃaː|kaːlyːfuː|zuːpoːhøː|hiːboːzyː
2 hiːboːzyː|ʃoːmeːɡaː|luːkɛːfoː|vaːdeːhoː
3 hiːboːzyː|vaːtoːhøː|peːhuːʃoː|tyːhoːfaː
4 huːpoːʃøː|tɛːheːvaː|ʃaːbyːhiː|fiːloːɡyː
5 ʃoːmeːɡaː|vaːtoːhuː|moːɡiːʃøː|kuːraːfoː
6 hiːboːzyː|ʃeːhoːpaː|vaːnuːɡiː|zuːpoːhøː
7 vaːnuːɡiː|ʃeːhoːbyː|zuːpoːhøː|heːbiːʃoː
8 ʃiːbeːhøː|huːfiːdoː|hiːtuːfoː|faːroːɡyː
9 kaːlyːfuː|huːfiːdoː|poːʃuːhøː|niːkoːvaː
10 hiːboːzyː|vaːdeːhoː|reːkoːfaː|kuːraːfoː
11 ʃuːhoːbøː|reːfyːkoː|heːdoːfiː|faːɡiːryː
12 foːkaːnuː|hiːbyːʃoː|ɡaːluːfyː|zuːpoːhøː
13 fyːhoːtiː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː
14 ɡaːmoːzɛː|huːdoːfaː|reːvaːkɛː|poːʃuːhøː
15 peːhoːʃaː|hiːboːzyː|ʃoːkaːmɛː|kuːnɛːfoː
16 nyːfuːɡiː|hiːtuːfoː|vaːkoːryː|tyːhoːfaː
17 hiːboːzyː|huːdoːfaː|ʃaːmoːɡɛː|poːʃuːhøː
18 beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː
19 ɡaːmoːzɛː|ruːfyːɡiː|hiːtoːfaː|fiːkaːryː
20 ʃuːhoːpeː|vaːkoːryː|heːpoːzɛː|foːniːkaː

Example Lexicon: hiːboːzyː|kuːnyːfoː|vaːtyːhøː|ʃoːkaːmɛː
Info: {'phoneme_feature_labels': ['syl', 'son', '

In [9]:
# Collect the streams of N_STREAMS_PER_INPUT repeated make_streams runs over
# the controlled lexicons in a single register.
controlled_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    # max_rhythmicity=None: presumably no rhythmicity threshold is applied — TODO confirm
    for stream in make_streams(
        controlled_lexicons, 
        max_rhythmicity=None, 
        num_repetitions=N_REPS
        ):
        controlled_streams.append(stream)

# Show one stream as an example, then the total number of collected streams.
print_stream_info(controlled_streams[0])

len(controlled_streams)

Stream: boː|hiː|kuː|kaː|foː|nyː|zyː|mɛː|tyː|høː|vaː|ʃoː|nyː|ʃoː|foː|vaː|kuː|hiː|tyː|kaː|høː|mɛː|boː|zyː|foː|høː|boː|kuː|tyː|mɛː|hiː|nyː|kaː|ʃoː|zyː|vaː|zyː|høː|kuː|mɛː|nyː|foː|boː|vaː|kaː|hiː|ʃoː|tyː|nyː|tyː|hiː|høː|foː|zyː|kaː|kuː|vaː|boː|mɛː|ʃoː|vaː|hiː|mɛː|foː|kaː|boː|ʃoː|kuː|nyː|høː|zyː|tyː|kuː|boː|høː|tyː|vaː|mɛː|zyː|nyː|hiː|foː|ʃoː|kaː|mɛː|høː|nyː|vaː|foː|kuː|ʃoː|boː|kaː|tyː|zyː|hiː|kaː|nyː|boː|tyː|foː|mɛː|kuː|zyː|ʃoː|hiː|vaː|høː|kaː|zyː|kuː|foː|tyː|ʃoː|høː|hiː|boː|nyː|mɛː|vaː|nyː|kuː|høː|ʃoː|mɛː|kaː|vaː|tyː|boː|foː|hiː|zyː|boː|kaː|foː|mɛː|høː|zyː|kuː|ʃoː|tyː|hiː|vaː|nyː|zyː|kaː|vaː|hiː|boː|foː|ʃoː|kuː|nyː|tyː|høː|mɛː|kuː|mɛː|vaː|zyː|nyː|ʃoː|kaː|boː|tyː|foː|hiː|høː|nyː|foː|zyː|boː|ʃoː|hiː|mɛː|tyː|kuː|vaː|kaː|høː|boː|nyː|hiː|kaː|zyː|foː|høː|kuː|tyː|vaː|mɛː|ʃoː|zyː|hiː|kuː|høː|tyː|kaː|nyː|boː|mɛː|foː|vaː|ʃoː|mɛː|kaː|tyː|ʃoː|foː|boː|hiː|nyː|vaː|kuː|zyː|høː|ʃoː|høː|foː|tyː|mɛː|nyː|kaː|hiː|zyː|vaː|boː|kuː|hiː|ʃoː|vaː|høː|kaː|mɛː|boː|zyː|tyː|nyː|kuː|foː|kuː|kaː|ʃoː|boː|vaː|foː|nyː|høː|

315

### Random / uncontrolled lexicons (baseline)

In [10]:
# Baseline: same number and size of lexicons as the controlled set, but with
# control_features=False.
random_lexicons = make_lexicons(words, n_lexicons=N_LEXICONS, n_words=N_WORDS_PER_LEXICON, control_features=False)

# List every generated lexicon with its index.
for i, lex in enumerate(random_lexicons):
    print(i, lex)

# Show the first lexicon together with its info dictionary as an example.
print("")
print("Example Lexicon:", random_lexicons[0])
print("Info:", random_lexicons[0].info)
print("")

0 faːɡiːryː|riːfaːkɛː|faːdoːheː|fuːriːkaː
1 roːfuːɡiː|høːtoːfuː|roːfaːkøː|koːvaːlyː
2 vaːtiːhøː|nuːɡiːfaː|huːfoːtɛː|beːzuːhøː
3 riːfaːkøː|kɛːfoːniː|køːvaːnyː|faːniːkuː
4 ɡiːfoːryː|fyːhiːtuː|riːfoːkuː|piːzuːheː
5 beːhoːzuː|ʃeːhoːbøː|fuːriːkaː|ɡiːlyːfoː
6 tiːheːfoː|fiːlaːkøː|ryːkoːfaː|ryːfoːɡɛː
7 niːfoːkaː|ɡiːfaːlyː|kaːfoːniː|tiːheːvaː
8 vaːriːkoː|huːpoːzɛː|heːpoːʃøː|hoːfiːtɛː
9 foːhøːtiː|ɡaːreːfiː|faːdoːheː|faːhøːtuː
10 heːtyːfaː|kuːlyːfoː|tyːfaːhoː|koːnøːvaː
11 ɡiːfoːryː|fuːtyːhiː|foːheːtɛː|huːfoːtiː
12 baːzuːheː|heːtuːfiː|foːdeːhøː|reːkoːfyː
13 fyːraːkɛː|kɛːfaːnyː|piːzuːheː|foːtiːheː
14 koːfiːnøː|ɡaːlyːfuː|tiːheːfoː|fyːriːkoː
15 ɡaːryːfoː|foːtiːheː|ʃiːpoːhøː|vaːniːkuː
16 faːtiːhøː|faːdeːhuː|ɡaːryːfoː|hoːfiːtɛː
17 koːfiːnøː|hoːtyːvaː|foːkuːriː|heːtyːfaː
18 tuːfoːhøː|reːfoːɡyː|luːɡaːfoː|zuːhiːboː
19 ɡiːfoːryː|tyːfaːhoː|koːfiːlyː|fiːkuːnøː
20 reːɡaːfiː|tuːfaːheː|faːtoːhiː|faːhøːtuː

Example Lexicon: faːɡiːryː|riːfaːkɛː|faːdoːheː|fuːriːkaː
Info: {'phoneme_feature_labels': ['syl', 'son', '

In [11]:
# Collect the streams of N_STREAMS_PER_INPUT repeated make_streams runs over
# the random baseline lexicons in a single register.
random_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    # max_rhythmicity=None: presumably no rhythmicity threshold is applied — TODO confirm
    for stream in make_streams(
        random_lexicons, 
        max_rhythmicity=None, 
        num_repetitions=N_REPS
        ):
        random_streams.append(stream)
        
# Show one stream as an example, then the total number of collected streams.
print_stream_info(random_streams[0])

len(random_streams)

Stream: riː|doː|fuː|ɡiː|heː|faː|faː|ryː|riː|faː|kɛː|kaː|faː|kaː|ryː|doː|ɡiː|faː|riː|fuː|heː|kɛː|riː|faː|fuː|faː|ɡiː|ryː|faː|kɛː|heː|riː|riː|kaː|faː|doː|faː|riː|kɛː|faː|faː|heː|kaː|riː|doː|ryː|ɡiː|fuː|riː|ryː|kaː|doː|kɛː|faː|fuː|faː|ɡiː|riː|faː|heː|ɡiː|doː|riː|heː|fuː|ryː|kɛː|riː|faː|faː|kaː|faː|kɛː|fuː|doː|faː|riː|riː|heː|ryː|faː|faː|ɡiː|kaː|ɡiː|kɛː|ryː|riː|faː|faː|riː|kaː|fuː|faː|doː|heː|faː|faː|riː|ɡiː|faː|riː|fuː|kaː|kɛː|doː|heː|ryː|faː|doː|kaː|heː|faː|ryː|fuː|riː|ɡiː|riː|kɛː|faː|ryː|heː|doː|faː|fuː|kɛː|ɡiː|faː|riː|faː|kaː|riː|ryː|faː|faː|fuː|ɡiː|faː|heː|riː|kaː|kɛː|riː|doː|riː|ryː|ɡiː|kaː|heː|faː|riː|kɛː|fuː|doː|faː|faː|kaː|doː|fuː|riː|faː|riː|faː|heː|kɛː|faː|ɡiː|ryː|doː|faː|faː|kaː|ryː|heː|riː|riː|faː|kɛː|ɡiː|fuː|ryː|fuː|riː|ɡiː|doː|riː|faː|kɛː|faː|heː|kaː|faː|doː|riː|kaː|faː|faː|ɡiː|kɛː|heː|fuː|faː|riː|ryː|riː|faː|kɛː|kaː|fuː|faː|ryː|faː|heː|ɡiː|riː|doː|ryː|kɛː|riː|fuː|faː|ɡiː|riː|heː|faː|kaː|faː|doː|kɛː|ryː|kaː|riː|faː|faː|doː|ɡiː|heː|faː|riː|fuː|heː|doː|kaː|riː|riː|ɡiː|faː|ryː|

315

### Reference lexicons from the literature

In [12]:
# Reference lexicons from the literature: 21 lexicons of four words each.
# Every word is a list of three syllable strings in IPA. Three-character
# syllables without a length mark (e.g. 'səʊ', 'beɪ') are built by
# to_syllable() from their first two characters only.
lexicons = [
 [['pi', 'ɾu', 'ta'],
  ['ba', 'ɡo', 'li'],
  ['to', 'ku', 'da'],
  ['ɡu', 'ki', 'bo']],
 [['pa', 'be', 'la'],
  ['di', 'ne', 'ka'],
  ['lu', 'fa', 'ri'],
  ['xi', 'so', 'du']],
 [['ma', 'xu', 'pe'],
  ['xe', 'ro', 'ɡa'],
  ['de', 'mu', 'si'],
  ['fo', 'le', 'ti']],
 [['pu', 'ke', 'mi'],
  ['ra', 'fi', 'nu'],
  ['bi', 'na', 'po'],
  ['me', 'do', 'xi']],
 [['no', 'ni', 'xe'],
  ['bu', 'lo', 'te'],
  ['re', 'mo', 'fu'],
  ['ko', 'tu', 'sa']],
 [['mi', 'lo', 'de'],
  ['da', 'le', 'bu'],
  ['no', 'ru', 'pa'],
  ['ka', 'te', 'xi']],
 [['ne', 'do', 'li'],
  ['ri', 'fo', 'nu'],
  ['ba', 'to', 'ɡu'],
  ['ki', 'ra', 'pu']],
 [['ɡo', 'na', 'be'],
  ['mu', 'di', 'la'],
  ['ro', 'ni', 'xe'],
  ['pi', 'ku', 'sa']],
 [['fu', 'bi', 're'],
  ['xe', 'tu', 'si'],
  ['ta', 'fi', 'ko'],
  ['ke', 'ma', 'po']],
 [['ti', 'fa', 'xu'],
  ['so', 'du', 'xi'],
  ['me', 'lu', 'bo'],
  ['ɡa', 'ni', 'pe']],
 [['mi', 'po', 'la'],
  ['za', 'bɛ', 'tu'],
  ['ʁo', 'ki', 'sɛ'],
  ['nu', 'ɡa', 'di']],
 [['dɛ', 'mʊ', 'ri'],
  ['sɛ', 'ni', 'ɡɛ'],
  ['ræ', 'ku', 'səʊ'],
  ['pi', 'lɛ', 'ru']],
 [['ki', 'fəʊ', 'bu'],
  ['lu', 'fɑ', 'ɡi'],
  ['pæ', 'beɪ', 'lɑ'],
  ['tɑ', 'ɡəʊ', 'fʊ']],
 [['bi', 'du', 'pɛ'],
  ['məʊ', 'bɑ', 'li'],
  ['rɛ', 'ɡæ', 'tʊ'],
  ['sæ', 'tɛ', 'kəʊ']],
 [['bəʊ', 'dɑ', 'mɛ'],
  ['fi', 'nəʊ', 'pɑ'],
  ['ɡʊ', 'rɑ', 'təʊ'],
  ['ləʊ', 'kæ', 'neɪ']],
 [['fɛ', 'si', 'nɑ'],
  ['kɛ', 'su', 'dəʊ'],
  ['mæ', 'pʊ', 'di'],
  ['ti', 'mi', 'nu']],
 [['tu', 'pi', 'ɹoʊ'],
  ['ɡoʊ', 'la', 'bu'],
  ['pa', 'doʊ', 'ti'],
  ['bi', 'da', 'ku']],
 [['meɪ', 'lu', 'ɡi'],
  ['ɹa', 'fi', 'nu'],
  ['pu', 'keɪ', 'mi'],
  ['toʊ', 'na', 'poʊ']],
 [['ɡoʊ', 'la', 'tu'],
  ['da', 'ɹoʊ', 'pi'],
  ['ti', 'bu', 'doʊ'],
  ['pa', 'bi', 'ku']],
 [['poʊ', 'fi', 'mu'],
  ['noʊ', 'vu', 'ka'],
  ['vi', 'koʊ', 'ɡa'],
  ['ba', 'fu', 'ɡi']],
 [['ma', 'nu', 'toʊ'],
  ['ni', 'moʊ', 'lu'],
  ['voʊ', 'ɹi', 'fa'],
  ['li', 'du', 'ɹa']]]

# Convert every raw lexicon into a Register with feature-overlap info attached.
ref_lexicons = list(map(to_lexicon, lexicons))

# List every reference lexicon with its index.
for i, lex in enumerate(ref_lexicons):
    print(i, lex)

# Show the first lexicon together with its info dictionary as an example.
print("")
print("Example Lexicon:", ref_lexicons[0])
print("Info:", ref_lexicons[0].info)
print("")

0 piɾuta|baɡoli|tokuda|ɡukibo
1 pabela|dineka|lufari|xisodu
2 maxupe|xeroɡa|demusi|foleti
3 pukemi|rafinu|binapo|medoxi
4 nonixe|bulote|remofu|kotusa
5 milode|dalebu|norupa|katexi
6 nedoli|rifonu|batoɡu|kirapu
7 ɡonabe|mudila|ronixe|pikusa
8 fubire|xetusi|tafiko|kemapo
9 tifaxu|soduxi|melubo|ɡanipe
10 mipola|zabɛtu|ʁokisɛ|nuɡadi
11 dɛmʊri|sɛniɡɛ|rækusəʊ|pilɛru
12 kifəʊbu|lufɑɡi|pæbeɪlɑ|tɑɡəʊfʊ
13 bidupɛ|məʊbɑli|rɛɡætʊ|sætɛkəʊ
14 bəʊdɑmɛ|finəʊpɑ|ɡʊrɑtəʊ|ləʊkæneɪ
15 fɛsinɑ|kɛsudəʊ|mæpʊdi|timinu
16 tupiɹoʊ|ɡoʊlabu|padoʊti|bidaku
17 meɪluɡi|ɹafinu|pukeɪmi|toʊnapoʊ
18 ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku
19 poʊfimu|noʊvuka|vikoʊɡa|bafuɡi
20 manutoʊ|nimoʊlu|voʊɹifa|liduɹa

Example Lexicon: piɾuta|baɡoli|tokuda|ɡukibo
Info: {'syllable_feature_labels': [['son', 'back', 'hi', 'lab', 'cor', 'cont', 'lat', 'nas', 'voi'], ['back', 'hi', 'lo', 'lab', 'tense', 'long']], 'syllable_type': 'cv', 'cumulative_feature_repetitiveness': 5, 'max_pairwise_feature_repetitiveness': 2}



In [13]:
# Collect the streams of N_STREAMS_PER_INPUT repeated make_streams runs over
# the reference lexicons in a single register.
ref_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    # max_rhythmicity=None: presumably no rhythmicity threshold is applied — TODO confirm
    for stream in make_streams(
        ref_lexicons, 
        max_rhythmicity=None, 
        num_repetitions=N_REPS
        ):
        ref_streams.append(stream)

# Show one stream as an example, then the total number of collected streams.
print_stream_info(ref_streams[0])

len(ref_streams)

Stream: ki|ɡo|bo|ku|da|to|ta|pi|ba|ɡu|li|ɾu|bo|ɾu|ɡo|li|ku|ki|da|ɡu|ta|to|ba|pi|ɡo|to|li|pi|ki|ku|ba|bo|ɡu|ɾu|ta|da|ɾu|pi|bo|to|da|ɡo|ki|li|ba|ta|ɡu|ku|ɡu|ba|li|da|bo|pi|ku|ɾu|to|ɡo|ta|ki|ba|ɡo|ɾu|li|to|ɡu|bo|ki|ta|ku|pi|da|pi|li|bo|ta|ɾu|da|ba|ku|to|ki|ɡu|ɡo|pi|to|ku|li|ɡo|ɡu|da|ta|ba|ɾu|ki|bo|ɡo|da|li|ɡu|pi|ta|bo|ba|ki|to|ɾu|ku|ta|ɡo|ba|da|ku|bo|li|ki|ɾu|ɡu|to|pi|ɾu|ba|to|bo|da|ki|pi|ɡu|ɡo|ku|li|ta|li|bo|ɡu|ki|ta|pi|ba|ku|ɡo|to|da|ɾu|ɡo|bo|li|da|pi|ɾu|ta|ku|to|ba|ɡu|ki|ku|ta|da|ɡu|ba|li|ɾu|to|pi|ɡo|ki|bo|pi|li|ba|bo|ɾu|da|to|ɡo|ku|ki|ɡu|ta|bo|ba|ɡo|ta|li|ɡu|ɾu|pi|to|ku|da|ki|to|bo|da|li|ɡo|ɡu|pi|ta|ba|ki|ɾu|ku|ɡu|ku|ɾu|ba|to|li|ki|ɡo|pi|da|bo|ta|ɡo|ɾu|bo|to|ki|pi|ku|ba|da|ta|ɡu|li|pi|ki|ba|ɾu|li|ta|to|ɡu|bo|ɡo|da|ku|bo|ku|ɡo|ba|ta|ki|li|to|ɾu|ɡu|da|pi|ɡu|to|ta|ɾu|ki|da|ɡo|li|ku|pi|bo|ba|pi|ɾu|ɡu|da|ba|to|ta|bo|ki|ku|ɡo|li|da|ku|ɾu|bo|pi|ɡu|li|ta|ɡo|ba|ki|to|ki|pi|bo|ɡu|ɾu|ɡo|ta|da|li|ba|ku|to|li|pi|da|ɾu|to|ɡu|ki|bo|ta|ku|ba|ɡo|bo|li|ku|da|ɡu|pi|ɡo|to|ba|ta|ɾu|ki|ɾu|ta|ki|ɡu|ɡo|da|to

315

### Collect Results

We collect all stream generation results and their feature repetitiveness scores in a dataframe.

In [14]:
import pandas as pd

# Long-format table: one row per (stream, feature), plus one extra "max" row
# per stream holding the largest PRI over all of that stream's features.
data = {"Control": [], "Lexicon": [], "Feature": [], "PRI": [], "Stream TP mode": [], "Stream": []}

# Internal TP-mode names -> labels used in the results table.
mode_to_mode = {  # TP-random position-random; TP-random position-fixed and TP-structured
    "random": "TP-random position-random",
    "word_structured": "TP-structured",
    "position_controlled": "TP-random position-fixed"
}

stream_sets = [
    ("Controlled lexicons (ARC)", controlled_streams),
    ("Reference lexicons (Literature)", ref_streams),
    ("Random lexicons (Baseline)", random_streams),
]

for control, streams in stream_sets:
    for stream in streams:
        # These values are the same for every feature row of a stream, so they
        # are computed once per stream rather than once per row.
        pris = stream.info["rhythmicity_indexes"]
        lexicon_label = str(stream.info["lexicon"])
        tp_mode_label = mode_to_mode[stream.info["stream_tp_mode"]]
        stream_label = "|".join(syll.id for syll in stream)

        # Per-feature PRIs in their original order, then the "max" summary row.
        feature_rows = list(pris.items())
        feature_rows.append(("max", max(pris.values())))

        for feature, pri in feature_rows:
            data["Feature"].append(feature)
            data["PRI"].append(pri)
            data["Control"].append(control)
            data["Lexicon"].append(lexicon_label)
            data["Stream TP mode"].append(tp_mode_label)
            data["Stream"].append(stream_label)

df = pd.DataFrame(data).sort_values(["Control", "Lexicon", "Stream TP mode"]).reset_index(drop=True)

import os
os.makedirs("results/", exist_ok=True)
df.to_csv("results/full_dataset.csv")

df

Unnamed: 0,Control,Lexicon,Feature,PRI,Stream TP mode,Stream
0,Controlled lexicons (ARC),beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː,phon_1_son,0.054622,TP-random position-fixed,kaː|fuː|poː|huː|hoː|zɛː|ʃoː|zyː|ɡaː|beː|meː|re...
1,Controlled lexicons (ARC),beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː,phon_1_back,0.021008,TP-random position-fixed,kaː|fuː|poː|huː|hoː|zɛː|ʃoː|zyː|ɡaː|beː|meː|re...
2,Controlled lexicons (ARC),beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː,phon_1_hi,0.021008,TP-random position-fixed,kaː|fuː|poː|huː|hoː|zɛː|ʃoː|zyː|ɡaː|beː|meː|re...
3,Controlled lexicons (ARC),beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː,phon_1_lab,0.035014,TP-random position-fixed,kaː|fuː|poː|huː|hoː|zɛː|ʃoː|zyː|ɡaː|beː|meː|re...
4,Controlled lexicons (ARC),beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː,phon_1_cor,0.068627,TP-random position-fixed,kaː|fuː|poː|huː|hoː|zɛː|ʃoː|zyː|ɡaː|beː|meː|re...
...,...,...,...,...,...,...
15115,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_lo,0.142857,TP-structured,ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|pi|ɡoʊ|la|tu|da|ɹoʊ|...
15116,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_lab,0.140056,TP-structured,ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|pi|ɡoʊ|la|tu|da|ɹoʊ|...
15117,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_tense,0.000000,TP-structured,ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|pi|ɡoʊ|la|tu|da|ɹoʊ|...
15118,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_long,0.000000,TP-structured,ti|bu|doʊ|pa|bi|ku|da|ɹoʊ|pi|ɡoʊ|la|tu|da|ɹoʊ|...


In [15]:
# There is always randomness in the generation of the lexicons etc., so results differ between runs.
# If you want the exact data from the publication, uncomment the line below.
# df = pd.read_csv("data_submission/full_dataset.csv")

# One row per unique (Control, Lexicon) pair, saved next to the full dataset.
df_lexicons = df[["Control", "Lexicon"]].drop_duplicates().reset_index(drop=True)
df_lexicons.to_csv("results/all_lexicons.csv")
df_lexicons

Unnamed: 0,Control,Lexicon
0,Controlled lexicons (ARC),beːhoːzɛː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː
1,Controlled lexicons (ARC),foːkaːnuː|hiːbyːʃoː|ɡaːluːfyː|zuːpoːhøː
2,Controlled lexicons (ARC),fyːhoːtiː|ʃoːmeːɡaː|huːzyːpoː|kaːfuːreː
3,Controlled lexicons (ARC),hiːboːzyː|huːdoːfaː|ʃaːmoːɡɛː|poːʃuːhøː
4,Controlled lexicons (ARC),hiːboːzyː|kuːnyːfoː|vaːtyːhøː|ʃoːkaːmɛː
...,...,...
58,Reference lexicons (Literature),pukemi|rafinu|binapo|medoxi
59,Reference lexicons (Literature),tifaxu|soduxi|melubo|ɡanipe
60,Reference lexicons (Literature),tupiɹoʊ|ɡoʊlabu|padoʊti|bidaku
61,Reference lexicons (Literature),ɡonabe|mudila|ronixe|pikusa


In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from pingouin import ttest

# There is always randomness in the generation of the lexicons etc., so if you want the exact data from the publication uncomment below 
# df = pd.read_csv("data_submission/full_dataset.csv")

tp_modes_pretty = ["TP-random position-random", "TP-random position-fixed", "TP-structured"]

# (row-label prefix, first group, second group). Each test is one-sided with
# alternative="less", i.e. the first group is tested for a lower PRI than the second.
comparisons = [
    ("controlled vs. reference", "Controlled lexicons (ARC)", "Reference lexicons (Literature)"),
    ("controlled vs. random baseline", "Controlled lexicons (ARC)", "Random lexicons (Baseline)"),
    ("reference vs. random baseline", "Reference lexicons (Literature)", "Random lexicons (Baseline)"),
]


def _max_pri_ttest(results, tp_mode, first_control, second_control, label):
    """Run one one-sided t-test on the per-stream maximum PRI.

    Args:
        results: long-format results frame with the columns "Control",
            "Feature", "PRI" and "Stream TP mode".
        tp_mode: value of "Stream TP mode" to restrict the test to.
        first_control: "Control" value of the first sample.
        second_control: "Control" value of the second sample.
        label: prefix of the row label; the TP mode is appended to it.

    Returns:
        The single-row frame returned by ``ttest``, with its index set to
        ``f"{label} {tp_mode}"``.
    """
    max_rows = results[(results["Stream TP mode"] == tp_mode) & (results["Feature"] == "max")]
    first_pri = max_rows[max_rows["Control"] == first_control]["PRI"]
    second_pri = max_rows[max_rows["Control"] == second_control]["PRI"]
    outcome = ttest(list(first_pri), list(second_pri), alternative="less")
    outcome.index = [f"{label} {tp_mode}"]
    return outcome


# Comparison-major, TP-mode-minor order: all TP modes of the first comparison,
# then all of the second, and so on.
dfs = [
    _max_pri_ttest(df, tp_mode, first_control, second_control, label)
    for label, first_control, second_control in comparisons
    for tp_mode in tp_modes_pretty
]

ttest_df = pd.concat(dfs)

display(ttest_df)

ttest_df.to_csv("results/ttest_results.csv")





Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
controlled vs. reference TP-random position-random,5.509334,208,less,0.9999999,"[-inf, 0.01]",0.76036,4.468e-06,4.789502e-13
controlled vs. reference TP-random position-fixed,-11.446218,208,less,3.804335e-24,"[-inf, -0.1]",1.579728,7.833e+20,1.0
controlled vs. reference TP-structured,-7.541331,208,less,7.096699e-13,"[-inf, -0.08]",1.040803,9374000000.0,1.0
controlled vs. random baseline TP-random position-random,0.666013,208,less,0.7469295,"[-inf, 0.0]",0.091918,0.37,0.01047996
controlled vs. random baseline TP-random position-fixed,-12.08262,208,less,4.11483e-26,"[-inf, -0.3]",1.66756,6.54e+22,1.0
controlled vs. random baseline TP-structured,-16.958679,208,less,2.2042999999999998e-41,"[-inf, -0.37]",2.34052,6.673999999999999e+37,1.0
reference vs. random baseline TP-random position-random,-5.461872,208,less,6.68693e-08,"[-inf, -0.01]",0.75381,179700.0,0.9999274
reference vs. random baseline TP-random position-fixed,-7.665437,208,less,3.350085e-13,"[-inf, -0.18]",1.057931,19250000000.0,1.0
reference vs. random baseline TP-structured,-11.304733,208,less,1.0337850000000001e-23,"[-inf, -0.26]",1.560201,2.951e+20,1.0


## Example ARC Lexicon From Appendix

In [17]:
# Build the appendix example lexicon (four words of three syllables each)
# with the same helper used for the reference lexicons.
example_arc_lexicon = to_lexicon([["heː", "doː", "faː"], ["riː", "foː", "ɡyː"], ["ʃuː", "hiː", "boː"], ["vaː", "kuː", "niː"]])
print("Example Lexicon:", example_arc_lexicon)

# Generate streams for this single lexicon with the same settings as above.
streams = make_streams(
        [example_arc_lexicon], 
        max_rhythmicity=None, 
        num_repetitions=N_REPS
        )

# Print every generated stream with its info.
for stream in streams:
        print_stream_info(stream)

Example Lexicon: heːdoːfaː|riːfoːɡyː|ʃuːhiːboː|vaːkuːniː
Stream: ʃuː|faː|kuː|niː|doː|ɡyː|riː|vaː|foː|boː|heː|hiː|ɡyː|boː|vaː|doː|foː|heː|kuː|riː|niː|faː|hiː|ʃuː|hiː|niː|boː|foː|ɡyː|faː|heː|ʃuː|riː|kuː|doː|vaː|niː|kuː|heː|vaː|ɡyː|doː|faː|foː|ʃuː|boː|hiː|riː|ʃuː|niː|ɡyː|vaː|heː|doː|riː|faː|boː|kuː|foː|hiː|heː|riː|foː|doː|kuː|faː|ɡyː|ʃuː|vaː|hiː|boː|niː|foː|vaː|kuː|boː|ɡyː|heː|niː|ʃuː|doː|hiː|faː|riː|heː|boː|faː|ʃuː|kuː|ɡyː|foː|niː|hiː|vaː|riː|doː|heː|faː|niː|riː|hiː|foː|kuː|vaː|boː|doː|ʃuː|ɡyː|niː|heː|foː|faː|doː|boː|ʃuː|kuː|hiː|ɡyː|riː|vaː|faː|vaː|ʃuː|foː|riː|ɡyː|hiː|doː|niː|boː|heː|kuː|ʃuː|heː|ɡyː|kuː|boː|riː|vaː|foː|faː|doː|hiː|niː|vaː|niː|ʃuː|riː|boː|kuː|ɡyː|foː|heː|doː|faː|hiː|kuː|doː|foː|boː|ɡyː|faː|niː|hiː|ʃuː|heː|vaː|riː|foː|ʃuː|vaː|faː|ɡyː|kuː|riː|niː|heː|hiː|doː|boː|riː|kuː|ʃuː|ɡyː|boː|doː|niː|foː|hiː|vaː|heː|faː|heː|riː|doː|ʃuː|hiː|kuː|faː|vaː|boː|foː|niː|ɡyː|doː|kuː|vaː|ɡyː|ʃuː|foː|riː|faː|boː|hiː|heː|niː|doː|ɡyː|heː|boː|vaː|ʃuː|faː|kuː|niː|riː|hiː|foː|kuː|foː|vaː|hiː|faː|riː