# Compare Lexicons

We compare syllable streams generated from three kinds of lexicons:

1. controlled lexicons (ours),
2. random (uncontrolled) lexicons, which serve as a baseline, and
3. reference lexicons from the literature.

The comparison is based on the repetitiveness of the phoneme features in the streams.

In [3]:
from arc import load_phonemes, make_syllables, make_words
from arc import make_lexicons, load_words

import numpy as np 
import random

# Load the words from the JSON file shipped with the submission data.
words = load_words("data_submission/words.json")

In [4]:
from copy import copy
import pandas as pd
from arc.core.base_types import Register
import numpy as np

from arc.core.syllable import LABELS_C, LABELS_V, syllable_from_phonemes
from arc.core.word import Word, word_overlap_matrix

# Module-level state read by the to_syllable / to_lexicon helpers defined below.
phonemes = load_phonemes()

# Feature labels handed to syllable_from_phonemes, in the order [LABELS_C, LABELS_V]
# (presumably consonant labels followed by vowel labels, matching the "cv" type below).
syll_feature_labels = [LABELS_C, LABELS_V]
syllable_type = "cv"

def to_phoneme(phoneme):
    """Return ``phoneme`` unchanged (identity conversion)."""
    return phoneme

def to_syllable(syllable):
    """Build a syllable object from its string form.

    A three-character string that does not end in the length mark "ː" is built
    from its first two characters only; the full original string is then stored
    as the ``id`` of the resulting object. Any other string is handed to
    ``syllable_from_phonemes`` as is.
    """
    needs_truncation = len(syllable) == 3 and not syllable.endswith("ː")
    if not needs_truncation:
        return syllable_from_phonemes(phonemes, syllable, syll_feature_labels)

    # NOTE(review): presumably this covers syllables whose vowel is written with
    # two characters (e.g. "səʊ"), where only the first vowel character is used
    # for the features -- confirm against the phoneme inventory.
    truncated = syllable_from_phonemes(phonemes, syllable[:2], syll_feature_labels)
    truncated.id = syllable
    return truncated

def to_word(word):
    """Build a ``Word`` from an iterable of syllable strings.

    Each syllable string is converted with ``to_syllable``. The word id is the
    concatenation of the syllable ids, and the word's "binary_features" entry
    is the per-syllable "binary_features" regrouped with ``zip`` so that the
    i-th inner list collects the i-th entry of every syllable.
    """
    syllable_objs = [to_syllable(syllable) for syllable in word]
    joined_id = "".join(syll.id for syll in syllable_objs)
    per_syllable_features = [syll.info["binary_features"] for syll in syllable_objs]
    regrouped_features = [list(group) for group in zip(*per_syllable_features)]
    return Word(
        id=joined_id,
        info={"binary_features": regrouped_features},
        syllables=syllable_objs,
    )

def to_lexicon(lexicon):
    """Build a lexicon ``Register`` from nested lists of syllable strings.

    Every entry of ``lexicon`` is converted with ``to_word`` and stored in a
    ``Register`` keyed by word id. The register's ``info`` receives the syllable
    feature labels and syllable type, plus two statistics taken from the strict
    upper triangle of ``word_overlap_matrix``: its sum
    ("cumulative_feature_repetitiveness") and its maximum
    ("max_pairwise_feature_repetitiveness").
    """
    register = Register({word_obj.id: word_obj for word_obj in map(to_word, lexicon)})
    register.info.update(
        {
            "syllable_feature_labels": [LABELS_C, LABELS_V],
            "syllable_type": syllable_type,
        }
    )

    # k=1 drops the diagonal, so each pair of distinct words is counted once.
    pairwise_overlap = np.triu(word_overlap_matrix(register), 1)
    register.info["cumulative_feature_repetitiveness"] = pairwise_overlap.sum()
    register.info["max_pairwise_feature_repetitiveness"] = pairwise_overlap.max()
    return register

In [5]:
N_LEXICONS = 21  # number of lexicons requested from make_lexicons (n_lexicons) per condition
N_WORDS_PER_LEXICON = 4  # number of words per generated lexicon (n_words)
N_REPS = 5  # passed to make_streams as num_repetitions: how often the lexicon is randomized to build a stream
            # NOTE(review): the original note read "N_REPS*N_WORDS_PER_LEXICON = n words in the stream", but the
            # printed streams are considerably longer -- confirm the exact semantics in make_streams
N_STREAMS_PER_INPUT = 5  # how many times make_streams is called for each set of lexicons

In [6]:
def print_stream_info(stream):
    """Print a short summary of ``stream`` followed by an empty line.

    The summary lists the syllable ids joined by "|", then the
    "stream_tp_mode", "lexicon" and "rhythmicity_indexes" entries of
    ``stream.info``, one per line.
    """
    syllable_ids = (syll.id for syll in stream)
    print("Stream:", "|".join(syllable_ids))

    labelled_keys = (
        ("TP mode:", "stream_tp_mode"),
        ("Lexicon:", "lexicon"),
        ("Feature PRIs:", "rhythmicity_indexes"),
    )
    for label, key in labelled_keys:
        print(label, stream.info[key])

    print()

### ARC Lexicons

In [24]:
from arc import make_streams
from arc import make_lexicons, Register, load_phonemes, load_words

# Reload the words and build the feature-controlled lexicons.
print("Load words...")
words = load_words("data_submission/words.json")

print("Make lexicons...")
controlled_lexicons = make_lexicons(
    words,
    n_lexicons=N_LEXICONS,
    n_words=N_WORDS_PER_LEXICON,
    control_features=True,
)

# List every generated lexicon with its position.
for i, lex in enumerate(controlled_lexicons):
    print(i, lex)

# Show the first lexicon and its metadata as an example.
print()
print("Example Lexicon:", controlled_lexicons[0])
print("Info:", controlled_lexicons[0].info)
print()


Load words...
Make lexicons...




0 huːfiːtoː|poːzuːhiː|ʃaːmeːɡiː|luːɡaːfyː
1 poːzuːhiː|faːhøːdeː|luːɡaːfyː|ʃiːpeːhuː
2 tuːfyːhiː|huːpoːʃiː|ɡiːroːvaː|beːzuːhoː
3 nuːɡiːfyː|poːzuːhiː|ɡaːlyːfuː|heːboːzɛː
4 fuːtyːhøː|loːɡiːfaː|ʃiːpeːhoː|koːmeːzuː
5 ʃiːpeːhuː|koːmuːzyː|heːdoːfiː|vaːtyːhøː
6 huːpoːʃiː|foːɡaːnyː|ʃaːbyːhøː|ɡiːnɛːfaː
7 huːpoːʃiː|doːheːfyː|peːhoːzɛː|ɡiːzuːmoː
8 ʃiːpeːhuː|faːhøːdeː|hiːvaːtoː|ɡiːzuːmoː
9 biːzuːheː|tuːfyːhiː|koːmuːzyː|hoːʃaːpeː
10 huːtyːfaː|poːzuːhiː|koːmuːzyː|ʃuːɡaːmeː
11 tuːfyːhiː|huːpoːʃiː|doːheːvaː|fiːloːɡyː
12 tuːfyːhiː|beːhuːʃiː|ɡaːryːfuː|heːpiːʃaː
13 ɡaːfuːnyː|poːzuːhiː|ryːɡiːvaː|biːhoːʃuː
14 tuːfyːhiː|fiːroːkuː|ryːɡiːvaː|heːboːzɛː
15 faːluːkoː|tuːfyːhiː|toːheːvaː|huːpoːʃiː
16 tuːfyːhiː|huːpoːʃiː|fiːryːkoː|hoːʃaːpeː
17 peːhoːzɛː|poːzuːhiː|nyːfaːkoː|huːtiːfyː
18 nøːvaːɡiː|faːhøːdeː|ʃuːhoːpaː|poːzuːhiː
19 fuːriːɡaː|tuːfyːhiː|fiːkoːryː|hoːʃaːpeː
20 ʃiːpeːhuː|heːboːzɛː|doːhiːvaː|fuːnyːkoː

Example Lexicon: huːfiːtoː|poːzuːhiː|ʃaːmeːɡiː|luːɡaːfyː
Info: {'phoneme_feature_labels': ['syl', 'son', '

In [8]:
# Generate streams from the controlled lexicons, N_STREAMS_PER_INPUT times over.
controlled_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    generated_batch = make_streams(
        controlled_lexicons,
        max_rhythmicity=None,
        num_repetitions=N_REPS,
    )
    for stream in generated_batch:
        controlled_streams.append(stream)

print_stream_info(controlled_streams[0])

len(controlled_streams)

Stream: hoː|meː|zyː|koː|riː|tɛː|høː|kaː|faː|peː|fyː|ʃoː|riː|faː|høː|zyː|tɛː|meː|kaː|ʃoː|hoː|fyː|peː|koː|ʃoː|høː|riː|meː|hoː|kaː|tɛː|zyː|fyː|faː|koː|peː|ʃoː|meː|tɛː|peː|hoː|koː|kaː|høː|faː|zyː|riː|fyː|meː|ʃoː|kaː|fyː|tɛː|riː|hoː|peː|zyː|høː|koː|faː|tɛː|kaː|hoː|faː|fyː|høː|peː|meː|riː|koː|zyː|ʃoː|zyː|faː|meː|koː|tɛː|hoː|riː|høː|ʃoː|fyː|kaː|peː|kaː|zyː|peː|riː|ʃoː|tɛː|faː|hoː|høː|meː|fyː|koː|høː|fyː|zyː|hoː|ʃoː|peː|tɛː|koː|meː|faː|riː|kaː|koː|hoː|zyː|kaː|meː|høː|tɛː|fyː|riː|peː|faː|ʃoː|koː|fyː|hoː|tɛː|ʃoː|faː|kaː|riː|zyː|meː|peː|høː|hoː|faː|fyː|ʃoː|koː|kaː|zyː|peː|tɛː|riː|høː|meː|ʃoː|zyː|fyː|høː|tɛː|koː|faː|kaː|peː|meː|hoː|riː|fyː|peː|koː|hoː|høː|ʃoː|riː|faː|meː|zyː|kaː|tɛː|fyː|riː|kaː|ʃoː|tɛː|hoː|peː|zyː|faː|høː|koː|meː|tɛː|meː|fyː|zyː|høː|kaː|koː|riː|hoː|ʃoː|peː|faː|hoː|zyː|ʃoː|kaː|faː|peː|fyː|koː|tɛː|høː|riː|meː|riː|ʃoː|fyː|faː|tɛː|zyː|meː|kaː|høː|peː|hoː|koː|fyː|meː|peː|kaː|hoː|tɛː|ʃoː|faː|riː|zyː|koː|høː|faː|zyː|riː|peː|høː|fyː|tɛː|kaː|meː|koː|ʃoː|hoː|meː|høː|zyː|hoː|fyː|kaː|riː|tɛː|

315

### Random / uncontrolled lexicons (baseline)

In [25]:
# Baseline: same generator, but without controlling the features.
random_lexicons = make_lexicons(
    words,
    n_lexicons=N_LEXICONS,
    n_words=N_WORDS_PER_LEXICON,
    control_features=False,
)

# List every generated lexicon with its position.
for i, lex in enumerate(random_lexicons):
    print(i, lex)

# Show the first lexicon and its metadata as an example.
print()
print("Example Lexicon:", random_lexicons[0])
print("Info:", random_lexicons[0].info)
print()

0 ʃaːpeːhøː|beːhoːʃøː|foːtyːhiː|vaːdeːhiː
1 ʃaːmoːɡyː|koːvaːreː|tyːhoːfiː|fiːdoːhuː
2 fiːnuːkoː|huːfoːtiː|fuːɡaːniː|koːreːvaː
3 biːhoːʃuː|ʃoːpeːhiː|ʃaːpeːhiː|ʃuːbeːhoː
4 nuːɡiːfaː|vaːdeːhiː|fyːruːkɛː|ɡiːfyːraː
5 hoːtuːfaː|tiːhoːfaː|faːroːɡɛː|kuːfoːnøː
6 kuːfaːreː|tɛːheːfaː|huːdoːfiː|hiːdeːfyː
7 koːvaːlyː|ɡiːmoːzyː|kaːruːfoː|fiːluːkaː
8 fyːriːkoː|fyːruːkɛː|ʃaːpeːhuː|høːtiːfoː
9 fuːɡaːroː|ɡaːreːfiː|huːzyːboː|fiːdoːhøː
10 hoːzyːpiː|huːzyːboː|huːfiːtɛː|laːkuːfyː
11 huːfiːtoː|nyːfoːɡɛː|ʃoːbøːhuː|ʃaːpeːhøː
12 lyːvaːkuː|koːreːfyː|raːkoːfiː|ʃaːpeːhuː
13 ɡiːfyːraː|ryːɡiːfaː|hoːʃaːbuː|koːnyːfaː
14 niːkoːfuː|kuːfoːnɛː|fiːlaːkoː|nɛːfaːkuː
15 fyːraːkɛː|ryːvaːkøː|fyːruːkɛː|riːkoːfuː
16 laːkuːfyː|hiːdeːfoː|deːhiːvaː|kaːzuːmyː
17 tiːheːfyː|fuːɡiːroː|ʃuːkoːmyː|reːkoːfiː
18 ɡiːfyːloː|zyːpeːhiː|fiːkoːnyː|koːvaːlyː
19 huːfiːtoː|nuːɡiːfaː|fiːkoːnyː|poːhuːʃiː
20 koːzyːmuː|høːdeːfoː|huːzyːboː|fiːkoːnyː

Example Lexicon: ʃaːpeːhøː|beːhoːʃøː|foːtyːhiː|vaːdeːhiː
Info: {'phoneme_feature_labels': ['syl', 'son', '

In [10]:
# Generate streams from the random lexicons, N_STREAMS_PER_INPUT times over.
random_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    generated_batch = make_streams(
        random_lexicons,
        max_rhythmicity=None,
        num_repetitions=N_REPS,
    )
    for stream in generated_batch:
        random_streams.append(stream)

print_stream_info(random_streams[0])

len(random_streams)

Stream: fiː|heː|fyː|kaː|tɛː|biː|loː|fuː|huː|kaː|nuː|ʃuː|nuː|huː|fiː|fyː|biː|tɛː|ʃuː|kaː|heː|fuː|kaː|loː|heː|ʃuː|kaː|biː|fiː|kaː|huː|loː|nuː|tɛː|fyː|fuː|heː|kaː|fiː|tɛː|fuː|biː|nuː|loː|kaː|huː|fyː|ʃuː|fuː|ʃuː|huː|kaː|biː|fyː|fiː|nuː|heː|loː|tɛː|kaː|ʃuː|heː|nuː|biː|kaː|fuː|fyː|huː|tɛː|fiː|loː|kaː|kaː|fiː|huː|fuː|nuː|kaː|loː|biː|ʃuː|fyː|heː|tɛː|nuː|fyː|loː|ʃuː|tɛː|heː|kaː|kaː|fuː|fiː|biː|huː|heː|fiː|ʃuː|biː|fuː|loː|huː|nuː|kaː|tɛː|kaː|fyː|nuː|fuː|tɛː|loː|fyː|kaː|heː|huː|biː|kaː|ʃuː|fiː|fuː|kaː|nuː|fiː|kaː|fyː|tɛː|huː|ʃuː|loː|heː|biː|heː|tɛː|loː|fiː|kaː|nuː|biː|ʃuː|kaː|huː|fuː|fyː|kaː|loː|fuː|ʃuː|fiː|heː|fyː|nuː|tɛː|kaː|biː|huː|fyː|tɛː|fiː|huː|heː|biː|nuː|ʃuː|fuː|kaː|loː|kaː|tɛː|huː|nuː|fuː|fiː|loː|biː|fyː|heː|ʃuː|kaː|kaː|ʃuː|biː|fiː|kaː|fyː|loː|huː|kaː|heː|fuː|tɛː|nuː|fyː|ʃuː|huː|kaː|fuː|biː|loː|nuː|kaː|tɛː|heː|fiː|nuː|huː|tɛː|biː|kaː|heː|kaː|fiː|ʃuː|loː|fyː|fuː|heː|loː|tɛː|ʃuː|nuː|kaː|fiː|fuː|huː|biː|kaː|fyː|huː|ʃuː|fyː|kaː|fuː|loː|fiː|biː|tɛː|kaː|nuː|heː|nuː|fiː|fyː|biː|heː|kaː|kaː|huː|

315

### Reference lexicons from the literature

In [21]:
# Hard-coded reference lexicons as nested lists: lexicon -> word -> syllable string.
# Each of the 21 lexicons holds four words, and each word holds three syllables.
# NOTE(review): the source publication of each lexicon is not recorded here.
lexicons = [
 [['pi', 'ɾu', 'ta'],
  ['ba', 'ɡo', 'li'],
  ['to', 'ku', 'da'],
  ['ɡu', 'ki', 'bo']],
 [['pa', 'be', 'la'],
  ['di', 'ne', 'ka'],
  ['lu', 'fa', 'ri'],
  ['xi', 'so', 'du']],
 [['ma', 'xu', 'pe'],
  ['xe', 'ro', 'ɡa'],
  ['de', 'mu', 'si'],
  ['fo', 'le', 'ti']],
 [['pu', 'ke', 'mi'],
  ['ra', 'fi', 'nu'],
  ['bi', 'na', 'po'],
  ['me', 'do', 'xi']],
 [['no', 'ni', 'xe'],
  ['bu', 'lo', 'te'],
  ['re', 'mo', 'fu'],
  ['ko', 'tu', 'sa']],
 [['mi', 'lo', 'de'],
  ['da', 'le', 'bu'],
  ['no', 'ru', 'pa'],
  ['ka', 'te', 'xi']],
 [['ne', 'do', 'li'],
  ['ri', 'fo', 'nu'],
  ['ba', 'to', 'ɡu'],
  ['ki', 'ra', 'pu']],
 [['ɡo', 'na', 'be'],
  ['mu', 'di', 'la'],
  ['ro', 'ni', 'xe'],
  ['pi', 'ku', 'sa']],
 [['fu', 'bi', 're'],
  ['xe', 'tu', 'si'],
  ['ta', 'fi', 'ko'],
  ['ke', 'ma', 'po']],
 [['ti', 'fa', 'xu'],
  ['so', 'du', 'xi'],
  ['me', 'lu', 'bo'],
  ['ɡa', 'ni', 'pe']],
 [['mi', 'po', 'la'],
  ['za', 'bɛ', 'tu'],
  ['ʁo', 'ki', 'sɛ'],
  ['nu', 'ɡa', 'di']],
 [['dɛ', 'mʊ', 'ri'],
  ['sɛ', 'ni', 'ɡɛ'],
  ['ræ', 'ku', 'səʊ'],
  ['pi', 'lɛ', 'ru']],
 [['ki', 'fəʊ', 'bu'],
  ['lu', 'fɑ', 'ɡi'],
  ['pæ', 'beɪ', 'lɑ'],
  ['tɑ', 'ɡəʊ', 'fʊ']],
 [['bi', 'du', 'pɛ'],
  ['məʊ', 'bɑ', 'li'],
  ['rɛ', 'ɡæ', 'tʊ'],
  ['sæ', 'tɛ', 'kəʊ']],
 [['bəʊ', 'dɑ', 'mɛ'],
  ['fi', 'nəʊ', 'pɑ'],
  ['ɡʊ', 'rɑ', 'təʊ'],
  ['ləʊ', 'kæ', 'neɪ']],
 [['fɛ', 'si', 'nɑ'],
  ['kɛ', 'su', 'dəʊ'],
  ['mæ', 'pʊ', 'di'],
  ['ti', 'mi', 'nu']],
 [['tu', 'pi', 'ɹoʊ'],
  ['ɡoʊ', 'la', 'bu'],
  ['pa', 'doʊ', 'ti'],
  ['bi', 'da', 'ku']],
 [['meɪ', 'lu', 'ɡi'],
  ['ɹa', 'fi', 'nu'],
  ['pu', 'keɪ', 'mi'],
  ['toʊ', 'na', 'poʊ']],
 [['ɡoʊ', 'la', 'tu'],
  ['da', 'ɹoʊ', 'pi'],
  ['ti', 'bu', 'doʊ'],
  ['pa', 'bi', 'ku']],
 [['poʊ', 'fi', 'mu'],
  ['noʊ', 'vu', 'ka'],
  ['vi', 'koʊ', 'ɡa'],
  ['ba', 'fu', 'ɡi']],
 [['ma', 'nu', 'toʊ'],
  ['ni', 'moʊ', 'lu'],
  ['voʊ', 'ɹi', 'fa'],
  ['li', 'du', 'ɹa']]]

# Convert every hard-coded reference lexicon into a lexicon register.
ref_lexicons = [to_lexicon(raw_lexicon) for raw_lexicon in lexicons]

# List every converted lexicon with its position.
for i, lex in enumerate(ref_lexicons):
    print(i, lex)

# Show the first lexicon and its metadata as an example.
print()
print("Example Lexicon:", ref_lexicons[0])
print("Info:", ref_lexicons[0].info)
print()

0 piɾuta|baɡoli|tokuda|ɡukibo
1 pabela|dineka|lufari|xisodu
2 maxupe|xeroɡa|demusi|foleti
3 pukemi|rafinu|binapo|medoxi
4 nonixe|bulote|remofu|kotusa
5 milode|dalebu|norupa|katexi
6 nedoli|rifonu|batoɡu|kirapu
7 ɡonabe|mudila|ronixe|pikusa
8 fubire|xetusi|tafiko|kemapo
9 tifaxu|soduxi|melubo|ɡanipe
10 mipola|zabɛtu|ʁokisɛ|nuɡadi
11 dɛmʊri|sɛniɡɛ|rækusəʊ|pilɛru
12 kifəʊbu|lufɑɡi|pæbeɪlɑ|tɑɡəʊfʊ
13 bidupɛ|məʊbɑli|rɛɡætʊ|sætɛkəʊ
14 bəʊdɑmɛ|finəʊpɑ|ɡʊrɑtəʊ|ləʊkæneɪ
15 fɛsinɑ|kɛsudəʊ|mæpʊdi|timinu
16 tupiɹoʊ|ɡoʊlabu|padoʊti|bidaku
17 meɪluɡi|ɹafinu|pukeɪmi|toʊnapoʊ
18 ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku
19 poʊfimu|noʊvuka|vikoʊɡa|bafuɡi
20 manutoʊ|nimoʊlu|voʊɹifa|liduɹa

Example Lexicon: piɾuta|baɡoli|tokuda|ɡukibo
Info: {'syllable_feature_labels': [['son', 'back', 'hi', 'lab', 'cor', 'cont', 'lat', 'nas', 'voi'], ['back', 'hi', 'lo', 'lab', 'tense', 'long']], 'syllable_type': 'cv', 'cumulative_feature_repetitiveness': 5, 'max_pairwise_feature_repetitiveness': 2}



In [12]:
# Generate streams from the reference lexicons, N_STREAMS_PER_INPUT times over.
ref_streams = Register()
for _ in range(N_STREAMS_PER_INPUT):
    generated_batch = make_streams(
        ref_lexicons,
        max_rhythmicity=None,
        num_repetitions=N_REPS,
    )
    for stream in generated_batch:
        ref_streams.append(stream)

print_stream_info(ref_streams[0])

len(ref_streams)

Stream: ɾu|ku|ba|ɡo|ta|ɡu|ki|pi|to|bo|li|da|ta|ɡo|bo|ku|da|ki|li|to|ɾu|ba|ɡu|pi|da|ɾu|ta|to|ba|li|ɡu|bo|ɡo|pi|ki|ku|to|ku|ki|ta|li|pi|ɾu|ɡo|ɡu|ba|bo|da|bo|pi|ɡo|ku|ta|ɾu|da|ba|to|ki|ɡu|li|ɡo|to|ta|pi|bo|ba|da|ɡu|ku|li|ki|ɾu|ɡu|da|ɡo|li|ɾu|bo|ta|ba|ki|to|pi|ku|pi|li|ba|ɾu|to|da|ku|ɡo|ki|bo|ɡu|ta|ki|da|li|bo|ɾu|pi|ɡu|to|ɡo|ba|ta|ku|ɡu|ɡo|ɾu|li|ku|bo|ki|ba|pi|ta|da|to|li|ta|bo|to|ɡu|ɾu|ki|ɡo|da|pi|ba|ku|ɾu|ki|ɡu|ɡo|ku|ta|ba|li|to|da|pi|bo|ɡo|bo|li|ku|ɾu|ɡu|ta|ki|ba|to|pi|da|ku|bo|pi|ɡo|da|ta|ɾu|li|ba|ɡu|ki|to|ba|ku|ɡo|pi|ɾu|ta|bo|to|ɡu|da|ki|li|bo|ta|li|ɡo|ba|ki|da|ɾu|to|ku|pi|ɡu|pi|ta|da|ba|ɡo|ɾu|ku|li|ki|bo|ɡu|to|ki|ku|ba|ta|to|bo|ɾu|da|ɡo|ɡu|li|pi|li|ta|ku|to|ɡo|ki|pi|ba|bo|da|ɡu|ɾu|ba|da|li|ɡu|bo|ku|ki|ɾu|ɡo|ta|pi|to|li|da|bo|ba|ɾu|pi|ki|ɡo|to|ta|ɡu|ku|ɡu|ba|pi|ku|da|to|ɾu|bo|ki|ta|ɡo|li|ɾu|ku|ɡu|to|ki|ba|ɡo|pi|da|bo|ta|li|da|to|ta|pi|ki|ɾu|li|bo|ba|ɡu|ku|ɡo|to|li|ta|ɡo|bo|ɡu|ki|ku|da|ba|ɾu|pi|to|pi|ba|bo|li|ku|ɾu|ki|ɡo|da|ta|ɡu|pi|ɡu|ta|ku|to|da|ɡo|ba|ki|li|ɾu|bo|pi|ku|ta|to|ɡu|ɾu|ba

315

### Collect Results

We collect all stream generation results and their feature repetitiveness scores in a dataframe.

In [13]:
import pandas as pd

# Long-format table: one row per (stream, feature), plus one extra "max" row per
# stream holding the largest PRI over all of that stream's features.
data = {"Control": [], "Lexicon": [], "Feature": [], "PRI": [], "Stream TP mode": [], "Stream": []}

# Maps the TP-mode identifier stored in stream.info to the label used in tables.
mode_to_mode = {  # TP-random position-random; TP-random position-fixed and TP-structured
    "random": "TP-random position-random",
    "word_structured": "TP-structured",
    "position_controlled": "TP-random position-fixed"
}

stream_groups = [
    ("Controlled lexicons (ARC)", controlled_streams),
    ("Reference lexicons (Literature)", ref_streams),
    ("Random lexicons (Baseline)", random_streams),
]

for control, streams in stream_groups:
    for stream in streams:
        pri_by_feature = stream.info["rhythmicity_indexes"]

        # These values are the same for every row of a stream, so they are
        # computed once per stream. Resolving them before any append also keeps
        # all columns the same length if a lookup fails.
        lexicon_label = str(stream.info["lexicon"])
        tp_mode_label = mode_to_mode[stream.info["stream_tp_mode"]]
        stream_label = "|".join(syll.id for syll in stream)

        # Per-feature rows first, followed by the summary "max" row.
        feature_rows = list(pri_by_feature.items())
        feature_rows.append(("max", max(pri_by_feature.values())))

        for feature, pri in feature_rows:
            data["Feature"].append(feature)
            data["PRI"].append(pri)
            data["Control"].append(control)
            data["Lexicon"].append(lexicon_label)
            data["Stream TP mode"].append(tp_mode_label)
            data["Stream"].append(stream_label)

df = pd.DataFrame(data).sort_values(["Control", "Lexicon", "Stream TP mode"]).reset_index(drop=True)

# Persist the full table; the output directory is created on first use.
import os
os.makedirs("results/", exist_ok=True)
df.to_csv("results/full_dataset.csv")

df

Unnamed: 0,Control,Lexicon,Feature,PRI,Stream TP mode,Stream
0,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|faːtɛːheː|reːfoːɡyː,phon_1_son,0.058824,TP-random position-fixed,faː|foː|ɡyː|reː|koː|heː|byː|tɛː|mɛː|ʃuː|hoː|zu...
1,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|faːtɛːheː|reːfoːɡyː,phon_1_back,0.011204,TP-random position-fixed,faː|foː|ɡyː|reː|koː|heː|byː|tɛː|mɛː|ʃuː|hoː|zu...
2,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|faːtɛːheː|reːfoːɡyː,phon_1_hi,0.011204,TP-random position-fixed,faː|foː|ɡyː|reː|koː|heː|byː|tɛː|mɛː|ʃuː|hoː|zu...
3,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|faːtɛːheː|reːfoːɡyː,phon_1_lab,0.058824,TP-random position-fixed,faː|foː|ɡyː|reː|koː|heː|byː|tɛː|mɛː|ʃuː|hoː|zu...
4,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|faːtɛːheː|reːfoːɡyː,phon_1_cor,0.051821,TP-random position-fixed,faː|foː|ɡyː|reː|koː|heː|byː|tɛː|mɛː|ʃuː|hoː|zu...
...,...,...,...,...,...,...
15115,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_lo,0.148459,TP-structured,ɡoʊ|la|tu|ti|bu|doʊ|da|ɹoʊ|pi|pa|bi|ku|ɡoʊ|la|...
15116,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_lab,0.142857,TP-structured,ɡoʊ|la|tu|ti|bu|doʊ|da|ɹoʊ|pi|pa|bi|ku|ɡoʊ|la|...
15117,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_tense,0.000000,TP-structured,ɡoʊ|la|tu|ti|bu|doʊ|da|ɹoʊ|pi|pa|bi|ku|ɡoʊ|la|...
15118,Reference lexicons (Literature),ɡoʊlatu|daɹoʊpi|tibudoʊ|pabiku,phon_2_long,0.000000,TP-structured,ɡoʊ|la|tu|ti|bu|doʊ|da|ɹoʊ|pi|pa|bi|ku|ɡoʊ|la|...


In [14]:
# Lexicon generation is stochastic, so the data differs between runs. To work with
# the exact data from the publication instead, uncomment the line below.
# df = pd.read_csv("data_submission/full_dataset.csv")

# One row per distinct (condition, lexicon) pair.
df_lexicons = (
    df.loc[:, ["Control", "Lexicon"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
df_lexicons.to_csv("results/all_lexicons.csv")
df_lexicons

Unnamed: 0,Control,Lexicon
0,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|faːtɛːheː|reːfoːɡyː
1,Controlled lexicons (ARC),byːhoːzuː|ʃuːkoːmɛː|høːdeːfaː|reːfoːɡyː
2,Controlled lexicons (ARC),faːhoːtɛː|riːkaːfyː|ʃoːpeːhøː|meːzyːkoː
3,Controlled lexicons (ARC),faːhoːtɛː|roːkuːfyː|ʃuːbiːheː|ɡiːʃoːmyː
4,Controlled lexicons (ARC),fyːkoːruː|høːdeːfaː|tyːhoːfuː|ryːfoːɡiː
...,...,...
58,Reference lexicons (Literature),pukemi|rafinu|binapo|medoxi
59,Reference lexicons (Literature),tifaxu|soduxi|melubo|ɡanipe
60,Reference lexicons (Literature),tupiɹoʊ|ɡoʊlabu|padoʊti|bidaku
61,Reference lexicons (Literature),ɡonabe|mudila|ronixe|pikusa


In [15]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
from pingouin import ttest

# There is always randomness in the generation of the lexicons etc., so if you want the exact data from the publication uncomment below 
# df = pd.read_csv("data_submission/full_dataset.csv")

tp_modes_pretty = ["TP-random position-random", "TP-random position-fixed", "TP-structured"]

# (row label prefix, first condition, second condition). Each pair is tested with
# alternative="less", i.e. whether the first condition has the lower max PRI.
comparisons = [
    ("controlled vs. reference", "Controlled lexicons (ARC)", "Reference lexicons (Literature)"),
    ("controlled vs. random baseline", "Controlled lexicons (ARC)", "Random lexicons (Baseline)"),
    ("reference vs. random baseline", "Reference lexicons (Literature)", "Random lexicons (Baseline)"),
]

# Only the per-stream "max" rows enter the tests; filter them once up front.
df_max = df[df["Feature"] == "max"]

dfs = []
for label, first_control, second_control in comparisons:
    for tp_mode in tp_modes_pretty:
        df_mode = df_max[df_max["Stream TP mode"] == tp_mode]
        first_pri = df_mode.loc[df_mode["Control"] == first_control, "PRI"]
        second_pri = df_mode.loc[df_mode["Control"] == second_control, "PRI"]

        result = ttest(list(first_pri), list(second_pri), alternative="less")
        result.index = [f"{label} {tp_mode}"]
        dfs.append(result)

# One row per (comparison, TP mode), in the order the tests were run.
ttest_df = pd.concat(dfs)

display(ttest_df)

ttest_df.to_csv("results/ttest_results.csv")





Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
controlled vs. reference TP-random position-random,4.71353,208,less,0.9999978,"[-inf, 0.01]",0.650529,0.0,1.125394e-10
controlled vs. reference TP-random position-fixed,-10.011785,208,less,8.238281999999999e-20,"[-inf, -0.08]",1.381758,4.673e+16,1.0
controlled vs. reference TP-structured,-7.973772,208,less,5.04853e-14,"[-inf, -0.08]",1.100485,118400000000.0,1.0
controlled vs. random baseline TP-random position-random,0.180549,208,less,0.5715513,"[-inf, 0.0]",0.024918,0.305,0.03401436
controlled vs. random baseline TP-random position-fixed,-9.361886,208,less,6.592708e-18,"[-inf, -0.22]",1.292063,664900000000000.0,1.0
controlled vs. random baseline TP-structured,-13.470629,208,less,1.886071e-30,"[-inf, -0.29]",1.859123,1.167e+27,1.0
reference vs. random baseline TP-random position-random,-4.206095,208,less,1.930524e-05,"[-inf, -0.0]",0.580496,953.452,0.9945752
reference vs. random baseline TP-random position-fixed,-5.430899,208,less,7.786872e-08,"[-inf, -0.11]",0.749535,155900.0,0.9999178
reference vs. random baseline TP-structured,-8.222437,208,less,1.06786e-14,"[-inf, -0.18]",1.134804,527600000000.0,1.0


## Example ARC Lexicon From Appendix

In [16]:
# Build the example lexicon from its syllable strings and print it.
example_arc_lexicon = to_lexicon(
    [
        ["heː", "doː", "faː"],
        ["riː", "foː", "ɡyː"],
        ["ʃuː", "hiː", "boː"],
        ["vaː", "kuː", "niː"],
    ]
)
print("Example Lexicon:", example_arc_lexicon)

# Generate streams for this single lexicon and print a summary of each.
streams = make_streams(
    [example_arc_lexicon],
    max_rhythmicity=None,
    num_repetitions=N_REPS,
)

for stream in streams:
    print_stream_info(stream)

Example Lexicon: heːdoːfaː|riːfoːɡyː|ʃuːhiːboː|vaːkuːniː
Stream: faː|kuː|heː|hiː|ʃuː|ɡyː|foː|boː|niː|riː|vaː|doː|ʃuː|doː|faː|boː|riː|hiː|niː|ɡyː|kuː|vaː|heː|foː|kuː|niː|vaː|ɡyː|faː|riː|doː|hiː|boː|heː|ʃuː|foː|riː|foː|ʃuː|hiː|faː|heː|doː|kuː|ɡyː|boː|vaː|niː|faː|foː|vaː|kuː|doː|boː|ʃuː|niː|hiː|ɡyː|heː|riː|heː|boː|doː|foː|hiː|kuː|faː|ɡyː|ʃuː|vaː|riː|niː|kuː|foː|faː|niː|heː|ɡyː|doː|riː|ʃuː|boː|hiː|vaː|ʃuː|kuː|boː|foː|doː|heː|vaː|faː|hiː|riː|ɡyː|niː|boː|faː|doː|ɡyː|vaː|foː|heː|niː|ʃuː|riː|kuː|hiː|heː|faː|vaː|hiː|doː|niː|foː|ɡyː|riː|boː|kuː|ʃuː|faː|ʃuː|heː|kuː|riː|foː|niː|doː|vaː|boː|ɡyː|hiː|foː|boː|faː|doː|vaː|kuː|ʃuː|niː|ɡyː|hiː|heː|riː|faː|foː|doː|ɡyː|boː|kuː|niː|hiː|riː|vaː|ʃuː|heː|doː|foː|faː|vaː|niː|boː|hiː|ʃuː|ɡyː|heː|kuː|riː|niː|heː|boː|ɡyː|riː|doː|ʃuː|hiː|foː|kuː|vaː|faː|riː|boː|foː|hiː|doː|kuː|ɡyː|vaː|heː|niː|faː|ʃuː|riː|hiː|niː|doː|heː|ɡyː|foː|ʃuː|faː|kuː|boː|vaː|ɡyː|niː|kuː|heː|foː|vaː|doː|riː|faː|hiː|boː|ʃuː|vaː|hiː|kuː|faː|ɡyː|doː|niː|foː|riː|ʃuː|boː|heː|faː|heː|hiː|vaː|riː|ɡyː