In [1]:
import csv # for writing dataframes to csv
import random # for making a random choice
import os # for scanning directories
import itertools
import string # for generating strings

import kintypes as kt # bringing large lists of kin types into the namespace
import math # for calculating logs

# Internal co-selection

Internal co-selection refers to the tendency for kinship systems to have cross-generational consistency in where distinctions or mergers are made. That is, if your parents' elder brothers share a kin term, then so too will their children. If your parents' sisters are distinguished from your parents' brothers, so too will their children be distinguished. We can test the robustness of this tendency using our frankenlanguages, to see whether internal co-selection occurs at a higher rate than chance.

We will measure internal co-selection in terms of the **mutual information** between Generation N and Generation N+1 in a particular kinship system. That is, we need to work out the conditional entropy between every possible pair of parent and child terms, and the entropy over an entire generation. This will tell us how much information is shared across the two generations; or how much we can predict about one generation given the other.

To do this, we need to do the following:

* Get a list of parent-child pairs for each language.
* Work out the conditional probabilities for each pair.
* Work out the probabilities of each individual term in a generation.
* Calculate the entropy of 2 and 3.
* Calculate the mutual information of the system.

Luckily, we can re-use some of the infrastructure we already have. For ease, I will write out again the functions that extract kin terms from a kinbank file.

In [2]:
# to get a list of all the kinbank filenames

def get_kb_files():
    files = []
    path = '../languages/kinbank'
    directory = os.scandir(path)
    for file in directory:
        files.append(file.name)
    return files

In [3]:
# to pick a file at random

def random_language(all_data):
    language = random.choice(all_data)
    # print(language)
    return language

In [4]:
# to extract kin terms from one of those files

def get_kin_terms(filepath):
    kin_system = {}
    with open(filepath, encoding='utf8') as f:
        csv_reader = csv.DictReader(f)
        next(csv_reader) # to skip the header row
        for line in csv_reader:
            kin_type = line['parameter']
            kin_term = line['word']
            kin_system[kin_type] = kin_term
    return kin_system

For testing purposes throughout this notebook, let's pick a random language and extract its kin terms.

In [5]:
all_kb_files = get_kb_files()

random.seed(47)
file = random_language(all_kb_files)
filepath = '../languages/kinbank/'

l = get_kin_terms(filepath + file)

print(file,l)

Njebi_njeb1242.csv {'meB': 'mukulu', 'myB': 'meghèghè', 'meZ': 'kédi', 'myZ': 'kédi', 'mF': "tat'", 'mM': 'ngu', 'mS': 'mwana', 'mPP': 'kagha', 'mFF': 'kagha', 'mFM': 'kagha', 'mMF': 'kagha', 'mMM': 'kagha', 'mSD': 'mutégheda', 'mFB': "tat'", 'mFZ': "to-m'kas", 'mMB': 'kètshi', 'mMZ': 'ngu', 'mZS': 'nzaba', 'mMBS': 'mwana', 'mFeB': "tat'", 'mFyB': "tat'", 'mFeZ': "to-m'kas", 'mFyZ': "to-m'kas", 'mMeZ': 'ngu', 'mMyZ': 'ngu', 'mMeB': 'kètshi', 'mMyB': 'kètshi', 'meZS': 'nzaba', 'myZS': 'nzaba', 'mMeBS': 'mwana', 'mMyBS': 'mwana', 'mMBeS': 'mwana', 'mMByS': 'mwana', 'feB': 'ndumi', 'fyB': 'ndumi', 'feZ': 'mukulu', 'fyZ': 'meghèghè'}


## Getting the pairs

~~The first thing we need to do is write a function that takes a dictionary of kin terms like `l`, and outputs a list of the relevant terms for each generation. We also need a function that pairs up those terms into parent-child pairs. In `kintypes`, we have a list of pairs of codes for parent and child terms, so we just need to cross reference these.~~

To avoid weird gaps, we're going to slightly simplify things! First, we're going to extract the relevant pairs of terms from a dictionary of kin terms - e.g. for English, it extracts the pair `('uncle','cousin')`. Notably, our function checks whether **both** of the relevant terms exist in the dictionary before trying to extract them! Then, we'll split these pairs in half to get the list of terms in each generation. This way, any gaps in the kinbank data - e.g. the term for 'uncle' is there, but the term for 'cousin' is not - cause us less strife, at the expense of a little bit of accuracy.

In [6]:
# def filter_generations(ks):
#     N = []
#     N1 = []
#     for kin_type in ks:
#         if kin_type in kt.generation_n:
#             N.append(ks[kin_type])
#         elif kin_type in kt.generation_n1:
#             N1.append(ks[kin_type])
#         else:
#             pass
    
#     return list(set(N)), list(set(N1))

~~Let's test this function on `l`.~~

In [7]:
# lN, lN1 = filter_generations(l)

# print(lN,lN1)

In [33]:
def get_pairs(ks,pairs):
    pairs_of_terms = []
    placeholder = [] 
    for pair in pairs:
        if pair[0] in ks and pair[1] in ks:
            if pair in placeholder:
                pass
            else:
#             print(pair[0],pair[1])
                pairs_of_terms.append((ks[pair[0]],ks[pair[1]]))
                placeholder.append(pair)
            
    return pairs_of_terms

In [34]:
l_pairs = get_pairs(l,kt.ics_pairs)

print(l_pairs)

[('ngughu balagha', 'mwana'), ('ngughu balagha', 'mwana'), ('tara', 'okulu'), ('tara', 'nkéri'), ('tara', 'okulu'), ('tara', 'nkéri'), ('tata okasi', 'okulu'), ('tata okasi', 'nkéri'), ('tata okasi', 'okulu'), ('tata okasi', 'nkéri')]


Now we have a way to get a list of pairs, let's split them up into `GN` for Ego's generation and `GN1` for ego's parents' generation.

In [10]:
def split_generations(pairs):
    GN = []
    GN1 = []
    for pair in pairs:
        GN.append(pair[1])
        GN1.append(pair[0])
    
    return GN,GN1

In [11]:
l_GN, l_GN1 = split_generations(l_pairs)

print(l_GN,l_GN1)

['mwana', 'mwana'] ['kètshi', 'kètshi']


## Calculating probabilities

Now we have lists of terms for each generation, we can calculate the probability of each term within its generation, by counting up the number of times that term appears divided by the total number of terms in the generation.

In [12]:
def calculate_probs(terms):
    probs = []
    for term in set(terms):
        probs.append(terms.count(term)/len(terms))
#         print('probability of ', term, ' is ', terms.count(term)/len(terms))
    return probs

In [76]:
l_probs = calculate_probs(l_GN1)

print(l_probs)

[1.0]


We can use this function to calculate the conditional probability of each term given the term it is paired with: the probability of the pair (given all the possible pairs) over the probability of the "given" term. Let's do this "bottom-up" for ease - so for English, calculating the probability of *uncle* given *cousin*. Since mutual information is not directional, we can make this arbitrary choice without repercussions later.

~~Reader, there were repercussions. In my first pass, I got it the wrong way around! Ack.~~

In [75]:
def calculate_conditional_probs(pairs,terms):
    cond_probs = []
    probs = calculate_probs(terms)
#     print(terms)
#     print(probs)
    
    for pair in set(pairs): # for each unique pair
        p_pair = pairs.count(pair)/len(pairs) # calculate the probability of the pair
        # term_index = list(set(terms)).index(pair[0]) # get the index of the parent term in the terms list
        term_index = list(set(terms)).index(pair[1]) # for testing!!!
        p_term = probs[term_index] # then use that index to get the probability of that term from probs!
        cond_probs.append(p_pair/p_term) # the probability of the pair (B and A) over the probability of the parent term (B) gives us the conditional probability of child given parent
        # print('pair: ', pair, 'p(', pair[1], '|', pair[0], ') = ', p_pair/p_term)
        print('pair: ', pair, 'p(', pair[0], '|', pair[1], ') = ', p_pair/p_term)
        
    return cond_probs
    

In [15]:
l_cond_probs = calculate_conditional_probs(l_pairs,l_GN1)

print(l_cond_probs)

pair:  ('kètshi', 'mwana') p( mwana | kètshi ) =  1.0
[1.0]


## Calculating entropy and mutual information

Now that we have a way to extract probability distributions from a kinship system, we can feed that in to a function that calculates entropy. The entropy scores can then be fed to a function that calculates mutual information (equations to follow).

In [16]:
def calculate_entropy(probs):
    entropy = 0
    for p in probs:
        if p != 0:
            entropy += p*math.log2(p)
    return -entropy

If we feed in our probability distributions, `l_probs` ~~and `l_cond_probs`~~, the function above will spit out the entropy of `l`'s generation N ~~and the conditional entropy of `l`'s Generation N given `l`'s Generation N+1 respectively.~~

In [17]:
l_entropy = calculate_entropy(l_probs)

# l_cond_entropy = calculate_entropy(l_cond_probs)

print(l_entropy)

-0.0


We need a second function to calculate conditional entropy, which is slightly more complex. Here, we calculate the probability of B and A multiplied by the probability of B and A over B.

In [151]:
def calculate_cond_entropy(pairs,terms):
    p_pairs = []
    for pair in set(pairs):
        p_pairs.append(pairs.count(pair)/len(pairs))
    
    print(pairs, p_pairs)
    
#     print('pairs: ', pairs, 'probabilities of pairs: ', p_pairs)
        
    p_cond = calculate_conditional_probs(pairs,terms)
#     print('terms: ', terms, 'probabilities of terms: blank', 'conditional_probabilities: ', p_cond)
    
    entropy = 0
    
    for p in p_cond:
        index = p_cond.index(p)
        entropy += p_pairs[index]*math.log2(p)
        print('p(a,b) = ', p_pairs[index], 'p(a|b) = ', p)
            
    return -entropy
    

These two values can then be used to calculate mutual information. Because of our 'top-down' approach, this will be equal to the entropy of `l`'s Generation N minus the conditional entropy of `l`.

In [74]:
def calculate_mi(pairs,terms1,terms2):
    # probs = calculate_probs(terms1)
    probs = calculate_probs(terms2) # for testing
    print('terms: ', terms2, 'probability distribution of GN terms: ', probs)
    entropy = calculate_entropy(probs)
    # conditional_entropy = calculate_cond_entropy(pairs,terms2)
    conditional_entropy = calculate_cond_entropy(pairs,terms1) # for testing
    print('entropy of GN1 = ', entropy, 'conditional entropy of system = ', conditional_entropy)
    return entropy - conditional_entropy

In [20]:
l_mi = calculate_mi(l_pairs,l_GN,l_GN1)

print('mi = ', l_mi)

terms:  ['mwana', 'mwana'] probability distribution of GN terms:  [1.0]
pair:  ('kètshi', 'mwana') p( mwana | kètshi ) =  1.0
entropy of GN =  -0.0 conditional entropy of system =  -0.0
mi =  0.0


## Getting neat and tidy

Now let's write a function that wraps up aaaaall of the above neatly.

In [21]:
def calculate_ics(ks):
    pairs = get_pairs(ks,kt.ics_pairs)
    GN,GN1 = split_generations(pairs)
    mi = calculate_mi(pairs,GN,GN1)
    return mi

In [22]:
l_ics = calculate_ics(l)

print(l_ics)

terms:  ['mwana', 'mwana'] probability distribution of GN terms:  [1.0]
pair:  ('kètshi', 'mwana') p( mwana | kètshi ) =  1.0
entropy of GN =  -0.0 conditional entropy of system =  -0.0
0.0


## Testing, testing

And now to test until we brute force our way to working code!

In [73]:
random.seed(5)
file = random_language(all_kb_files)
filepath = '../languages/kinbank/'

l = get_kin_terms(filepath + file)

l_ics = calculate_ics(l)

print(l_ics)


terms:  ['ngughu balagha', 'ngughu balagha', 'tara', 'tara', 'tara', 'tara', 'tata okasi', 'tata okasi', 'tata okasi', 'tata okasi'] probability distribution of GN terms:  [0.4, 0.2, 0.4]
pair:  ('tara', 'okulu') p( okulu | tara ) =  0.5
pair:  ('tara', 'nkéri') p( nkéri | tara ) =  0.5
pair:  ('tata okasi', 'okulu') p( okulu | tata okasi ) =  0.5
pair:  ('tata okasi', 'nkéri') p( nkéri | tata okasi ) =  0.5
pair:  ('ngughu balagha', 'mwana') p( mwana | ngughu balagha ) =  1.0
entropy of GN =  1.5219280948873621 conditional entropy of system =  0.8
0.7219280948873621


In [62]:
calculate_ics(l)

terms:  ['ngughu balagha', 'ngughu balagha', 'tara', 'tara', 'tara', 'tara', 'tata okasi', 'tata okasi', 'tata okasi', 'tata okasi'] probability distribution of GN terms:  [0.2, 0.4, 0.4]
pair:  ('tara', 'okulu') p( tara | okulu ) =  0.5
pair:  ('tara', 'nkéri') p( tara | nkéri ) =  0.5
pair:  ('tata okasi', 'okulu') p( tata okasi | okulu ) =  0.5
pair:  ('tata okasi', 'nkéri') p( tata okasi | nkéri ) =  0.5
pair:  ('ngughu balagha', 'mwana') p( ngughu balagha | mwana ) =  1.0
entropy of GN1 =  1.5219280948873621 conditional entropy of system =  0.8


0.7219280948873621

In [35]:

fakelish = {
    'mMeB': 'aunt',
    'mMeZ': 'aunt',
    'mFeB': 'uncle',
    'mFeZ': 'aunt',
    'mMeBS': 'cousin',
    'mMeZS': 'cousin',
    'mFeBS': 'cousin',
    'mFeZS': 'cousin',
    'mMeBD': 'cousines',
    'mMeZD': 'cousin',
    'mFeBD': 'cousin',
    'mFeZD': 'cousin',
    'mMyB': 'aunt',
    'mMyZ': 'aunt',
    'mFyB': 'uncle',
    'mFyZ': 'aunt',
    'mMyBS': 'cousin',
    'mMyZS': 'cousin',
    'mFyBS': 'cousin',
    'mFyZS': 'cousin',
    'mMyBD': 'cousines',
    'mMyZD': 'cousin',
    'mFyBD': 'cousin',
    'mFyZD': 'cousin'
}

english = get_kin_terms(filepath + 'English_stan1293.csv')
hindi = get_kin_terms(filepath + 'Hindi_hind1269.csv')
aguaruna = get_kin_terms(filepath + 'Aguaruna_agua1253.csv')

calculate_ics(hindi)

terms:  ['mamera bhai', 'mameri bahan', 'mamera bhai', 'mameri bahan', 'cacera bhai', 'caceri bahan', 'cacera bhai', 'caceri bahan', 'mɔsera bhai', 'mɔseri bahan', 'mɔsera bhai', 'mɔseri bahan', 'phuphera bhai', 'phupheri bahan', 'phuphera bhai', 'phupheri bahan', 'mamera bhai', 'mameri bahan', 'mamera bhai', 'mameri bahan', 'cacera bhai', 'caceri bahan', 'cacera bhai', 'caceri bahan', 'mɔsera bhai', 'mɔseri bahan', 'mɔsera bhai', 'mɔseri bahan', 'phuphera bhai', 'phupheri bahan', 'phuphera bhai', 'phupheri bahan'] probability distribution of GN terms:  [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
pair:  ('māmā', 'mameri bahan') p( mameri bahan | māmā ) =  0.5
pair:  ('cācā', 'cacera bhai') p( cacera bhai | cācā ) =  0.5
pair:  ('tāū', 'cacera bhai') p( cacera bhai | tāū ) =  0.5
pair:  ('mausī', 'mɔsera bhai') p( mɔsera bhai | mausī ) =  0.5
pair:  ('buā', 'phuphera bhai') p( phuphera bhai | buā ) =  0.5
pair:  ('māmā', 'mamera bhai') p( mamera bhai | māmā ) =  0.5
pair:  

1.75

In [63]:
calculate_ics(hindi)

terms:  ['māmā', 'māmā', 'māmā', 'māmā', 'tāū', 'tāū', 'cācā', 'cācā', 'mausī', 'mausī', 'mausī', 'mausī', 'buā', 'buā', 'buā', 'buā', 'māmā', 'māmā', 'māmā', 'māmā', 'tāū', 'tāū', 'cācā', 'cācā', 'mausī', 'mausī', 'mausī', 'mausī', 'buā', 'buā', 'buā', 'buā'] probability distribution of GN terms:  [0.25, 0.125, 0.125, 0.25, 0.25]
pair:  ('māmā', 'mameri bahan') p( māmā | mameri bahan ) =  1.0
pair:  ('cācā', 'cacera bhai') p( cācā | cacera bhai ) =  0.5
pair:  ('tāū', 'cacera bhai') p( tāū | cacera bhai ) =  0.5
pair:  ('mausī', 'mɔsera bhai') p( mausī | mɔsera bhai ) =  1.0
pair:  ('buā', 'phuphera bhai') p( buā | phuphera bhai ) =  1.0
pair:  ('māmā', 'mamera bhai') p( māmā | mamera bhai ) =  1.0
pair:  ('cācā', 'caceri bahan') p( cācā | caceri bahan ) =  0.5
pair:  ('buā', 'phupheri bahan') p( buā | phupheri bahan ) =  1.0
pair:  ('mausī', 'mɔseri bahan') p( mausī | mɔseri bahan ) =  1.0
pair:  ('tāū', 'caceri bahan') p( tāū | caceri bahan ) =  0.5
entropy of GN1 =  2.25 conditiona

2.0

In [25]:
calculate_ics(english)

terms:  ['cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin', 'cousin'] probability distribution of GN terms:  [1.0]
pair:  ('aunt', 'cousin') p( cousin | aunt ) =  1.0
pair:  ('uncle', 'cousin') p( cousin | uncle ) =  1.0
entropy of GN =  -0.0 conditional entropy of system =  -0.0


0.0

In [64]:
calculate_ics(english)

terms:  ['uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'uncle', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt', 'aunt'] probability distribution of GN terms:  [0.5, 0.5]
pair:  ('aunt', 'cousin') p( aunt | cousin ) =  0.5
pair:  ('uncle', 'cousin') p( uncle | cousin ) =  0.5
entropy of GN1 =  1.0 conditional entropy of system =  1.0


0.0

In [26]:
calculate_ics(aguaruna)

terms:  ["i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t", "i'ka 'yac̷u-t"] probability distribution of GN terms:  [1.0]
pair:  ('yac̷u-t', "i'ka 'yac̷u-t") p( i'ka 'yac̷u-t | yac̷u-t ) =  1.0
pair:  ("apa-'hu yači", "i'ka 'yac̷u-t") p( i'ka 'yac̷u-t

0.0

In [65]:
calculate_ics(aguaruna)

terms:  ['yac̷u-t', 'yac̷u-t', 'yac̷u-t', 'yac̷u-t', "apa-'hu yači", "apa-'hu yači", "apa-'hu yači", "apa-'hu yači", 'uma-yui', 'uma-yui', 'uma-yui', 'uma-yui', "apa-'hu uma-yi", "apa-'hu uma-yi", "apa-'hu uma-yi", "apa-'hu uma-yi", 'yac̷u-t', 'yac̷u-t', 'yac̷u-t', 'yac̷u-t', "apa-'hu yači", "apa-'hu yači", "apa-'hu yači", "apa-'hu yači", 'uma-yui', 'uma-yui', 'uma-yui', 'uma-yui', "apa-'hu uma-yi", "apa-'hu uma-yi", "apa-'hu uma-yi", "apa-'hu uma-yi"] probability distribution of GN terms:  [0.25, 0.25, 0.25, 0.25]
pair:  ('yac̷u-t', "i'ka 'yac̷u-t") p( yac̷u-t | i'ka 'yac̷u-t ) =  0.25
pair:  ("apa-'hu yači", "i'ka 'yac̷u-t") p( apa-'hu yači | i'ka 'yac̷u-t ) =  0.25
pair:  ('uma-yui', "i'ka 'yac̷u-t") p( uma-yui | i'ka 'yac̷u-t ) =  0.25
pair:  ("apa-'hu uma-yi", "i'ka 'yac̷u-t") p( apa-'hu uma-yi | i'ka 'yac̷u-t ) =  0.25
entropy of GN1 =  2.0 conditional entropy of system =  2.0


0.0

~~Ok. We have something that works ish. But mutual information should always be positive, so something is wrong with the way we are calculating probabilities. Maybe I've misunderstood what the entropy of a generation is? Here I've just fed in the probability distribution of terms in generation (e.g. for English there is 1 term with 1.0 probability, hence `[1]` gets fed to the `calculate_entropy` function. Equally maybe you can't calculate conditional entropy with a list of conditional probabilities?~~

Solved! I was calculating conditional probability wrong. For each pair it needed to be p(pair) * log2 p(pair[0] | pair[1])

In [27]:
random.seed(11)

file = random_language(all_kb_files)
filepath = '../languages/kinbank/'

a = get_kin_terms(filepath + file)

calculate_ics(a)

terms:  ['tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tua', 'finfiⁿdi', 'finfiⁿdi', 'tua', 'finfiⁿdi', 'finfiⁿdi', 'tua', 'finfiⁿdi', 'finfiⁿdi', 'tua', 'finfiⁿdi', 'finfiⁿdi', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'finfiⁿdi', 'tua', 'tua', 'finfiⁿdi', 'tua', 'tua', 'finfiⁿdi', 'tua', 'tua', 'finfiⁿdi', 'tua', 'tua', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum', 'tuatua tavalaxum'] probability distribution of GN terms:  [0.25, 0.5, 0.25]
pair:  ('taraᵐbe', 'tuatua tavalaxum') p( tuatua tavalaxum | taraᵐbe ) =  1.0
pair:  ('tata', 'finfiⁿdi') p( finfiⁿdi | tata ) =  0.5
pair:  ('raβe', 'tuatua tavalaxum') p( tuatua tavalaxum | raβe ) =  1.0
pair:  ('ᵐbotai', 'f

1.0

In [66]:
calculate_ics(a)

terms:  ['taraᵐbe', 'taraᵐbe', 'taraᵐbe', 'taraᵐbe', 'tata', 'tata', 'tata', 'tata', 'ᵐbotai', 'ᵐbotai', 'ᵐbotai', 'ᵐbotai', 'raβe', 'raβe', 'raβe', 'raβe', 'taraᵐbe', 'taraᵐbe', 'taraᵐbe', 'taraᵐbe', 'tata', 'tata', 'tata', 'tata', 'ᵐbotai', 'ᵐbotai', 'ᵐbotai', 'ᵐbotai', 'raβe', 'raβe', 'raβe', 'raβe'] probability distribution of GN terms:  [0.25, 0.25, 0.25, 0.25]
pair:  ('taraᵐbe', 'tuatua tavalaxum') p( taraᵐbe | tuatua tavalaxum ) =  0.5
pair:  ('tata', 'finfiⁿdi') p( tata | finfiⁿdi ) =  0.5
pair:  ('raβe', 'tuatua tavalaxum') p( raβe | tuatua tavalaxum ) =  0.5
pair:  ('ᵐbotai', 'finfiⁿdi') p( ᵐbotai | finfiⁿdi ) =  0.5
pair:  ('ᵐbotai', 'tua') p( ᵐbotai | tua ) =  0.5
pair:  ('tata', 'tua') p( tata | tua ) =  0.5
entropy of GN1 =  2.0 conditional entropy of system =  1.5


0.5

### What score do we get for a language that co-selects perfectly?

In [67]:
ics_lang = {
    'mMeB': 'aaa',
    'mMeZ': 'bbb',
    'mFeB': 'ccc',
    'mFeZ': 'ddd',
    'mMeBS': 'aaas',
    'mMeZS': 'bbbs',
    'mFeBS': 'cccs',
    'mFeZS': 'ddds',
    'mMeBD': 'aaad',
    'mMeZD': 'bbbd',
    'mFeBD': 'cccd',
    'mFeZD': 'dddd',
    'mMyB': 'eee',
    'mMyZ': 'fff',
    'mFyB': 'ggg',
    'mFyZ': 'hhh',
    'mMyBS': 'eees',
    'mMyZS': 'fffs',
    'mFyBS': 'gggs',
    'mFyZS': 'hhhs',
    'mMyBD': 'eeed',
    'mMyZD': 'fffd',
    'mFyBD': 'gggd',
    'mFyZD': 'hhhd'
}

# get_pairs(ics_lang,kt.ics_pairs)

calculate_ics(ics_lang)

terms:  ['aaa', 'aaa', 'eee', 'eee', 'ccc', 'ccc', 'ggg', 'ggg', 'bbb', 'bbb', 'fff', 'fff', 'ddd', 'ddd', 'hhh', 'hhh'] probability distribution of GN terms:  [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
pair:  ('eee', 'eees') p( eee | eees ) =  1.0
pair:  ('bbb', 'bbbs') p( bbb | bbbs ) =  1.0
pair:  ('fff', 'fffs') p( fff | fffs ) =  1.0
pair:  ('aaa', 'aaad') p( aaa | aaad ) =  1.0
pair:  ('ddd', 'dddd') p( ddd | dddd ) =  1.0
pair:  ('hhh', 'hhhs') p( hhh | hhhs ) =  1.0
pair:  ('hhh', 'hhhd') p( hhh | hhhd ) =  1.0
pair:  ('ccc', 'cccs') p( ccc | cccs ) =  1.0
pair:  ('aaa', 'aaas') p( aaa | aaas ) =  1.0
pair:  ('bbb', 'bbbd') p( bbb | bbbd ) =  1.0
pair:  ('ggg', 'gggs') p( ggg | gggs ) =  1.0
pair:  ('ggg', 'gggd') p( ggg | gggd ) =  1.0
pair:  ('ddd', 'ddds') p( ddd | ddds ) =  1.0
pair:  ('eee', 'eeed') p( eee | eeed ) =  1.0
pair:  ('ccc', 'cccd') p( ccc | cccd ) =  1.0
pair:  ('fff', 'fffd') p( fff | fffd ) =  1.0
entropy of GN1 =  3.0 conditional entropy of sy

3.0

Does that change if the son and daughter terms are the same?

In [68]:
ics_lang_2 = {
    'mMeB': 'aaa',
    'mMeZ': 'bbb',
    'mFeB': 'ccc',
    'mFeZ': 'ddd',
    'mMeBS': 'aaac',
    'mMeZS': 'bbbc',
    'mFeBS': 'cccc',
    'mFeZS': 'dddc',
    'mMeBD': 'aaac',
    'mMeZD': 'bbbc',
    'mFeBD': 'cccc',
    'mFeZD': 'dddc',
    'mMyB': 'eee',
    'mMyZ': 'fff',
    'mFyB': 'ggg',
    'mFyZ': 'hhh',
    'mMyBS': 'eeec',
    'mMyZS': 'fffc',
    'mFyBS': 'gggc',
    'mFyZS': 'hhhc',
    'mMyBD': 'eeec',
    'mMyZD': 'fffc',
    'mFyBD': 'gggc',
    'mFyZD': 'hhhc'
}

calculate_ics(ics_lang_2)

terms:  ['aaa', 'aaa', 'eee', 'eee', 'ccc', 'ccc', 'ggg', 'ggg', 'bbb', 'bbb', 'fff', 'fff', 'ddd', 'ddd', 'hhh', 'hhh'] probability distribution of GN terms:  [0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125]
pair:  ('bbb', 'bbbc') p( bbb | bbbc ) =  1.0
pair:  ('hhh', 'hhhc') p( hhh | hhhc ) =  1.0
pair:  ('ccc', 'cccc') p( ccc | cccc ) =  1.0
pair:  ('ddd', 'dddc') p( ddd | dddc ) =  1.0
pair:  ('aaa', 'aaac') p( aaa | aaac ) =  1.0
pair:  ('fff', 'fffc') p( fff | fffc ) =  1.0
pair:  ('ggg', 'gggc') p( ggg | gggc ) =  1.0
pair:  ('eee', 'eeec') p( eee | eeec ) =  1.0
entropy of GN1 =  3.0 conditional entropy of system =  -0.0


3.0

And what about a non co-selecting language?

In [152]:
bad_ics_lang = {
    'mMeB': 'aaa',
    'mMeZ': 'bbb',
    'mFeB': 'ddd',
    'mFeZ': 'ddd',
    'mMeBS': 'eed',
    'mMeZS': 'aaas',
    'mFeBS': 'aaas',
    'mFeZS': 'ddds',
    'mMeBD': 'aaad',
    'mMeZD': 'aaad',
    'mFeBD': 'dddd',
    'mFeZD': 'aaad',
    'mMyB': 'eee',
    'mMyZ': 'fff',
    'mFyB': 'ddd',
    'mFyZ': 'hhh',
    'mMyBS': 'aaas',
    'mMyZS': 'eee',
    'mFyBS': 'aaas',
    'mFyZS': 'gggs',
    'mMyBD': 'eeed',
    'mMyZD': 'aaad',
    'mFyBD': 'aaad',
    'mFyZD': 'gggd'
}

calculate_ics(bad_ics_lang)

terms:  ['aaa', 'aaa', 'eee', 'eee', 'ddd', 'ddd', 'ddd', 'ddd', 'bbb', 'bbb', 'fff', 'fff', 'ddd', 'ddd', 'hhh', 'hhh'] probability distribution of GN terms:  [0.375, 0.125, 0.125, 0.125, 0.125, 0.125]
[('aaa', 'eed'), ('aaa', 'aaad'), ('eee', 'aaas'), ('eee', 'eeed'), ('ddd', 'aaas'), ('ddd', 'dddd'), ('ddd', 'aaas'), ('ddd', 'aaad'), ('bbb', 'aaas'), ('bbb', 'aaad'), ('fff', 'eee'), ('fff', 'aaad'), ('ddd', 'ddds'), ('ddd', 'aaad'), ('hhh', 'gggs'), ('hhh', 'gggd')] [0.0625, 0.125, 0.0625, 0.125, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625]
pair:  ('aaa', 'eed') p( aaa | eed ) =  1.0
pair:  ('ddd', 'aaas') p( ddd | aaas ) =  0.5
pair:  ('fff', 'eee') p( fff | eee ) =  1.0
pair:  ('ddd', 'aaad') p( ddd | aaad ) =  0.4
pair:  ('bbb', 'aaas') p( bbb | aaas ) =  0.25
pair:  ('bbb', 'aaad') p( bbb | aaad ) =  0.2
pair:  ('aaa', 'aaad') p( aaa | aaad ) =  0.2
pair:  ('ddd', 'ddds') p( ddd | ddds ) =  1.0
pair:  ('hhh', 'gggd') p( hhh | gggd ) =  1.0
pair

1.4300365325772657

It's not super intuitive to me that the "worst" languages are those with entropy 0 in one of the generations. A system like English is at least consistent - if the motivation for ICS is that it increases simplicity, then this seems entirely opposite to what we want!

## What about randomly generated languages?

Let's write a function to randomly generate a language so we can get a feel for what good / bad mutual information scores look like.

In [89]:
def generate_language(kin_types):
    language = {}
    words = []
    letters = string.ascii_lowercase
    
    for i in range(len(kin_types)):
        word = ''.join(random.choice(letters) for i in range(4))
        words.append(word)
        
    print(words)
        
    for kt in kin_types:
        term = random.choice(words)
        language[kt] = term
        
    return language

In [104]:
random.seed(1)
x = generate_language(ics_lang.keys())

get_pairs(x,kt.ics_pairs)
      
calculate_ics(x)

['eszy', 'cidp', 'yopu', 'mzgd', 'pamn', 'tyya', 'woix', 'zhsd', 'kaaa', 'uram', 'vgnx', 'aqhy', 'oprh', 'lhvh', 'yoja', 'nrud', 'fuxj', 'dxkx', 'wqnq', 'vgjj', 'spqm', 'sbph', 'xzmn', 'vflr']
terms:  ['xzmn', 'xzmn', 'aqhy', 'aqhy', 'vflr', 'vflr', 'vflr', 'vflr', 'sbph', 'sbph', 'nrud', 'nrud', 'aqhy', 'aqhy', 'eszy', 'eszy'] probability distribution of GN terms:  [0.125, 0.25, 0.25, 0.125, 0.125, 0.125]
pair:  ('xzmn', 'mzgd') p( xzmn | mzgd ) =  1.0
pair:  ('sbph', 'yoja') p( sbph | yoja ) =  1.0
pair:  ('aqhy', 'fuxj') p( aqhy | fuxj ) =  0.5
pair:  ('aqhy', 'oprh') p( aqhy | oprh ) =  0.5
pair:  ('vflr', 'fuxj') p( vflr | fuxj ) =  0.5
pair:  ('nrud', 'cidp') p( nrud | cidp ) =  1.0
pair:  ('xzmn', 'yopu') p( xzmn | yopu ) =  1.0
pair:  ('eszy', 'oprh') p( eszy | oprh ) =  0.5
pair:  ('aqhy', 'vgjj') p( aqhy | vgjj ) =  1.0
pair:  ('sbph', 'tyya') p( sbph | tyya ) =  1.0
pair:  ('nrud', 'wqnq') p( nrud | wqnq ) =  0.5
pair:  ('eszy', 'xzmn') p( eszy | xzmn ) =  1.0
pair:  ('vflr'

2.125

In [94]:
mini_ics = {
    'mMeB': 'aaa',
    'mMeZ': 'bbb',
    'mFeB': 'ccc',
    'mFeZ': 'ddd',
    'mMeBS': 'aaac',
    'mMeZS': 'bbbc',
    'mFeBS': 'cccc',
    'mFeZS': 'dddc',
    'mMeBD': 'aaac',
    'mMeZD': 'bbbc',
    'mFeBD': 'cccc',
    'mFeZD': 'dddc'
}

calculate_ics(mini_ics)

terms:  ['aaa', 'aaa', 'ccc', 'ccc', 'bbb', 'bbb', 'ddd', 'ddd'] probability distribution of GN terms:  [0.25, 0.25, 0.25, 0.25]
pair:  ('aaa', 'aaac') p( aaa | aaac ) =  1.0
pair:  ('ccc', 'cccc') p( ccc | cccc ) =  1.0
pair:  ('ddd', 'dddc') p( ddd | dddc ) =  1.0
pair:  ('bbb', 'bbbc') p( bbb | bbbc ) =  1.0
entropy of GN1 =  2.0 conditional entropy of system =  -0.0


2.0

In [153]:
mini_bad_ics = {
    'mMeB': 'aaa',
    'mMeZ': 'aaa',
    'mFeB': 'aaa',
    'mFeZ': 'aaa',
    'mMeBS': 'aaac',
    'mMeZS': 'aaac',
    'mFeBS': 'aaac',
    'mFeZS': 'bbbc',
    'mMeBD': 'aaac',
    'mMeZD': 'aaac',
    'mFeBD': 'aaac',
    'mFeZD': 'bbbc'
}

calculate_ics(mini_bad_ics)

terms:  ['aaa', 'aaa', 'aaa', 'aaa', 'aaa', 'aaa', 'aaa', 'aaa'] probability distribution of GN terms:  [1.0]
[('aaa', 'aaac'), ('aaa', 'aaac'), ('aaa', 'aaac'), ('aaa', 'aaac'), ('aaa', 'aaac'), ('aaa', 'aaac'), ('aaa', 'bbbc'), ('aaa', 'bbbc')] [0.75, 0.25]
pair:  ('aaa', 'aaac') p( aaa | aaac ) =  1.0
pair:  ('aaa', 'bbbc') p( aaa | bbbc ) =  1.0
p(a,b) =  0.75 p(a|b) =  1.0
p(a,b) =  0.75 p(a|b) =  1.0
entropy of GN1 =  -0.0 conditional entropy of system =  -0.0


0.0

In [161]:
mini_bad_ics_2 = {
    'mMeB': 'aaa',
    'mMeZ': 'bbb',
    'mFeB': 'ccc',
    'mFeZ': 'ddd',
    'mMeBS': 'aaac',
    'mMeZS': 'bbbc',
    'mFeBS': 'aaac',
    'mFeZS': 'bbbc',
    'mMeBD': 'aaac',
    'mMeZD': 'bbbc',
    'mFeBD': 'aaac',
    'mFeZD': 'bbbc'
}

calculate_ics(mini_bad_ics_2)

terms:  ['aaa', 'aaa', 'ccc', 'ccc', 'bbb', 'bbb', 'ddd', 'ddd'] probability distribution of GN terms:  [0.25, 0.25, 0.25, 0.25]
[('aaa', 'aaac'), ('aaa', 'aaac'), ('ccc', 'aaac'), ('ccc', 'aaac'), ('bbb', 'bbbc'), ('bbb', 'bbbc'), ('ddd', 'bbbc'), ('ddd', 'bbbc')] [0.25, 0.25, 0.25, 0.25]
pair:  ('ddd', 'bbbc') p( ddd | bbbc ) =  0.5
pair:  ('aaa', 'aaac') p( aaa | aaac ) =  0.5
pair:  ('ccc', 'aaac') p( ccc | aaac ) =  0.5
pair:  ('bbb', 'bbbc') p( bbb | bbbc ) =  0.5
p(a,b) =  0.25 p(a|b) =  0.5
p(a,b) =  0.25 p(a|b) =  0.5
p(a,b) =  0.25 p(a|b) =  0.5
p(a,b) =  0.25 p(a|b) =  0.5
entropy of GN1 =  2.0 conditional entropy of system =  1.0


1.0

In [140]:
length = ['mMeB','mMeZ','mFeB','mFeZ','mMeBS','mMeZS','mFeBS','mFeZS']

r_l = generate_language(length)

r_p = get_pairs(r_l,kt.ics_pairs)

calculate_ics(r_l)

['ings', 'xyzb', 'pvmw', 'ulmq', 'frxb', 'qczi', 'udix', 'ceyt']
terms:  ['xyzb', 'ulmq', 'ceyt', 'udix'] probability distribution of GN terms:  [0.25, 0.25, 0.25, 0.25]
pair:  ('xyzb', 'udix') p( xyzb | udix ) =  0.5
pair:  ('udix', 'qczi') p( udix | qczi ) =  1.0
pair:  ('ceyt', 'udix') p( ceyt | udix ) =  0.5
pair:  ('ulmq', 'pvmw') p( ulmq | pvmw ) =  1.0
entropy of GN1 =  2.0 conditional entropy of system =  0.5


1.5

-1.1887326958077753