In [1]:
import csv # for writing dataframes to csv
import random # for making a random choice
import os # for scanning directories
import itertools
import string # for generating strings
from collections import Counter

import kintypes as kt # bringing large lists of kin types into the namespace
import math # for calculating logs
import pandas as pd
import re

testing = True # set to True to run code blocks with tests and examples
filtering = False # set to True to run the filtering process

# Internal co-selection

Internal co-selection refers to the tendency for kinship systems to have cross-generational consistency in the terminological distinctions or mergers that are made. That is, if your parents' elder brothers share a kin term, then so too will their children. Or if your parents' sisters are distinguished from your parents' brothers, so too will their children be distinguished. 

Imagine a kinship system like so: as in English, you call your parents' brothers are  *uncles*, and their sisters *aunts*. You call the child of your uncle a *chuncle*, and the child of your aunt a *chaunt*. Thus, you make the same sorts of distinctions among your parents' siblings' generation of kin as are made among your own generation of kin - and you can be certain about which children belong to which parents as a result. This is an example of internal co-selection.

In this notebook, we will gather information about the robustness of this tendency cross-linguistically, using data from Kinbank, a global database of kin terminology. We will also create simulations of existing kinship systems to find out whether internal co-selection is more common in kinship systems cross-linguistically (for a given amount of terminological variation) than we would expect by chance.

We will measure internal co-selection in terms of the **mutual information** between Generation N and Generation N+1 in a particular kinship system. That tells us how much information can be gained from one generation by observing the other - how certain we can be about which children 'go with' which parents. This can be calculated as the **entropy** of one generation (how much unpredictable variation there is) minus the **conditional entropy** between the two generations (how much unpredictability remains in one generation after observing another).

## The procedure

To calculate the mutual information (MI) of a particular kinship system, we must perform the following steps:

1. Extract kin terminology data from Kinbank for this language.
2. Condense the full kinship system down to the terms we are interested in: Ego's generation and Ego's parents' generation.
3. Calculate the probabilities of each kin term within the generation in which it belongs; and the probabilities of each parent-child pair.
4. Calculate entropy, conditional entropy, subtract them from each other to get the mutual information of the system.

After we get that going, we can do these same calculations on simulated kinship systems.

### Extract kin terminology from Kinbank

First, let's actually load our data in. The following function `get_kb_files()` pulls the full list of Kinbank filenames. Later, we can iterate through these to generate MI values for every language in our dataset.

In [2]:
def get_kb_files(path) -> list:
    files = []
    directory = os.scandir(path)
    for file in directory:
        files.append(file.name)
    return files

all_kb_files = get_kb_files('../languages/kinbank')

Using one of these filenames, we can extract the kin terminology from that file and populate a dictionary with it. We're only interested in two columns from the Kinbank data: `parameter`, which contains a short code indicating a **kin type**, and `word`, which contains the **kin term** associated with that kin type. An example of a row in the English data would be `mMeB, uncle`, where `mMeB` means 'male speaker's mother's older brother', and `uncle` is the term associated with that person.

In [3]:
def get_kin_terms(filepath: str) -> dict:
    kin_system = {}
    with open(filepath, encoding='utf8') as f:
        csv_reader = csv.DictReader(f)
        next(csv_reader) # to skip the header row
        for line in csv_reader:
            kin_type = line['parameter']
            kin_term = line['word']
            kin_system[kin_type] = kin_term
    return kin_system

Let's pick a random kinship system to test with throughout this notebook.

In [4]:
if testing:
    
    random.seed(47) # set a seed for reproducibility

    file = random.choice(all_kb_files) # pick a random filename from all_kb_files

    filepath = '../languages/kinbank/' # the filepath where the kinbank files are kept

    k = get_kin_terms(filepath + file)

    print(file,k)

Nogai_noga1249.csv {'mG': 'karɨndas', 'mB': 'adanas', 'mZ': 'karɨndas', 'myB': 'ini', 'myZ': 'siŋli', 'mF': 'ata', 'mM': 'ana', 'mC': 'tuwgailar', 'mS': 'kede', 'mD': 'kɨz', 'mFF': 'atay', 'mFM': 'tetey', 'mMF': 'atay', 'mMM': 'tetey', 'mSS': 'yiyen', 'mSD': 'yiyеn kɨz', 'mDS': 'yiyen', 'mDD': 'yiyеn kɨz', 'mFB': 'akay', 'mFZ': 'abay', 'mMB': 'nagaš akay', 'mMZ': 'abay', 'mBS': 'yiyen', 'mBD': 'yiyеn kɨz', 'mZS': 'yiyen', 'mZD': 'yiyеn kɨz', 'mFZD': 'bölе', 'mFBD': 'bölе', 'mMBD': 'bölе', 'mMZD': 'bölе', 'mFBS': 'bölе', 'mFZS': 'bölе', 'mMBS': 'bölе', 'mMZS': 'bölе', 'mH': 'kiew', 'mW': 'pišе', 'mHF': 'kaynata', 'mHM': 'kaynana', 'mWF': 'kaynata', 'mWM': 'kaynana', 'mSW': 'kеšеk', 'mDH': 'kiеw', 'meB': 'adanas', 'meZ': 'karɨndas', 'mFeB': 'akay', 'mFyB': 'akay', 'mFeZ': 'abay', 'mFyZ': 'abay', 'mMeZ': 'abay', 'mMyZ': 'abay', 'mMeB': 'nagaš akay', 'mMyB': 'nagaš akay', 'meBS': 'yiyen', 'myBS': 'yiyen', 'meBD': 'yiyеn kɨz', 'myBD': 'yiyеn kɨz', 'meZS': 'yiyen', 'myZS': 'yiyen', 'meZD': '

Our random language is Nogai, a Turkic language spoken in Southeastern European Russia, Kazakhstan, Uzbekistan, Ukraine, Bulgaria, Romania and Turkey. But as we can see from printing the system, there's a lot of extra kin terms here that we don't need for our experiment today. We're only interested in Ego and Ego's parents' generations, but the system contains kin types like `mS` (male speaker's son) or `mDD` (make speaker's daughter's daughter). In the next section, we'll reduce `k` down to just the terms we're interested in.

### Condense the system down

The list of possible **kin types** is far larger and more unwieldy than the set of **kin terms** in any language. For instance, while 'father's elder brother' and 'father's younger brother' are not distinguished in English (both take the term *uncle*), these distinctions are indeed encoded by terminology in other languages, like Hindi.

In the Kinbank datasets, kin types are recorded even if they are not distinguished terminologically in a given language. Thus, the English data has an entry for father's elder and younger brothers, and for father's elder and younger brother's sons and daughters. As a result, our dictionary `k` will have many duplicate entries. To avoid overinflating the amount of variation in each language later on, we want to filter out these duplicate entries at this point.

Instead, in `kintypes.py`, another file in this directory, I have created some data structures which group together kin types - so father's brother, father's elder brother, and father's younger brother are grouped together. The following code takes these data structures, compares them against the full kinship system we extracted above, and in the case of a match within these groups, it discards any duplicates and takes only the first matching entry.

We do this separately for Ego's generation and Ego's parents' generation because we want the Ego generation entries to match their parents - that is, we only want to have mother's elder brother's son if we also have mother's elder brother in the system. We also check whether the terms are the same if the speaker is male or female - if so, we take the male speaker terms by default.

In [5]:
def filter_mf(filtered_ks):
    for x,y in zip(kt.m_speaker,kt.f_speaker):
        if x in filtered_ks and y in filtered_ks:
            if filtered_ks[x] == filtered_ks[y]:
                del(filtered_ks[y]) # leave male speaker as default
            else:
                pass
        else:
            pass

In [6]:
def filter_age(ks):
    filtered_ks = {}
    for group in kt.age_split:
        if group[1] in ks and group[2] in ks:
            if ks[group[1]] == ks[group[2]]:
                filtered_ks[group[1]] = ks[group[1]] # leave elder as default
            else:
                filtered_ks[group[1]] = ks[group[1]]
                filtered_ks[group[2]] = ks[group[2]]
        elif group[0] in ks:
            filtered_ks[group[0]] = ks[group[0]]
        else:
            pass
    
    return filtered_ks

Filtering all the kinship systems AND running the MI calculations proves to be quite taxing on memory, so instead of filtering in situ, we're going to save all the filtered kinship systems to csv files. We can then read the data from them using the `get_kin_terms` function. So we'll need some infrastructure for writing the data to a csv file:

In [98]:
def write_headers(file):
    with open(file, 'a') as csv_file:
        csvwriter = csv.writer(csv_file)
        csvwriter.writerow(['parameter','word'])
        
def save_to_csv(file,family,data):
    filename = '../languages/kinbank-filter/' + family + '/' + file
    with open(filename, 'a', encoding="utf8") as csv_file:
        csvwriter = csv.writer(csv_file)
        write_headers(filename)
        for kt in data:
            csvwriter.writerow([kt,data[kt]])

And now we can neatly package our `get_kin_terms` function, our filtering functions, and our writing to csv functions.

In [94]:
def filter_ks(family,file):
    ks = get_kin_terms('../languages/kinbank-family/' + family + '/' + file)        
    
    filtered_ks = filter_age(ks)
    
    filter_mf(filtered_ks)
    
    for entry in filtered_ks:
        if ',' in filtered_ks[entry]:
            filtered_ks[entry] = filtered_ks[entry].split(',')[0]

    if filtered_ks:
        save_to_csv(file,family,filtered_ks)
       
    return filtered_ks

In [10]:
if testing:
    k = filter_ks(file)
    print(k)

{'mM': 'ana', 'mF': 'ata', 'mMeB': 'nagaš akay', 'mFeB': 'akay', 'mMeZ': 'abay', 'mFeZ': 'abay', 'meB': 'adanas', 'myB': 'ini', 'meZ': 'karɨndas', 'myZ': 'siŋli', 'mMBeS': 'bölе', 'mMeBS': 'bölе', 'mMyBS': 'bölе', 'mFBeS': 'bölе', 'mFeBS': 'bölе', 'mFyBS': 'bölе', 'mMZeS': 'bölе', 'mMeZS': 'bölе', 'mMyZS': 'bölе', 'mFZeS': 'bölе', 'mFeZS': 'bölе', 'mFyZS': 'bölе', 'mMBeD': 'bölе', 'mMeBD': 'bölе', 'mMyBD': 'bölе', 'mFBeD': 'bölе', 'mFeBD': 'bölе', 'mFyBD': 'bölе', 'mMZeD': 'bölе', 'mMeZD': 'bölе', 'mMyZD': 'bölе', 'mFZeD': 'bölе', 'mFeZD': 'bölе', 'mFyZD': 'bölе'}


And loop that over every Kinbank file:

**NOTE: if you're reading this, then this process is already done! You'll find the files in `../languages/kinbank-filter/`**

In [None]:
if filtering:
    for file in all_kb_files:
        ks = filter_ks(file)
        print(file,ks)

Now we have our kinship system filtered for relevant terms and duplicates, we also want to create a data structure that pairs up these terms for us. In `kintypes`, you will also find **a list of pairs of kin types**, where the first element in the pair is a parent type, and the second is their child; e.g. `mMeB` and `mMeBD` (mother's elder brother and mother's elder brother's daughter). We will be filtering our full kinship system `k` according to this list of pairs. This is because we're interested in whether kinship systems maintain patterns of terminological distinctions and mergers across these two generations, we will need to know which parent terms 'go with' which child terms.

The following function takes a kinship system as input, and outputs a list of tuples. The first element in the tuple is the parent term, the second is the corresponding child term. 

We also perform a check on the number of different kin types from Ego's parents' generation included across all the pairs in the kinship system. As has been established, the Kinbank data is very broad but not 100% complete for every language recorded. So, if the only pairs we are able to extract from the data include Ego's mother and father **and no other member of Ego's parents' generation** we will not continue to analyse this language, as this is not sufficient evidence to identify internal co-selection.

In [69]:
def get_pairs(ks: dict) -> list:
    pairs_of_terms = []
    parent_types = []

    for pair in kt.ics_pairs:
        if pair[0] in ks and pair[1] in ks:
            pairs_of_terms.append((ks[pair[0]],ks[pair[1]]))
            parent_types.append(pair[0])
                
    if len(set(parent_types)) > 2 : 
        return pairs_of_terms


But for our calculations, we'll still need to know which terms belong to which generation. Luckily, we know that the 0th element in each tuple is from Ego's parents' generation and the 1st element is from Ego's generation. So we can happily split these tuples down the middle and populate two lists with the terms.

In [56]:
def split_pairs(pairs: list) -> list:
    gn = []
    gn1 = []
    for pair in pairs:
        gn.append(pair[1])
        gn1.append(pair[0])
    
    return gn,gn1

To illustrate what these functions do, let's test them out with our random kinship system, `k`.

In [57]:
if testing:
    print(k)
    k_pairs = get_pairs(k)
    print(k_pairs)

{'mM': 'ana', 'mF': 'ata', 'mMeB': 'nagaš akay', 'mFeB': 'akay', 'mMeZ': 'abay', 'mFeZ': 'abay', 'meB': 'adanas', 'myB': 'ini', 'meZ': 'karɨndas', 'myZ': 'siŋli', 'mMBeS': 'bölе', 'mMeBS': 'bölе', 'mMyBS': 'bölе', 'mFBeS': 'bölе', 'mFeBS': 'bölе', 'mFyBS': 'bölе', 'mMZeS': 'bölе', 'mMeZS': 'bölе', 'mMyZS': 'bölе', 'mFZeS': 'bölе', 'mFeZS': 'bölе', 'mFyZS': 'bölе', 'mMBeD': 'bölе', 'mMeBD': 'bölе', 'mMyBD': 'bölе', 'mFBeD': 'bölе', 'mFeBD': 'bölе', 'mFyBD': 'bölе', 'mMZeD': 'bölе', 'mMeZD': 'bölе', 'mMyZD': 'bölе', 'mFZeD': 'bölе', 'mFeZD': 'bölе', 'mFyZD': 'bölе'}
['mM', 'mM', 'mM', 'mM', 'mF', 'mF', 'mF', 'mF', 'mMeB', 'mMeB', 'mFeB', 'mFeB', 'mMeZ', 'mMeZ', 'mFeZ', 'mFeZ']
[('ana', 'adanas'), ('ana', 'ini'), ('ana', 'karɨndas'), ('ana', 'siŋli'), ('ata', 'adanas'), ('ata', 'ini'), ('ata', 'karɨndas'), ('ata', 'siŋli'), ('nagaš akay', 'bölе'), ('nagaš akay', 'bölе'), ('akay', 'bölе'), ('akay', 'bölе'), ('abay', 'bölе'), ('abay', 'bölе'), ('abay', 'bölе'), ('abay', 'bölе')]


In [60]:
if testing:
    test = {'mM': 'blabla',
        'mF': 'blabla',
        'mB': 'beepbeep',
        'mZ': 'boopboop'}
    test_pairs = get_pairs(test)
    print(test_pairs)

['mM', 'mM', 'mF', 'mF']
None


In [18]:
if testing:
    
    k_gn,k_gn1 = split_pairs(k_pairs)

    print("Ego's generation: ", k_gn, '\n')

    print("Ego's parents' generation: ", k_gn1)

Ego's generation:  ['adanas', 'ini', 'karɨndas', 'siŋli', 'adanas', 'ini', 'karɨndas', 'siŋli', 'bölе', 'bölе', 'bölе', 'bölе', 'bölе', 'bölе', 'bölе', 'bölе'] 

Ego's parents' generation:  ['ana', 'ana', 'ana', 'ana', 'ata', 'ata', 'ata', 'ata', 'nagaš akay', 'nagaš akay', 'akay', 'akay', 'abay', 'abay', 'abay', 'abay']


`get_pairs()` gives us a long list of pairs, with lots of repetition. This is because some kin types will correspond to the same kin term in a language. For example, in English, mother's elder brother, mother's younger brother, father's elder brother and father's younger brother all correspond to the term 'uncle'. All of those people's children correspond to the word 'cousin'! So we'll end up with duplicate entries for these terms in some languages, but not all.

`split_pairs()` takes this long list of pairs and sorts it into terms that belong to Ego's generation and terms that belong to Ego's parents' generation. Importantly, since the order of the `pairs` list is preserved when we run `split_generations()`, we can still work out which terms form a parent-child pair by indexing `gn` and `gn1`.

Now we have our data structures, we can start to do some calculations.

### Calculating probabilities

To calculate entropy, we need a probability distribution over the terms in one single generation of a kinship system. So let's start with a function that can calculate the probability of a particular term.

Given a term and the full list of terms in the same generation, this function counts how many times that term exists in `generation` and divides that by the total length of `generation`.

In [19]:
def probability(term: str, generation: list) -> float:
    #print(generation.count(term),len(generation))
    return generation.count(term)/len(generation)

So if we pick a term at random from our Nogai kinship system, it will output the probability of picking that term.

In [20]:
if testing:
    k_term = random.choice(k_gn1)
    print(k_term, probability(k_term,k_gn1))

ana 0.25


When calculating mutual information, we also need the **conditional entropy** of our system. To calculate this, we will need not only the probabilities of terms in a generation, but also the **joint probabilities** of every pair of terms across those two generations. In other words, we need to calculate the probabilities of our `pairs` output by `get_pairs`.

Given two terms, this function counts how many pairs made of those two terms exist in `pairs`, then divides that by the total length of `pairs`.

In [21]:
def joint_probability(term1: str, term2: str, pairs: list) -> float:
    pair = (term1,term2)
    return pairs.count(pair)/len(pairs)

Once again, we can test this with a random pair from our Nogai list of pairs:

In [22]:
if testing:
    sum_jp = []
    for pair in set(k_pairs):
        jp = joint_probability(pair[0],pair[1],k_pairs)
        sum_jp.append(jp)
        print(pair, jp)
    print(sum(sum_jp))

('ata', 'ini') 0.0625
('ata', 'siŋli') 0.0625
('ana', 'karɨndas') 0.0625
('ana', 'adanas') 0.0625
('ana', 'ini') 0.0625
('ana', 'siŋli') 0.0625
('nagaš akay', 'bölе') 0.125
('akay', 'bölе') 0.125
('ata', 'karɨndas') 0.0625
('abay', 'bölе') 0.25
('ata', 'adanas') 0.0625
1.0


Now we can calculate probabilities, we can use these functions to calculate entropy, conditional entropy, and mutual information.

### Calculating entropy and mutual information

Entropy is defined as 

**INSERT EQUATION HERE LOL**

or, in English, it is the inverse sum over a distribution X of the probability of y * the log probability of y.

Entropy is a measure of the average level of uncertainty about the possible outcomes of a variable.

The functions we defined above only calculate a single probability at a time, so our next functions will need to iterate over the kinship system in order to have a full probability distribution. First, let's define a function that will iterate over a generation of the kinship system to output the entropy of that generation. 

Note: we only need one generation's entropy score to calculate mutual information - we will make the arbitrary choice to calculate the entropy of Ego's parents' generation later in this notebook.

In [23]:
def entropy(generation: list) -> list:
    entropy = 0
    for term in set(generation): # using a set as we want to count each unique term only once
        p = probability(term,generation)
        #print('entropy of',term,p*math.log(p))
        entropy += p*math.log2(p)
    return -entropy

In [24]:
if testing:
    print(entropy(k_gn1))

2.25


Moving on, conditional entropy is defined as

**ANOTHER EQUATION PLEASE**

or in English, the inverse sum over two distributions Y and X of the probability of each y * the log probability of each y given x.

Conditional entropy is the amount of information needed to describe the outcome of a random variable Y given that we already know the value of another random variable X.

To calculate it, we need the joint probability of each pair (given by `joint_probability()`) and the probability of one member of that pair (given by `probability()`). We can then calculate the conditional probability of parent term given child term as the joint probability of those terms over the probability of the parent term.

As before, we will define a function that iterates over all pairs to output the conditional entropy of Ego's generation given Ego's parents' generation.

In [25]:
def conditional_entropy(gn: list, pairs:list) -> float:
    entropy = 0
    for x,y in set(pairs): # x = parent, y = child
        p_xy = joint_probability(x,y,pairs)
        p_y = probability(y,gn)
        if p_xy > 0 and p_y > 0:
            #print('p(', x, '|', y,') = ', p_xy/p_y, 'p(y) = ', p_y)
            entropy += p_xy * math.log2(p_xy/p_y)
    return -entropy

In [26]:
if testing:
    print(conditional_entropy(k_gn,k_pairs))

1.25


Finally, mutual information is defined as

**LAST EQUATION**

or in English, entropy of X minus the conditional entropy of X given Y.

In this study, it is equal to the entropy of Ego's parents' generation minus the conditional entropy of Ego's parents' generation given Ego's generation. It tells us how much mutual dependence there is between these two generations; i.e. how much we can know about one by observing the other.

So long as we make sure to input the right entropy and conditional entropy values, we only need a simple function for this one:

In [27]:
def mutual_information(pairs: list):
    gn,gn1 = split_pairs(pairs)
    e = entropy(gn1)
    ce = conditional_entropy(gn,pairs)
    mi = e - ce
    return mi

In [28]:
if testing:
    print(mutual_information(k_pairs))

1.0


And there we have it! Step 4 complete. We can now take any (filtered) Kinbank file and output the mutual information between Ego's generation and Ego's parents' generation in that language.

But right at the beginning of this notebook, I mentioned using **simulations** to test the robustness of our claim that languages exhibit internal co-selection in their kinship systems. These simulations give us a baseline with which to compare the MI scores of real languages. Do languages across the world have greater mutual information between two generations than we would expect by chance?

## Simulations

If we want to argue that internal co-selection is a product of cultural evolution, we need to dispel the possibility that it occurs by chance.

To get an idea of how much information would be shared between two generations purely by chance, we need to create some randomly generated kinship systems. We can compare the MI of these simulations to the real languages to see whether the real languages have significantly greater mutual information between generations.

An important aspect of MI that we have not discussed so far: it is dependent on the amount of variation within the kinship system. A system with only one unique term in each generation would have MI of 0, which seems pretty terrible! But given this very limited variation (indeed, no variation), 0 is the highest MI such a language could have. As such, we perhaps need to modify our claim that kinship systems have "high MI" to be more specific: kinship systems in the wild have high MI *for the amount of variation in terminology they have*.

To compare real languages to simulations, we need a simulation which maintains the number of terms while randomising which child terms pair with which parent terms. To do this, we will take each language in our data, and randomly scramble which terms go with which types (within generation). This will randomise the syncretisms within the generations while maintaining the same amount of variation across the system overall.

To do this, we need to take the following steps:

1. Extract the kinship system of a language from kinbank (check!)
2. Filter the two generations we are interested in (check!)
3. Randomly reassign the kinship terms to new types.
4. Repeat the process a bunch of times for each language.

We already have the infrastructure for the first two! `get_kin_terms()`,  `get_pairs()` and `split_pairs()` will do this for us. So let's skip to 3, and write a function that randomises which terms form pairs, assuming that we have already extracted the kinship system and filtered the relevant pairs.

Remember that the order of `pairs` is preserved when we run `split_pairs()`. So when we pass `gn` and `gn1` to `shuffle_pairs()`, we know that we can re-unite our pairs by using the same index. Equally, when we shuffle `gn` and `gn1` in place, we know that we can safely combine them to make a new, randomised pair in place of the 'real' Nogai pair.

In [29]:
def shuffle_pairs(gn,gn1):
    random.shuffle(gn)
    random.shuffle(gn1)
    #print(Counter(gn))
    random_pairs = []
    for x,y in zip(gn,gn1):
        random_pairs.append((y,x))
    return random_pairs

In [30]:
# def shuffle_pairs(pairs):
#     gn,gn1 = split_pairs(pairs)
#     random.shuffle(gn)
#     print(Counter(gn))
#     random_pairs = []
    
#     for pair in set(pairs):
#         #print(term)
#         #for i in range(gn1.count(term)): # however many times this term forms a unique pair
#         new_pair = pick_new_pair(random_pairs,gn,pair[0])
#         for i in range(pairs.count(pair)):
#             random_pairs.append(new_pair)
        
            
#     return random_pairs

In [31]:
# def shuffle_pairs(pairs):
#     gn,gn1 = split_pairs(pairs)
#     print(Counter(gn))
#     new_pairs = []
#     for pair in set(pairs):
#         choices = list(set(gn))
#         random_term = random.choice(choices)
#         new_pair = (pair[0],random_term)
#         print(new_pair)
#         for x in pairs:
#             if x == pair:
#                 new_pairs.append(new_pair)
#     #print(new_pairs)
#     return new_pairs

In [32]:
def pick_new_pair(random_pairs,gn,term):
    choices = gn
    random_term = random.choice(choices)
    pair = (term,random_term)
    if pair in random_pairs:
        return pick_new_pair(random_pairs,choices,term)
    else:
        gn.remove(random_term)
        return pair

In [76]:
if testing:
    sim_pairs = shuffle_pairs(k_gn,k_gn1)
    sim_gn,sim_gn1 = split_pairs(sim_pairs)
    print(Counter(sim_gn))
    print(k_pairs,sim_pairs)


Counter({'bölе': 8, 'siŋli': 2, 'ini': 2, 'adanas': 2, 'karɨndas': 2})
[('ana', 'adanas'), ('ana', 'ini'), ('ana', 'karɨndas'), ('ana', 'siŋli'), ('ata', 'adanas'), ('ata', 'ini'), ('ata', 'karɨndas'), ('ata', 'siŋli'), ('nagaš akay', 'bölе'), ('nagaš akay', 'bölе'), ('akay', 'bölе'), ('akay', 'bölе'), ('abay', 'bölе'), ('abay', 'bölе'), ('abay', 'bölе'), ('abay', 'bölе')] [('ana', 'bölе'), ('ata', 'siŋli'), ('ata', 'ini'), ('akay', 'adanas'), ('abay', 'bölе'), ('akay', 'bölе'), ('nagaš akay', 'karɨndas'), ('abay', 'bölе'), ('ana', 'karɨndas'), ('nagaš akay', 'bölе'), ('ata', 'ini'), ('abay', 'bölе'), ('ana', 'adanas'), ('ata', 'siŋli'), ('ana', 'bölе'), ('abay', 'bölе')]


Now we can treat `sim_pairs` just as we treated `pairs`! Let's calculate the entropy, conditional entropy, and mutual information of this simulated system.

In [34]:
if testing:
    sim_gn,sim_gn1 = split_pairs(sim_pairs)
    e = entropy(sim_gn1)
    ce = conditional_entropy(sim_gn,sim_pairs)
    mi = mutual_information(sim_pairs)
    print(e,ce,mi)

2.25 1.4528195311147831 0.7971804688852169


In [75]:
if testing:
    sim_pairs = shuffle_pairs(k_gn,k_gn1)
    sim_gn,sim_gn1 = split_pairs(sim_pairs)
    e = entropy(sim_gn1)
    ce = conditional_entropy(sim_gn,sim_pairs)
    mi = mutual_information(sim_pairs)
    print(e,ce,mi)
    print(len(k_pairs),len(sim_pairs))

2.25 1.375 0.875
16 16


Wait! Some of these values are exactly the same as the real Nogai kinship system! I thought this was a randomised simulation - what gives? 

Variation gives! Entropy remains the same regardless, because the amount of variation in the simulation **does not change** by design.

The MI of these two systems (and by extension, the conditional entropy) **does** vary, which is what we want. Let's try with another language:

In [36]:
if testing:
    file2 = random.choice(all_kb_files)
    print(file2)
    k2 = filter_ks(file2)

    k2_pairs = get_pairs(k2)
    print(k2)
    k2_gn,k2_gn1 = split_pairs(k2_pairs)
    k2_e = entropy(k2_gn1)
    k2_ce = conditional_entropy(k2_gn,k2_pairs)
    k2_mi = mutual_information(k2_pairs)
    
    k2_sim = shuffle_pairs(k2_gn,k2_gn1)
    k2_sim_gn,k2_sim_gn1 = split_pairs(k2_sim)
    k2_sim_e = entropy(k2_sim_gn1)
    k2_sim_ce = conditional_entropy(k2_sim_gn,k2_sim)
    k2_sim_mi = mutual_information(k2_sim)
    
    print('Sungwaloge:', k2_e,k2_ce,k2_mi)
    print('Simulation:',k2_sim_e,k2_sim_ce,k2_sim_mi)
    
    print(len(k2_sim),len(k2_pairs))
    
    for i in k2_pairs:
        if i not in k2_sim:
            print(i)
            
    for i in k2_sim:
        if i not in k2_pairs:
            print(i)


Morgan1871_Iowä_iowa1245.csv
{'mM': 'heenʼ-nä', 'mF': 'heenʼ-kä', 'mMeB': 'heen-jaʼ-kä', 'mFeB': 'heenʼ-kä', 'mMeZ': 'heenʼ-nä', 'mFeZ': 'heen-tooʼ-me', 'meB': 'he-yenʼ-nä', 'myB': 'heen-thun̤ʼ-ga', 'meZ': 'he-yuʼ-nä', 'myZ': 'heen-tan̤ʼ-ya', 'mMBeS': 'heen-jaʼ-kä', 'mFBeS': 'he-yenʼ-nä', 'mFByS': 'heen-thun̤ʼ-ga', 'mMZeS': 'he-yenʼ-nä', 'mMZyS': 'heen-thun̤ʼ-ga', 'mFZeS': 'heen-toansʼ-ka', 'mFZyS': 'heen-toasʼ-ka', 'mMBeD': 'heenʼ-nah', 'mMByD': 'heenʼ-nä', 'mFBeD': 'he-yuʼ-nä', 'mFByD': 'heen-tanʼ-ya', 'mMZeD': 'hee-uʼ-nä', 'mMZyD': 'heen-tan̤ʼ-ya', 'mFZeD': 'heen-toasʼ-ka-me', 'fyB': 'e-chunʼ-cha', 'feZ': 'heen-tan̤ʼ-ga', 'fyZ': 'heen-tun̤ʼ-ga', 'fFByS': 'e-chunʼ-cha', 'fMZyS': 'e-chunʼ-cha', 'fFZeS': 'hee-yin̤ʼ-ga', 'fFBeD': 'heen-tan̤ʼ-ga', 'fFByD': 'heen-tun̤ʼ-ga', 'fMZeD': 'heen-tan̤ʼ-ga', 'fMZyD': 'heen-tun̤ʼ-ya', 'fFZeD': 'heen-yun̤ʼ-ga'}
Sungwaloge: 1.0 0.9999999999999997 3.3306690738754696e-16
Simulation: 1.0 0.4285714285714285 0.5714285714285715
14 14
('heenʼ-nä', 'he-yenʼ-

Now we see that while the entropy of our new language, Sungwaloge, and its simulation are equal, the conditional entropy for the simulation is greater and therefore the mutual information of the simulation is lower. What about if we did this 1000 times? How often would the mutual information of the simulation be lower then?

## Tidying up

We have all the pieces we need now to calculate MI and simulate kinship systems - all we need to do is write a few more functions that stick all of those pieces together in a neat parcel.

First, a function that takes pairs and spits out entropy, conditional entropy, and MI:

Second, a function that builds a simulated list of pairs when we pass in a kinship system:

In [37]:
def simulate_ks(ks: dict) -> list:
    pairs = get_pairs(ks)
    if pairs:
        gn,gn1 = split_pairs(pairs)
        simulation = shuffle_pairs(gn,gn1)
        return simulation

In [38]:
def write_data(pairs,results):
    gn,gn1 = split_pairs(pairs)
    egn = entropy(gn)
    egn1 = entropy(gn1)
    ce = conditional_entropy(gn,pairs)
    mi = mutual_information(pairs)
    
    results['mutual_information'] = mi
    results['entropy_gn'] = egn
    results['entropy_gn1'] = egn1
    results['conditional_entropy'] = ce
    results['variation_gn'] = len(set(gn))
    results['variation_gn1'] = len(set(gn1))
    results['number_of_pairs'] = len(set(pairs))
    
    return results

And a couple of functions that put everything together, saves the results to a separate file, and output a `pandas` dataframe so that we can take a good look. `ics_simulation` takes the full list of Kinbank filenames, extracts the relevant kin terms, performs the randomisation simulation on it a specified number of times, calculates entropy, conditional entropy, and MI for each simulation, and saves all that data to a separate file. It also performs some regex magic on the filename so that we get each language's unique code as well as each language's name in full.

In [111]:
def ics_simulation(filepath,all_files:list, times):
    df = []
    codes = []
    
    for file in all_files:
        match = re.search('[a-z]{4}[0-9]{4}[a-z]?', file)
        code = match.group()
        language = file.split('_' + code)[0]
        
        if code not in codes:
            codes.append(code)
        
            ks = get_kin_terms(filepath + file)

            for i in range(times):
                pairs = simulate_ks(ks)
                if pairs:
                    results = {}
                    results['language'] = language
                    results['code'] = code
                    results['simulation_code'] = code + '_' + str(i)
                    results['simulation'] = 'Y'
                    write_data(pairs,results)

                    df.append(results)
    
    #pd.DataFrame(df).to_csv('../data/raw/ics_sims.csv',index=False)
    pd.DataFrame(df).to_csv('../data/raw/ics_sims_aus.csv',index=False)    
    return pd.DataFrame(df)

`ics_real` performs similarly to `ics_simulation`, but instead of performing the randomisation, it calculates entropy, conditional entropy, and MI for the language as-is. It does this for every file in the Kinbank data and saves the data to a separate file.

In [104]:
def ics_real(filepath,all_files):
    df = []
    codes = []
    
    for file in all_files:
        print(file)
        match = re.search("[a-z]{4}[0-9]{4}[a-z]?", file)
        code = match.group()
        language = file.split('_' + code)[0]
        
        if code not in codes:
            codes.append(code)
        
            ks = get_kin_terms(filepath + file)

            pairs = get_pairs(ks)
            print(pairs)

            if pairs: # if pairs is not empty
                mi = mutual_information(pairs)

                results = {}
                results['language'] = language
                results['code'] = code
                results['simulation_code'] = code + '_REAL'
                results['simulation'] = 'N'
                write_data(pairs,results)

                df.append(results)
        
    #pd.DataFrame(df).to_csv('../data/raw/ics_real.csv',index=False)
    pd.DataFrame(df).to_csv('../data/raw/ics_real_aus.csv',index=False)
    print(len(df))
    
    return pd.DataFrame(df)

## Let's go!

If we want to create a dataset from the real language data, we just have to run:

In [78]:
filter_filepath = '../languages/kinbank-filter/' # the filepath where the kinbank files are kept
aus_filepath = '../languages/kinbank-family/Austronesian/'
filtered_kb_files = get_kb_files(filter_filepath)
filtered_aus_files = get_kb_files(aus_filepath)

#ics_real(filter_filepath,filtered_kb_files)
ics_real(aus_filepath,filtered_aus_files)

Maguindanao_(Magindonao_Moro)_magu1243.csv
[('ina', 'kaka'), ('ina', 'ali'), ('ama, bapa', 'kaka'), ('ama, bapa', 'ali'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('babu', 'tüngud, tǔngǔd, tungut'), ('ina', 'kakal'), ('ina', 'kaka'), ('ina', 'ali'), ('ama, bapa', 'kakal'), ('ama, bapa', 'kaka'), ('ama, bapa', 'ali'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut'), ('bapa', 'tüngud, tǔngǔd, tungut')

[('papa', 'tuakana'), ('papa', 'teina'), ('papa', 'tuahine'), ('papa', 'tuahine'), ('tira', 'tuakana'), ('tira', 'teina'), ('tira', 'tuahine'), ('tira', 'tuahine'), ('taueka', 'tuakana'), ('taueka', 'teina'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuakana'), ('taueka', 'teina'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('taueka', 'tuahine'), ('matua vahine', 'tuakana'), ('matua vahine', 'teina'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuakana'), ('matua vahine', 'teina'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('matua vahine', 'tuahine'), ('papa', 'tuangaane'), ('papa', 'tuangaane'), ('papa', 'tuangaane'), ('papa', 'tuakana'), ('papa', 'teina'), ('tira

Unnamed: 0,language,simulation_code,simulation,mutual_information,entropy_gn,entropy_gn1,conditional_entropy,variation_gn,variation_gn1,number_of_pairs
0,Maguindanao_(Magindonao_Moro),magu1243_REAL,N,0.791858,1.154222,1.791858,1.000000,4,4,8
1,Malua_Bay,malu1245_REAL,N,1.530125,2.840224,1.996792,0.466667,8,4,13
2,Mangarevans,mang1401_REAL,N,0.192953,2.182068,1.969414,1.776461,5,6,18
3,Mangareva,mang1401_REAL,N,0.010934,1.995781,1.886541,1.875607,4,4,16
4,Mangei_(Sobjo),mang1407_REAL,N,0.918296,1.918296,1.584963,0.666667,4,3,6
...,...,...,...,...,...,...,...,...,...,...
131,Xalangi_(Maevo_Vanuatu),mari1426h_REAL,N,0.973258,1.930019,1.799345,0.826087,4,4,7
132,Yabem,yabe1254_REAL,N,1.425252,2.714447,2.582193,1.156941,7,6,16
133,Yakan,yaka1277_REAL,N,0.886541,1.306171,2.234367,1.347826,4,5,9
134,Yamdena,yamd1240_REAL,N,0.220460,0.706334,1.831702,1.611242,5,5,13


And if we want to run a simulation on a single file:

In [42]:
ics_simulation(filepath,[file2],100)

Unnamed: 0,language,simulation_code,simulation,mutual_information,entropy_gn,entropy_gn1,conditional_entropy,variation_gn,variation_gn1,number_of_pairs
0,Morgan1871_Iowä_iowa1245.csv,rgan1871_0,Y,0.42218,2.75,1.0,0.57782,7,2,11
1,Morgan1871_Iowä_iowa1245.csv,rgan1871_1,Y,0.42218,2.75,1.0,0.57782,7,2,11
2,Morgan1871_Iowä_iowa1245.csv,rgan1871_2,Y,0.42218,2.75,1.0,0.57782,7,2,11
3,Morgan1871_Iowä_iowa1245.csv,rgan1871_3,Y,0.50000,2.75,1.0,0.50000,7,2,10
4,Morgan1871_Iowä_iowa1245.csv,rgan1871_4,Y,0.42218,2.75,1.0,0.57782,7,2,11
...,...,...,...,...,...,...,...,...,...,...
95,Morgan1871_Iowä_iowa1245.csv,rgan1871_95,Y,0.25000,2.75,1.0,0.75000,7,2,12
96,Morgan1871_Iowä_iowa1245.csv,rgan1871_96,Y,0.17218,2.75,1.0,0.82782,7,2,13
97,Morgan1871_Iowä_iowa1245.csv,rgan1871_97,Y,0.42218,2.75,1.0,0.57782,7,2,11
98,Morgan1871_Iowä_iowa1245.csv,rgan1871_98,Y,0.50000,2.75,1.0,0.50000,7,2,10


And finally, the full set of simulations from all the kinbank data, run 1000 times on each language.

In [73]:
ics_simulation(filter_filepath,filtered_kb_files,1000)

Unnamed: 0,language,simulation_code,simulation,mutual_information,entropy_gn,entropy_gn1,conditional_entropy,variation_gn,variation_gn1,number_of_pairs
0,'Are'are,area1240_0,Y,0.291692,1.950212,0.863121,0.571429,4,2,6
1,'Are'are,area1240_1,Y,0.291692,1.950212,0.863121,0.571429,4,2,6
2,'Are'are,area1240_2,Y,0.291692,1.950212,0.863121,0.571429,4,2,6
3,'Are'are,area1240_3,Y,0.291692,1.950212,0.863121,0.571429,4,2,6
4,'Are'are,area1240_4,Y,0.291692,1.950212,0.863121,0.571429,4,2,6
...,...,...,...,...,...,...,...,...,...,...
650995,Zulu,zulu1248_995,Y,1.372389,2.199688,2.565448,1.193060,5,6,11
650996,Zulu,zulu1248_996,Y,1.218543,2.199688,2.565448,1.346906,5,6,12
650997,Zulu,zulu1248_997,Y,1.526235,2.199688,2.565448,1.039213,5,6,10
650998,Zulu,zulu1248_998,Y,1.372389,2.199688,2.565448,1.193060,5,6,11


In [82]:
ics_simulation(aus_filepath,filtered_aus_files,1000)

Unnamed: 0,language,code,simulation_code,simulation,mutual_information,entropy_gn,entropy_gn1,conditional_entropy,variation_gn,variation_gn1,number_of_pairs
0,Maguindanao_(Magindonao_Moro),magu1243,magu1243_0,Y,0.155600,1.154222,1.791858,1.636259,4,4,11
1,Maguindanao_(Magindonao_Moro),magu1243,magu1243_1,Y,0.195470,1.154222,1.791858,1.596388,4,4,10
2,Maguindanao_(Magindonao_Moro),magu1243,magu1243_2,Y,0.099343,1.154222,1.791858,1.692515,4,4,12
3,Maguindanao_(Magindonao_Moro),magu1243,magu1243_3,Y,0.155600,1.154222,1.791858,1.636259,4,4,11
4,Maguindanao_(Magindonao_Moro),magu1243,magu1243_4,Y,0.254664,1.154222,1.791858,1.537195,4,4,9
...,...,...,...,...,...,...,...,...,...,...,...
135995,Yami_Tao_,yami1254,yami1254_995,Y,0.040012,1.685475,1.685475,1.645463,4,4,16
135996,Yami_Tao_,yami1254,yami1254_996,Y,0.087535,1.685475,1.685475,1.597940,4,4,14
135997,Yami_Tao_,yami1254,yami1254_997,Y,0.089429,1.685475,1.685475,1.596046,4,4,15
135998,Yami_Tao_,yami1254,yami1254_998,Y,0.071119,1.685475,1.685475,1.614356,4,4,14


Let's test our analysis with only one language family, to keep things simple.

In [99]:
aus_filepath = '../languages/kinbank-family/Austronesian/'
aus_files = get_kb_files(aus_filepath)

for file in aus_files:
    filter_ks('Austronesian',file)


In [103]:
filtered_aus_files = get_kb_files(aus_filepath)

226 226


In [107]:
filtered_aus_filepath = '../languages/kinbank-filter/Austronesian/'

#ics_real(filter_filepath,filtered_kb_files)
ics_real(filtered_aus_filepath,filtered_aus_files)

Maguindanao_(Magindonao_Moro)_magu1243.csv
[('ama', 'kaka'), ('ama', 'ali'), ('bapa', 'tüngud'), ('bapa', 'tüngud'), ('bapa', 'tüngud'), ('bapa', 'tüngud'), ('babu', 'tüngud'), ('babu', 'tüngud'), ('babu', 'tüngud'), ('babu', 'tüngud')]
Malua_Bay_malu1245.csv
[('tate', 'söghösök dövat'), ('bini', 'natök'), ('bini', 'natök'), ('vavwe', 'tate'), ('vavwe', 'vavwe'), ('tate', 'söghösök'), ('tate', 'lelei'), ('tate', 'tisuk waven')]
Manam_mana1295.csv
None
Mangaia_mang1402.csv
None
Mangarevans_mang1401.csv
[('motua', 'tuakana'), ('motua', 'teina'), ('motua', "tue'ine"), ('motua-nui', "tue'ine"), ('motua-iti', "tue'ine"), ('motua-nui', "tue'ine"), ('motua-iti', "tue'ine"), ('kui-nui', "tue'ine"), ('kui-iti', "tue'ine"), ('kui-nui', "tue'ine"), ('kui-iti', "tue'ine"), ('motua', 'tuagane'), ('motua', 'tuakana'), ('motua', 'teina')]
Mangareva_mang1401.csv
Mangei_(Sobjo)_mang1407.csv
[('soesoe', 'kaka'), ('soesoe', 'kaka'), ('kolo', 'kaka'), ('kolo', 'kaka')]
Manggarai_mang1405.csv
None
Maori_ma

Unnamed: 0,language,code,simulation_code,simulation,mutual_information,entropy_gn,entropy_gn1,conditional_entropy,variation_gn,variation_gn1,number_of_pairs
0,Maguindanao_(Magindonao_Moro),magu1243,magu1243_REAL,N,0.721928,0.921928,1.521928,0.800000,3,3,4
1,Malua_Bay,malu1245,malu1245_REAL,N,1.500000,2.750000,1.500000,-0.000000,7,3,7
2,Mangarevans,mang1401,mang1401_REAL,N,0.661705,1.483832,2.128085,1.466380,4,5,8
3,Mangei_(Sobjo),mang1407,mang1407_REAL,N,0.000000,-0.000000,1.000000,1.000000,1,2,2
4,Maori,maor1246,maor1246_REAL,N,0.001340,1.940286,0.985228,0.983888,4,2,8
...,...,...,...,...,...,...,...,...,...,...,...
115,Xalangi_(Maevo_Vanuatu),mari1426h,mari1426h_REAL,N,1.313101,1.936260,1.858555,0.545455,4,4,6
116,Yabem,yabe1254,yabe1254_REAL,N,1.414694,2.519080,2.298689,0.883995,6,5,11
117,Yakan,yaka1277,yaka1277_REAL,N,0.811278,1.186278,0.811278,-0.000000,4,2,4
118,Yamdena,yamd1240,yamd1240_REAL,N,0.000000,-0.000000,0.721928,0.721928,1,2,2


In [112]:
ics_simulation(filtered_aus_filepath,filtered_aus_files,1000)

Unnamed: 0,language,code,simulation_code,simulation,mutual_information,entropy_gn,entropy_gn1,conditional_entropy,variation_gn,variation_gn1,number_of_pairs
0,Maguindanao_(Magindonao_Moro),magu1243,magu1243_0,Y,0.397417,0.921928,1.521928,1.124511,3,3,5
1,Maguindanao_(Magindonao_Moro),magu1243,magu1243_1,Y,0.272906,0.921928,1.521928,1.249022,3,3,5
2,Maguindanao_(Magindonao_Moro),magu1243,magu1243_2,Y,0.272906,0.921928,1.521928,1.249022,3,3,5
3,Maguindanao_(Magindonao_Moro),magu1243,magu1243_3,Y,0.272906,0.921928,1.521928,1.249022,3,3,5
4,Maguindanao_(Magindonao_Moro),magu1243,magu1243_4,Y,0.397417,0.921928,1.521928,1.124511,3,3,5
...,...,...,...,...,...,...,...,...,...,...,...
119995,Yami_Tao_,yami1254,yami1254_995,Y,0.721928,1.921928,1.521928,0.800000,4,3,7
119996,Yami_Tao_,yami1254,yami1254_996,Y,0.597417,1.921928,1.521928,0.924511,4,3,8
119997,Yami_Tao_,yami1254,yami1254_997,Y,0.721928,1.921928,1.521928,0.800000,4,3,7
119998,Yami_Tao_,yami1254,yami1254_998,Y,0.521928,1.921928,1.521928,1.000000,4,3,8
