In [1]:
import csv # for writing dataframes to csv
import random # for making a random choice
import os # for scanning directories
import itertools
import string # for generating strings
from collections import Counter

import kintypes as kt # bringing large lists of kin types into the namespace
import math # for calculating logs
import pandas as pd

# Internal co-selection

Internal co-selection refers to the tendency for kinship systems to have cross-generational consistency in the terminological distinctions or mergers that are made. That is, if your parents' elder brothers share a kin term, then so too will their children. If your parents' sisters are distinguished from your parents' brothers, so too will their children be distinguished. We can test the robustness of this tendency using our frankenlanguages, to see whether internal co-selection occurs at a higher rate than chance.

We will measure internal co-selection in terms of the **mutual information** between Generation N and Generation N+1 in a particular kinship system. That tells us how much information can be gained from one generation by observing the other - we can think of this as the benefit of internal co-selection. That is, we need to work out the conditional entropy between every possible pair of parent and child terms, and the entropy over an entire generation. This will tell us how much information is shared across the two generations; or how much we can predict about one generation given the other.

To do this, we need to do the following:

* Get a list of parent-child pairs for each language.
* Work out the probability of each pair (the joint probability of term A and term B)
* Work out the probabilities of each individual term in a generation.
* Calculate the conditional entropy of the system using 2 and 3, and the entropy of one generation using C.
* Calculate the mutual information of the system.

Luckily, we can re-use some of the infrastructure we already have. For ease, I will write out again the functions that extract kin terms from a kinbank file.

In [2]:
# to get a list of all the kinbank filenames

def get_kb_files():
    files = []
    path = '../languages/kinbank'
    directory = os.scandir(path)
    for file in directory:
        files.append(file.name)
    return files

In [3]:
# to pick a file at random

def random_language(all_data):
    language = random.choice(all_data)
    # print(language)
    return language

In [4]:
# to extract kin terms from one of those files

def get_kin_terms(filepath):
    kin_system = {}
    with open(filepath, encoding='utf8') as f:
        csv_reader = csv.DictReader(f)
        next(csv_reader) # to skip the header row
        for line in csv_reader:
            kin_type = line['parameter']
            kin_term = line['word']
            kin_system[kin_type] = kin_term
    return kin_system

In [5]:
all_kb_files = get_kb_files()

In [6]:
random.seed(52)
file = random_language(all_kb_files)
filepath = '../languages/kinbank/'

l = get_kin_terms(filepath + file)

print(file,l)

Mongo_mong1338.csv {'meB': 'nsómí', 'myB': 'bokume', 'mF': 'tata', 'mPP': 'nkoko', 'mSS': 'nkoko', 'mSD': 'nkoko', 'mDS': 'bonkana', 'mDD': 'bonkana', 'mFB': 'tantinkune', 'mFZ': 'faomoto', 'mMZ': 'nyango', 'mMeZ': 'nyango', 'meBS': 'bona', 'myBS': 'bona', 'meBD': 'bona', 'myBD': 'bona', 'meZS': 'bona', 'myZS': 'bona', 'meZD': 'bona', 'myZD': 'bona', 'mFBD': 'nkanea jende', 'mMBD': 'nkana', 'mMBS': 'nkana', 'mFBeS': 'botomolo', 'mFByS': 'bokume', 'mFBeD': 'nkåna', 'mFByD': 'nkåna', 'mMBeS': 'bona', 'mMByS': 'bona', 'mMBeD': 'bona', 'mMByD': 'bona', 'mFZH': 'bokilo', 'mFBW': 'bokilo', 'mMZH': 'bokilo', 'mMBW': 'bokilo', 'myZ': 'nkaneomoto', 'mFeB': 'tantinkune', 'mFeZ': 'faomoto', 'mFyZ': 'faomoto', 'mFeBD': 'nkanea jende', 'mFyBD': 'nkanea jende', 'mMeBS': 'nkana', 'mMyBS': 'nkana', 'mMeBD': 'nkana', 'mMyBD': 'nkana', 'fZ': 'nkaneomoto', 'feB': 'nsómí', 'fyB': 'bokume', 'fF': 'tata', 'fPP': 'nkoko', 'fSS': 'nkoko', 'fSD': 'nkoko', 'fDS': 'bonkana', 'fDD': 'bonkana', 'fFB': 'tantinkune'

## Getting the relevant terms

The first thing we can do is filter our full kinship system so we just have the kin types that we're interested in - that is, the kin from generation N and generation N+1. We will do this by comparing the kin types in the kinbank file against a list of pairs of kin types - parent-child pairs like 'mother's older brother' and 'mother's older brother's son'.

In [7]:
def get_pairs(ks,d = False):
    pairs_of_terms = {}
    placeholder = []
    
    for pair in kt.ics_pairs:
        if pair[0] in ks and pair[1] in ks:
            pairs_of_terms[pair] = (ks[pair[0]],ks[pair[1]])
            placeholder.append(pair)
            
    if d: # if we want terms mapped to types, return a dictionary
        return pairs_of_terms
    else: # if not, just return a list of pairs
        return list(pairs_of_terms.values())

In [8]:
def split_pairs_unique(pairs: dict):
    gn = {}
    gn1 = {}
    for pair in pairs:
        gn1[pair[0]] = pairs[pair][0]
        gn[pair[1]] = pairs[pair][1]
    
    return gn,gn1
        

In [9]:
def split_pairs(pairs:list):
    gn = []
    gn1 = []
    for pair in pairs:
        gn.append(pair[1])
        gn1.append(pair[0])
        
    return gn,gn1

In [67]:
l_pairs = get_pairs(l)

gn,gn1 = split_pairs(l_pairs)

Now we have a new kinship system with only the relevant terms - much easier to work with. We still need to filter out the **pairs of terms** that interest us, though. In order to calculate mutual information, we need two lists of terms that are equal in length. So even though 

In [12]:
def make_list(dictionary):
    return list(dictionary.values())

## Calculating entropy

Next, we need some functions to calculate probabilities for us.

In [13]:
def probability(term: str, generation: list) -> float:
    return generation.count(term)/len(generation)

And a function to calculate entropy over a list of data.

In [14]:
def entropy(generation):
    entropy = 0
    for term in set(generation):
        p = probability(term,generation)
        entropy += p*math.log(p)
    return -entropy

In [16]:
entropy(gn1)

1.6364955728889847

## Calculating mutual information

Lucky for us, the `sklearn` package has a built in function for calculating mutual information, `mutual_info_score`. We can give it our two generations and it will give us back the mutual information between them measured in nats.

In [17]:
from sklearn.metrics import mutual_info_score

mutual_info_score(gn,gn1)

0.47708075355136104

EDIT: last time I ran this, the entropy and MI were identical. Why different now?

## Simulating kinship systems

We don't just want to run this on real kinship systems. We also want to calculate the mutual information of kinship systems that we've simulated, so we can test whether real languages have higher mutual information than we would expect to occur by chance for a particular amount of variation. In other words, do kinship systems exhibit higher than expected mutual information between GN and GN+1 given the number of terms available in the system?

We already have a way to extract the terms we're interested in - now we need to randomise them and recombine them. This will give us a kinship system with the same amount of variation, but with the relationships between terms randomised.

*split pairs into two lists (order of kintypes is preserved)
*shuffle lists in place
*recombine into pairs by index
*run new list of pairs through calculate_mi

In [77]:
def randomise_pairs(gn,gn1):
    random.shuffle(gn)
    random.shuffle(gn1)
    random_pairs = []
    for x,y in zip(gn,gn1):
        random_pairs.append((y,x))
    return random_pairs

In [78]:
new_gn,new_gn1 = split_pairs(randomise_pairs(gn,gn1))

print(Counter(new_gn1))

print(Counter(gn1))

Counter({'faomoto': 20, 'nyangompame': 20, 'tantinkune': 16, 'nyango': 16, "tat'inkune": 4, "mam'inkune": 4})
Counter({'faomoto': 20, 'nyangompame': 20, 'tantinkune': 16, 'nyango': 16, "tat'inkune": 4, "mam'inkune": 4})


In [44]:
def randomise_generation(g):
    sim_g = {}
    terms = list(g.values())
    #print(Counter(terms))
    types = list(g.keys())
    random.shuffle(terms)
   # print(Counter(terms))
    
    for i in range(len(g)):
        random_term = terms[i]
        kintype = types[i]
        sim_g[kintype] = random_term
        
    return sim_g

In [22]:
l_dict = get_pairs(l,True)
gn,gn1 = split_pairs_unique(l_dict)

randomise_generation(gn1)

print(gn1)

Counter({'nyangompame': 6, 'faomoto': 6, 'tantinkune': 4, 'nyango': 4, "tat'inkune": 2, "mam'inkune": 2})
Counter({'nyangompame': 6, 'faomoto': 6, 'nyango': 4, 'tantinkune': 4, "tat'inkune": 2, "mam'inkune": 2})
{'mMB': 'nyangompame', 'mMeB': 'nyangompame', 'mMyB': 'nyangompame', 'mFB': 'tantinkune', 'mFeB': 'tantinkune', 'mFyB': "tat'inkune", 'mMZ': 'nyango', 'mMeZ': 'nyango', 'mMyZ': "mam'inkune", 'mFZ': 'faomoto', 'mFeZ': 'faomoto', 'mFyZ': 'faomoto', 'fMB': 'nyangompame', 'fMeB': 'nyangompame', 'fMyB': 'nyangompame', 'fFB': 'tantinkune', 'fFeB': 'tantinkune', 'fFyB': "tat'inkune", 'fMZ': 'nyango', 'fMeZ': 'nyango', 'fMyZ': "mam'inkune", 'fFZ': 'faomoto', 'fFeZ': 'faomoto', 'fFyZ': 'faomoto'}


In [23]:
def shuffle_system(g1,g2):
    g1 = randomise_generation(g1)
    g2 = randomise_generation(g2)
    return {**g1,**g2}

In [50]:
shuffle_system(gn,gn1)

{'mMBS': 'nkana',
 'mMBeS': 'nkana',
 'mMByS': 'bona',
 'mMBD': 'nkana',
 'mMBeD': 'nkana',
 'mMByD': 'nkåna',
 'mMeBS': 'nkana',
 'mMeBD': 'nkana',
 'mMyBS': 'nkana',
 'mMyBD': 'nkana',
 'mFBS': 'nkana',
 'mFBeS': 'botomolo',
 'mFByS': 'nkana',
 'mFBD': 'nkana',
 'mFBeD': 'nkana',
 'mFByD': 'nkana',
 'mFeBS': 'nkana',
 'mFeBD': 'bona',
 'mFyBS': 'nkana',
 'mFyBD': 'nkana',
 'mMZS': 'bona',
 'mMZeS': 'nkana',
 'mMZyS': 'nkanea jende',
 'mMZD': 'nkana',
 'mMZeD': 'nkana',
 'mMZyD': 'nkana',
 'mMeZS': 'nkåna',
 'mMeZD': 'nkana',
 'mMyZS': 'nkana',
 'mMyZD': 'nkana',
 'mFZS': 'nkana',
 'mFZeS': 'nkanea jende',
 'mFZyS': 'nkana',
 'mFZD': 'bona',
 'mFZeD': 'nkana',
 'mFZyD': 'nkana',
 'mFeZS': 'nkana',
 'mFeZD': 'nkana',
 'mFyZS': 'nkana',
 'mFyZD': 'nkana',
 'fMBS': 'nkana',
 'fMBeS': 'bona',
 'fMByS': 'nkana',
 'fMBD': 'nkana',
 'fMBeD': 'bona',
 'fMByD': 'nkanea jende',
 'fMeBS': 'nkana',
 'fMeBD': 'nkåna',
 'fMyBS': 'nkana',
 'fMyBD': 'nkana',
 'fFBS': 'nkana',
 'fFBeS': 'nkana',
 'fFB

## Calculating mutual information en masse

Now we have all the pieces, we can calculate the mutual information and entropy of both real kinship systems and simulated ones. Let's wrap everything up in a function.

In [24]:
def calculate_ics(file,simulation = False,times = False):
    ks = get_kin_terms(filepath + file)
    pairs = get_pairs(ks,True)
    g1,g2 = split_pairs_unique(pairs)
    language = file[:-13]

    df = []
    
    if simulation:
        for i in range(times):
            shuffled_system = shuffle_ks(g1,g2)
            pairs = get_pairs(shuffled_system)
            g1,g2 = split(pairs)
            e = entropy(g2)
            mi = mutual_info_score(g1,g2)
            results = {}
            results['simulation_number'] = sim
            results['mutual_information'] = mi
            results['entropy'] = e
            for i in g1:
                results[i] = g1[i]
            for i in g2:
                results[i] = g2[i]
            df.append(results)
        pd.DataFrame(df).to_csv('../data/raw/ics_' + language + '.csv',index=False)

    else:
        e = entropy(make_list(g2))
        mi = mutual_info_score(make_list(g1),make_list(g2))
        results = {}
        results['language'] = language
        results['mutual_information'] = mi
        results['entropy'] = e
        for i in g1:
            results[i] = g1[i]
        for i in g2:
            results[i] = g2[i]
        df.append(results)
        pd.DataFrame(df).to_csv('../data/raw/ics_real_languages.csv',index=False)

  
        
    return pd.DataFrame(df)
    

In [53]:
def calculate_mi(ks):
    pairs = get_pairs(ks)
    #print(pairs)
    gn,gn1 = split_pairs(pairs)
    #print(gn,gn1)
    print(Counter(gn1))
    e = entropy(gn1)
    mi = mutual_info_score(gn,gn1)
    return e,mi

In [79]:
calculate_mi(l)

Counter({'nyangompame': 20, 'faomoto': 20, 'tantinkune': 16, 'nyango': 16, "tat'inkune": 4, "mam'inkune": 4})


(1.6364955728889847, 0.47708075355136104)

In [45]:
def simulation(ks):
    pairs = get_pairs(ks,True)
    gn,gn1 = split_pairs_unique(pairs)
    sim_ks = shuffle_system(gn,gn1)
    sim_pairs = get_pairs(sim_ks)
    simgn,simgn1 = split_pairs(sim_pairs)
    print(Counter(simgn1))
    e = entropy(simgn1)
    mi = mutual_info_score(simgn,simgn1)
    return e,mi

In [64]:
def simulation(ks):
    pairs = get_pairs(ks)
    gn,gn1 = split_pairs(pairs)
    print(Counter(gn1))
    random_pairs = randomise_pairs(gn,gn1)
    simgn,simgn1 = split_pairs(random_pairs)
    print(Counter(simgn1))
    e = entropy(simgn1)
    mi = mutual_info_score(simgn,simgn1)
    return e,mi

In [80]:
simulation(l)

Counter({'nyangompame': 20, 'faomoto': 20, 'tantinkune': 16, 'nyango': 16, "tat'inkune": 4, "mam'inkune": 4})
Counter({'faomoto': 20, 'nyangompame': 20, 'tantinkune': 16, 'nyango': 16, "mam'inkune": 4, "tat'inkune": 4})


(1.6364955728889847, 0.13193950865523082)

In [81]:
file = 'Koya_koya1251.csv'
l2 = get_kin_terms(filepath + file)

In [82]:
calculate_mi(l2)

Counter({'māmā': 20, 'pépe': 20, 'poyé': 20, 'pedi': 4, 'kuci': 4})


(1.4131210683790687, 0.6774944044487079)

In [83]:
simulation(l2)

Counter({'māmā': 20, 'pépe': 20, 'poyé': 20, 'pedi': 4, 'kuci': 4})
Counter({'poyé': 20, 'pépe': 20, 'māmā': 20, 'pedi': 4, 'kuci': 4})


(1.4131210683790687, 0.13721690101481093)