# Lineages
Natalia Vélez, July 2020

In this notebook:

* Charting lineages
* Comparing lineages using various measures of well-being (# descendants, lifespan, causes of death, etc.)
* Preparing inputs for subsequent analyses (combined with food, map change logs)

This notebook uses data from November 2019--June 2020 (which corresponds to the Boundless World era)

In [None]:
%matplotlib inline

import os, re, glob
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import networkx
from os.path import join as opj
from tqdm import notebook
from ast import literal_eval as make_tuple

# Notebook-wide plot styling: seaborn's 'talk' context preset and 'white' axes style.
# Both calls change global plotting defaults, so they apply to every figure drawn below.
sns.set_context('talk')
sns.set_style('white')

def gsearch(*args):
    """Join ``args`` into a single path pattern and return the paths that glob it.

    Parameters
    ----------
    *args : str
        Path components (may contain glob wildcards), joined with os.path.join.

    Returns
    -------
    list of str
        Matching paths, in whatever order glob.glob yields them (not sorted).
    """
    pattern = opj(*args)
    return glob.glob(pattern)

Extract births & parents:

In [None]:
# Read all data (tab-separated; the first column holds the saved index)
data = pd.read_csv('outputs/lifelogs_bigserver2_boundless_data.tsv', sep='\t', index_col=0)

# Dataframe of births: keep only 'B' events and the columns used downstream.
# Selecting with .loc yields a new frame, so `data` is left untouched and the
# column assignments below operate on births_df itself.
births_df = data.loc[data['event'] == 'B',
                     ['server',
                      'release',
                      'era',
                      'timestamp',
                      'playerID',
                      'hash',
                      'sex',
                      'location',
                      'parent',
                      'pop',
                      'chain']].reset_index(drop=True)

# Parse player IDs.
# np.int was only an alias of the builtin int; it was deprecated in NumPy 1.20
# and removed in 1.24, so use int directly.
births_df['playerID'] = births_df['playerID'].astype(int)

# Parse parent IDs: the raw field either reads 'noParent' or contains 'parent=<digits>'.
# expand=False makes str.extract return a Series (one capture group) instead of
# a one-column DataFrame.
births_df['parent'] = births_df['parent'].str.extract('(noParent|(?<=parent=)[0-9]+)',
                                                      expand=False)
# Births without a parent are encoded as -1 so the column can be cast to int
births_df['parent'] = np.where(births_df['parent'] == 'noParent',
                              -1, births_df['parent'])
births_df['parent'] = births_df['parent'].astype(int)

births_df.head()

## Sanity check: How many lineages can we expect?

How many Eves?

In [None]:
# Births with a negative parent ID have no parent, i.e. they are Eves
eves = births_df.loc[births_df['parent'] < 0]

# Overall share of births that are Eves
n_births, n_eves = len(births_df), len(eves)
eve_rate = n_eves / n_births

summary = '%i Eves out of %i births (%0.2f%%)' % (n_eves, n_births, eve_rate*100)
print(summary)

Eve spawn rate over time:

In [None]:
# Per-release totals: number of births and number of Eves (births with parent < 0)
eve_rate = (
    births_df
    .assign(is_eve=births_df['parent'] < 0)
    .groupby('release')['is_eve']
    .agg(['count', 'sum'])
    .reset_index()
    .rename(columns={'count': 'n_births', 'sum': 'n_eves'})
)
eve_rate['eve_rate'] = eve_rate['n_eves'] / eve_rate['n_births']

# x and y must be passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 onwards.
ax = sns.lineplot(x='release', y='eve_rate', data=eve_rate)
# Trailing semicolon keeps the list returned by ax.set out of the cell output
ax.set(xlabel='Release #', ylabel='Eve spawn rate');

In [None]:
# Preview the Eve births (first rows only, rather than displaying the whole frame)
eves.head()

## Parse names

Helper functions for filtering the following tokens out of each name record:

* Roman numerals (including some nonsense ones at high-gen numbers)
* Kin labels? (e.g., SHINA1580640)

In [None]:
def is_roman(s):
    """Return True when all of ``s`` is consumed by the Roman-numeral pattern.

    The pattern is deliberately lenient: besides well-formed numerals it also
    accepts the irregular 'LXL' tens group that shows up in the OHOL dataset
    (e.g. 'CLXLIII' => True, although that is not a real numeral). Because
    every part of the pattern is optional, the empty string also counts as a
    match.
    """
    pattern = 'M{0,4}(CM|CD|D?C{0,3})(LXL|XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'

    # re.match always succeeds here (possibly with an empty match), so the
    # question is only whether the matched prefix reaches the end of the string.
    matched_prefix = re.match(pattern, s)
    return matched_prefix.end() == len(s)

def is_kin(s, idx):
    """Return True when ``s`` looks like a kin label and is not the first token.

    A token counts as a kin label when it contains a run of digits, optionally
    preceded by capital letters (e.g. 'SHINA1580640'). The token at position 0
    is never treated as a kin label, whatever it contains.
    """
    has_label = re.search('[A-Z]{0,}[0-9]+', s) is not None
    return has_label and idx > 0

def is_valid(s, idx):
    """Return True when token ``s`` at position ``idx`` should be kept.

    A token is kept only if it is neither a Roman numeral (see is_roman) nor a
    kin label (see is_kin).
    """
    # Evaluate both checks up front, then keep the token only if both fail
    roman = is_roman(s)
    kin = is_kin(s, idx)
    return not (roman or kin)

In [None]:
# Sanity check on a sample record: show the tokens that survive filtering.
# (Emitting is_valid(...) itself would always produce a list of True values,
# because the same predicate is already the comprehension's filter.)
row_data = ['386865', 'MOON', 'EVE', 'ROA']
[si for idx, si in enumerate(row_data) if is_valid(si, idx)]


Find name files:

In [None]:
# Collect every names file under the bigserver2 life-log folders, in sorted order
data_dir = '../data'
name_pattern = opj(data_dir, 'publicLifeLogData', 'lifeLog_bigserver2*', '*names.txt')
name_files = sorted(glob.glob(name_pattern))

# Preview the first five file names, one per line
print('\n'.join(os.path.basename(f) for f in name_files[:5]))

Iterate over name files and extract names:

In [None]:
# Accumulates one [playerID, first, last] record per line across all name files
name_list = []

for f in notebook.tqdm(name_files):
    with open(f, 'r') as handle:
        data_str = handle.read().splitlines()

    for s in data_str:
        # Split the line into whitespace-separated tokens and drop
        # Roman numerals and kin labels
        row_data = [si for idx, si in enumerate(s.split()) if is_valid(si, idx)]

        if len(row_data) > 3:
            # Records with an extra token: drop the first 'EVE' token if there
            # is one. The membership test avoids a ValueError from list.remove
            # when a long record has no 'EVE' token.
            if 'EVE' in row_data:
                row_data.remove('EVE')
            print(row_data)

        # Pad short records with empty strings so every record has at least three fields
        row_data += [''] * (3 - len(row_data))

        name_list.append(row_data)

Assemble into dataframe:

In [None]:
# Build the names table; the three fields of each record map positionally
# onto these column labels
name_columns = ['playerID', 'first', 'last']
name_df = pd.DataFrame(data=name_list, columns=name_columns)
name_df.head()