# Lineages
Natalia Vélez, July 2020

The goal of this notebook is to construct family graphs (directed graphs with edges from parent → child) out of the lifelog data. These family trees will be used in several other analyses.

In [2]:
%matplotlib inline

import os, re, glob, json
import pandas as pd
import numpy as np

import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from networkx.drawing.nx_agraph import graphviz_layout

from os.path import join as opj
from tqdm import notebook
from ast import literal_eval as make_tuple

sns.set_context('talk')
sns.set_style('white')

def gsearch(*args):
    """Join the path components in `args` and return the glob matches as a list."""
    pattern = opj(*args)
    return glob.glob(pattern)

Get data from all eras:

In [24]:
# Sort the matches so the concatenation order is reproducible: glob returns
# paths in arbitrary, filesystem-dependent order.
era_files = sorted(glob.glob('outputs/lifelogs*_data.tsv'))
if not era_files:
    # Without this guard pd.concat fails below with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No files match 'outputs/lifelogs*_data.tsv'")

# Read each per-era table (first column is the index) and stack them
era_list = []
for f in era_files:
    era_data = pd.read_csv(f, sep='\t', index_col=0)
    era_list.append(era_data)

era_df = pd.concat(era_list)
era_df = era_df.drop(columns=['server', 'release', 'era'])
era_df.head()

  mask |= (ar1 == a)


Unnamed: 0,event,timestamp,avatar,player,age,sex,location,parent,cause_of_death,killer
0,B,1573257612,2251779.0,9b90770e9c4144721a6abe58faa161ef6b555786,,F,"(-334, -521)",2251499.0,,
1,D,1573257621,2251778.0,c17d2f3663837c82e4f7669a40aa6f618e89e57a,0.54,F,"(-361, -410)",,hunger,
2,B,1573257622,2251780.0,afb7da00e454ba6dac3cddbccfb8c14c5680ab1b,,F,"(-184, -916)",2251439.0,,
3,B,1573257623,2251781.0,abbebf5a1056fd091dff74135c014aa1081435d8,,F,"(-416, -524)",2251557.0,,
4,D,1573257624,2251740.0,bd6841eb13118ce2a0453e5802b91f7ec0c1e20e,3.72,F,"(-238, -855)",,hunger,


Find players' parents and time/location of birth:

In [26]:
# Columns that jointly identify one life in both the birth and death tables
idx_vars = ['player', 'avatar']

# Keep only birth events ('B') and the columns describing the birth itself
births = (
    era_df
    .loc[era_df['event'] == 'B', idx_vars + ['timestamp', 'parent', 'location']]
    .rename(columns={'location': 'birth', 'timestamp': 'tBirth'})
)
births.head()

Unnamed: 0,player,avatar,tBirth,parent,birth
0,9b90770e9c4144721a6abe58faa161ef6b555786,2251779.0,1573257612,2251499,"(-334, -521)"
2,afb7da00e454ba6dac3cddbccfb8c14c5680ab1b,2251780.0,1573257622,2251439,"(-184, -916)"
3,abbebf5a1056fd091dff74135c014aa1081435d8,2251781.0,1573257623,2251557,"(-416, -524)"
7,bd6841eb13118ce2a0453e5802b91f7ec0c1e20e,2251782.0,1573257629,2251463,"(-156, -855)"
9,40a960756347fa0a7c3d14791bd0379c6cc754cf,2251783.0,1573257634,2251595,"(-155, -962)"


Find time and circumstances of death:

In [27]:
# Keep only death events ('D') and the columns describing the death itself
deaths = (
    era_df
    .loc[era_df['event'] == 'D', idx_vars + ['timestamp', 'location', 'age', 'cause_of_death']]
    .rename(columns={'location': 'death', 'timestamp': 'tDeath'})
)

deaths.head()

Unnamed: 0,player,avatar,tDeath,death,age,cause_of_death
1,c17d2f3663837c82e4f7669a40aa6f618e89e57a,2251778.0,1573257621,"(-361, -410)",0.54,hunger
4,bd6841eb13118ce2a0453e5802b91f7ec0c1e20e,2251740.0,1573257624,"(-238, -855)",3.72,hunger
5,7e058600565826811059c93c9ce403d7b6c4533b,2251577.0,1573257625,"(-201, -903)",21.72,hunger
6,40a960756347fa0a7c3d14791bd0379c6cc754cf,2251776.0,1573257626,"(-765, -581)",0.69,hunger
8,ac7e03fde959b54812abf9798cc25cf55b3d6baa,2251722.0,1573257632,"(-323, -508)",5.9,hunger


## Merge births and deaths

**DEBUG:** Look for missing player in births and deaths

In [28]:
# Spot-check a single avatar ID in both the birth and the death table
missing_id = 2279990
print('[DEBUG] Searching for: %i' % missing_id)
for events in (births, deaths):
    print(events[events['avatar'] == missing_id])

[DEBUG] Searching for: 2279990
                                         player     avatar      tBirth  \
53530  dab8f320de3602d9a7543ffeae4f39a6e14cafbc  2279990.0  1573948575   

        parent         birth  
53530  2279930  (-6452, 369)  
                                         player     avatar      tDeath  \
53745  dab8f320de3602d9a7543ffeae4f39a6e14cafbc  2279990.0  1573951034   

              death    age cause_of_death  
53745  (-6441, 368)  40.98         hunger  


**DEBUG:** Unique avatar IDs in births and deaths

In [29]:
# Distinct avatar IDs seen in each table
unique_births = np.unique(births['avatar'])
unique_deaths = np.unique(deaths['avatar'])

# Every avatar that appears in at least one of the two tables
unique_avatars = np.union1d(unique_births, unique_deaths)

# Avatars that appear in only one table, and those that appear in both
not_in_deaths = np.setdiff1d(unique_births, unique_deaths)
not_in_births = np.setdiff1d(unique_deaths, unique_births)
in_both = np.intersect1d(unique_births, unique_deaths)

print('%i unique avatars' % len(unique_avatars))
print('%i in both' % len(in_both))
print('%i not in deaths: %s...' % (len(not_in_deaths), not_in_deaths[:5],))
print('%i not in births: %s...' % (len(not_in_births), not_in_births[:5],))

2934697 unique avatars
2929522 in both
4912 not in deaths: [ 221. 2850. 2851. 2852. 2937.]...
263 not in births: [525308. 525311. 525312. 525332. 525358.]...


Merge & clean up:

In [30]:
# Report table sizes before and after an outer merge, which keeps avatars
# that are missing from either the birth or the death table
print('Births: %s' % (births.shape,))
print('Deaths: %s' % (deaths.shape,))

life_df = births.merge(deaths, on=idx_vars, how='outer')
print('Merged dataframe: %s' % (life_df.shape,))

Births: (2934434, 5)
Deaths: (2929785, 6)
Merged dataframe: (2934697, 9)


In [45]:
# Inspect the merged row for the last avatar that has a death but no birth
life_df.loc[life_df['avatar'] == not_in_births[-1]]

Unnamed: 0,player,avatar,tBirth,parent,birth,tDeath,death,age,cause_of_death
2934518,ef551ed04f7b4c8195142dcaa00aa144c2b3e2bc,2549358.0,,,,1578615000.0,"(-6189, 85)",3.78,hunger


In [26]:
# Inner merge: keep only avatars that have both a birth and a death record
life_df = pd.merge(births, deaths, on=idx_vars)

# Turn birth/death locations from "(x, y)" strings into arrays
print('Birth/death locations...')
life_df['birth'] = life_df['birth'].apply(make_tuple).apply(np.array)
life_df['death'] = life_df['death'].apply(make_tuple).apply(np.array)

# Split coordinates into separate x/y columns
print('Splitting into x/y coords...')
life_df[['birthX', 'birthY']] = pd.DataFrame(life_df['birth'].tolist(),
                                              index=life_df.index)
life_df[['deathX', 'deathY']] = pd.DataFrame(life_df['death'].tolist(),
                                              index=life_df.index)

# Parse player IDs
# (builtin int instead of np.int: the alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError)
print('Parsing player IDs...')
life_df['avatar'] = life_df['avatar'].astype(int)

# Parse parent IDs: 'noParent' entries are recoded as -1 so that the column
# can be cast to integers
print('Parsing parents...')
life_df['parent'] = life_df['parent'].str.replace('noParent', '-1')
life_df['parent'] = life_df['parent'].astype(int)

# Order from most recent
print('Cleaning up...')
life_df = life_df.sort_values('tBirth', ascending=False)
print(life_df.shape)
life_df.head()

Birth/death locations...
Splitting into x/y coords...
Parsing player IDs...
Parsing parents...
Cleaning up...
(2922932, 15)


Unnamed: 0,release,era,player,avatar,tBirth,parent,birth,tDeath,death,age,cause_of_death,birthX,birthY,deathX,deathY
772934,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080253,1592294343,3080205,"[-454363, 94]",1592294346,"[-454363, 94]",0.05,disconnect,-454363,94,-454363,94
772933,342.0,boundless,b2d09ba08ab6c9880104599a8bbfd91d9fa8e70d,3080250,1592294329,3080205,"[-454363, 94]",1592294335,"[-454363, 94]",0.1,disconnect,-454363,94,-454363,94
772932,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080248,1592294261,3080224,"[-454899, 207]",1592294337,"[-454918, 159]",1.27,disconnect,-454899,207,-454918,159
772931,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080247,1592294247,3080235,"[-454796, -125]",1592294255,"[-454796, -125]",0.13,disconnect,-454796,-125,-454796,-125
772930,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080246,1592294231,3080224,"[-454899, 206]",1592294241,"[-454899, 206]",0.17,disconnect,-454899,206,-454899,206


In [27]:
# Re-check the size and the first rows of the cleaned table
print(life_df.shape)
life_df.head(n=5)

(2922932, 15)


Unnamed: 0,release,era,player,avatar,tBirth,parent,birth,tDeath,death,age,cause_of_death,birthX,birthY,deathX,deathY
772934,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080253,1592294343,3080205,"[-454363, 94]",1592294346,"[-454363, 94]",0.05,disconnect,-454363,94,-454363,94
772933,342.0,boundless,b2d09ba08ab6c9880104599a8bbfd91d9fa8e70d,3080250,1592294329,3080205,"[-454363, 94]",1592294335,"[-454363, 94]",0.1,disconnect,-454363,94,-454363,94
772932,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080248,1592294261,3080224,"[-454899, 207]",1592294337,"[-454918, 159]",1.27,disconnect,-454899,207,-454918,159
772931,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080247,1592294247,3080235,"[-454796, -125]",1592294255,"[-454796, -125]",0.13,disconnect,-454796,-125,-454796,-125
772930,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080246,1592294231,3080224,"[-454899, 206]",1592294241,"[-454899, 206]",0.17,disconnect,-454899,206,-454899,206


In [28]:
# Check whether the avatar inspected earlier survived the merge
life_df.loc[life_df['avatar'] == missing_id]

Unnamed: 0,release,era,player,avatar,tBirth,parent,birth,tDeath,death,age,cause_of_death,birthX,birthY,deathX,deathY


## Parse names

Helper functions: filter the following tokens out of the name data:

* Roman numerals (including some nonsense ones at high-gen numbers)
* Kin labels? (e.g., SHINA1580640)

In [None]:
def is_roman(s):
    """Return True when all of `s` is consumed by the Roman-numeral pattern.

    The pattern is deliberately lenient so that it also accepts some of the
    idiosyncratic numerals in the OHOL dataset, e.g. CLXLIII => True (even
    though that is not a real numeral).
    """
    roman_regex = 'M{0,4}(CM|CD|D?C{0,3})(LXL|XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'

    # Every part of the pattern is optional, so re.match always succeeds
    # (possibly with an empty match); `s` only counts as a numeral when the
    # match runs all the way to the end of the string.
    return re.match(roman_regex, s).end() == len(s)

def is_kin(s, idx):
    """Return True if `s` contains a run of digits and is not the first token (`idx` > 0)."""
    kin_regex = '[A-Z]{0,}[0-9]+'
    has_label = re.search(kin_regex, s) is not None
    not_first = idx > 0
    return has_label & not_first

def is_valid(s, idx):
    """Return True if token `s` at position `idx` is neither a Roman numeral nor a kin label."""
    # Both checks are always evaluated (no short-circuit)
    roman = is_roman(s)
    kin = is_kin(s, idx)

    return not (roman | kin)

Find name files:

In [None]:
data_dir = '../data'

# Collect the name files in sorted order and preview the first few
name_pattern = opj(data_dir, 'publicLifeLogData', 'lifeLog_bigserver2*', '*names.txt')
name_files = sorted(glob.glob(name_pattern))
print('\n'.join(os.path.basename(f) for f in name_files[:5]))

Iterate over name files and extract names:

In [None]:
# One record per line of every name file; each record is padded to 3 fields
name_list = []

for f in notebook.tqdm(name_files):
    with open(f, 'r') as handle:
        data_str = handle.read().splitlines()

    for s in data_str:
        # Split the line on whitespace, then drop Roman numerals and kin labels
        row_data = [si for idx, si in enumerate(s.split()) if is_valid(si, idx)]

        if len(row_data) > 3:
            # NOTE(review): assumes an over-long row always contains a literal
            # 'EVE' token; list.remove raises ValueError otherwise, and only
            # the first occurrence is removed.
            row_data.remove('EVE')
            print('Corrected: %s' % row_data)

        # Pad short rows with empty strings
        while len(row_data) < 3:
            row_data.append('')

        name_list.append(row_data)

Assemble into dataframe:

In [None]:
name_df = pd.DataFrame(name_list, columns=['playerID', 'first', 'last'])
# Builtin int instead of np.int: the alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError
name_df['playerID'] = name_df['playerID'].astype(int)
name_df.head()

Merge with `life_df`:

In [None]:
# life_df is keyed on 'avatar' (it has no 'playerID' column at this point), so
# merging with on='playerID' raises a KeyError. Match avatar against the name
# table's playerID instead; the result keeps both columns, so later cells
# that read life_df['playerID'] still work.
life_df = pd.merge(life_df, name_df, left_on='avatar', right_on='playerID')
life_df.head()

Save `life_df` to file:

In [None]:
# Write the compact lifelog table as a tab-separated file
life_df.to_csv(path_or_buf='outputs/all_lifelogs_compact.tsv', sep='\t')

## Sanity check: How many lineages can we expect?

Spot eves:

In [None]:
# Rows with a negative parent ID are the ones recoded from 'noParent'
eves = life_df.loc[life_df['parent'] < 0].reset_index()
print(life_df['playerID'].iloc[:10])

How many eves?

In [None]:
# Share of all recorded lives that started without a parent
n_eves = len(eves)
n_births = life_df.shape[0]
eve_rate = n_eves/n_births
print('%i Eves out of %i births (%0.2f%%)' % (n_eves, n_births, eve_rate*100))

Eve spawn rate over time:

In [None]:
# NOTE(review): this cell groups by a `release` column, but the loading cell
# drops `release` from era_df -- confirm the column is present in life_df.
eve_rate = life_df.copy()
eve_rate['is_eve'] = eve_rate['parent'] < 0

# Per release: number of births ('count') and number of Eves ('sum' of the flag)
eve_rate = eve_rate.groupby('release')['is_eve'].agg(['count', 'sum']).reset_index()
eve_rate = eve_rate.rename(columns={'count': 'n_births', 'sum': 'n_eves'})
eve_rate['eve_rate'] = eve_rate['n_eves']/eve_rate['n_births']

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.lineplot(x='release', y='eve_rate', data=eve_rate)
ax.set(xlabel = 'Release #', ylabel = 'Eve spawn rate');

## Build family trees

Helper: Search recursively through lifelogs, starting with Eve

In [None]:
def search_fam(player):
    """Return an array with the IDs of every descendant of `player`.

    Reads the module-level `parent_df` table (columns 'playerID' and 'parent').
    The result lists the direct children first, followed by the descendants
    of each child in turn.
    """
    children = parent_df.loc[parent_df['parent'] == player, 'playerID'].values

    # Recurse into each child and stitch the branches together after the
    # children themselves
    branches = [children] + [search_fam(child) for child in children]

    return np.concatenate(branches)

Helper: Write data to JSON file

In [None]:
def write_json(data, f):
    """Serialize `data` as JSON into the file at path `f`, overwriting any existing file."""
    with open(f, 'w') as handle:
        json.dump(obj=data, fp=handle)

All parent-child pairs:

In [None]:
# All parent-child links
parent_df = life_df[['playerID', 'parent']].copy()
parent_df.head()

Main loop: Build graphs from parent-child pairs:

In [None]:
families_list = []

# Make sure the output folder exists before the first family file is written
os.makedirs('outputs/families', exist_ok=True)

for _, family in notebook.tqdm(eves.iterrows(), total=n_eves):

    eve = family['playerID']
    fam_name = family['last']
    fam_start = family['tBirth']

    # Descendants of this eve, and the parent -> child edges that end in them
    fam_nodes = list(search_fam(eve))
    fam_df = parent_df[parent_df['playerID'].isin(fam_nodes)]
    fam = nx.from_pandas_edgelist(fam_df, 'parent', 'playerID', None, nx.DiGraph())

    # Register eve under the same integer ID the edges use. Adding str(eve)
    # created a second, disconnected string node next to the integer one (or a
    # lone string node for childless eves).
    fam.add_node(int(eve))

    all_members = fam_nodes + [eve]

    if not len(fam_name):
        fam_name = 'nameless'

    fam_id = 'time-%i_eve-%i_name-%s' % (fam_start, eve, fam_name)

    # Add family to list
    families_list.extend([(relative, fam_id) for relative in all_members])

    # Save family data
    out_file = 'outputs/families/families_%s.json' % fam_id
    fam_data = nx.json_graph.node_link_data(fam)
    write_json(fam_data, out_file)

### Tag lifelogs by family
This will be used for several subsequent analyses (e.g., migration, comparing different success measures)

In [None]:
# One row per (member, family ID) pair collected in the main loop
families_df = pd.DataFrame(data=families_list, columns=['playerID', 'family'])
families_df.to_csv(path_or_buf='outputs/family_playerID.tsv', sep='\t')