# Compare success of different lineages
Natalia Vélez, July 2020

Now that we've built a graph representation of family trees, we'll use it to compare lineages and operationalize a success measure.

In [8]:
%matplotlib inline

import os, re, glob, datetime, json
from os.path import join as opj
import pandas as pd
import numpy as np
import scipy.stats
from tqdm import notebook

import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from networkx.drawing.nx_agraph import graphviz_layout

# Seaborn defaults for every figure below: 'talk' plotting context and 'white' axes style
sns.set_context('talk')
sns.set_style('white')

## Set up data

Load lifelog data:

In [9]:
# Lifelog table: tab-separated, with the first column holding the saved row index
era_df = pd.read_table('outputs/all_lifelogs_compact.tsv', index_col=0)
era_df.head()

  mask |= (ar1 == a)


Unnamed: 0,release,era,hash,playerID,tBirth,parent,birth,tDeath,death,age,cause_of_death,birthX,birthY,deathX,deathY,first,last
0,342.0,boundless,2cdc4d0b016f9c0d96d27013f1d19c42596e0c5c,3080130,1592288229,3080111,[-454640 41],1592288538,[-454625 49],5.14,hunger,-454640,41,-454625,49,CLAUDINE,P
1,342.0,boundless,a8b5975a81344f690f45ffc2554a0bc35af557a9,3080128,1592288191,3080105,[-454394 -4],1592291791,[-454383 15],60.0,oldAge,-454394,-4,-454383,15,DACY,LIGHTNING
2,342.0,boundless,c5838da39fd525a2ac24aee049dae6a42e502236,3080125,1592288138,3080101,[-311131 -322],1592291738,[-311112 -325],59.98,hunger,-311131,-322,-311112,-325,SHOKO,GREATHOUSE
3,342.0,boundless,11f9fce50bbc1ebbba5126434aea123e79249942,3080122,1592288029,3080104,[-454638 41],1592288696,[-454647 58],11.12,hunger,-454638,41,-454647,58,RANGER,PICKLE
4,342.0,boundless,b265b1958566b474094cf0282a0fa59e6d622885,3080120,1592287917,3080101,[-311111 -311],1592291365,[-311115 -66],57.46,hunger,-311111,-311,-311115,-66,HAPPY,GREATHOUSE


Load families:

In [10]:
# playerID -> family lookup: tab-separated, with the first column holding the saved row index
fam_df = pd.read_table('outputs/family_playerID.tsv', index_col=0)
fam_df.head()

  mask |= (ar1 == a)


Unnamed: 0,playerID,family
0,3080084,time-1592284232_eve-3080067_name-PICKLE
1,3080114,time-1592284232_eve-3080067_name-PICKLE
2,3080111,time-1592284232_eve-3080067_name-PICKLE
3,3080108,time-1592284232_eve-3080067_name-PICKLE
4,3080104,time-1592284232_eve-3080067_name-PICKLE


Merge:

In [11]:
# Attach each life's family label (inner join on playerID, the pandas default)
era_df = era_df.merge(fam_df, on='playerID')

Remove singleton families:

In [12]:
# Number of lives recorded per family, as a two-column table (family, count)
family_sizes = era_df.groupby('family')['playerID'].count().rename('count').reset_index()

# Families made up of exactly one life
singletons = family_sizes.loc[family_sizes['count'] == 1]
single_fams = singletons['family'].values

print('Removing %i singleton families' % len(single_fams))

# Keep only rows whose family is not a singleton, then renumber the rows
era_df = era_df.loc[~era_df['family'].isin(single_fams)].reset_index(drop=True)

Removing 21322 singleton families


NameError: name 'family' is not defined

Just look at lineages from the boundless world era:

In [None]:
# Lives labelled with the 'boundless' era, and the sorted unique families they belong to
boundless_df = era_df.query("era == 'boundless'").reset_index(drop=True)
boundless_families = np.unique(boundless_df['family'])

# Report the number of families, the number of distinct players, and the first ten family names
print('Analyzing %i families' % len(boundless_families))
print('%i family members' % len(np.unique(boundless_df['playerID'])))
print(*boundless_families[:10], sep='\n')

Analyzing 3084 families
324597 family members
time-1573261529_eve-2252167_name-VIERNES
time-1573261796_eve-2252178_name-BELAND
time-1573261810_eve-2252180_name-BRAND
time-1573261816_eve-2252182_name-GERMAN
time-1573261826_eve-2252186_name-LOLI
time-1573261831_eve-2252188_name-SANDRA
time-1573261840_eve-2252190_name-DEVILLE
time-1573261851_eve-2252192_name-TOMBARI
time-1573261866_eve-2252194_name-SPARTA
time-1573261997_eve-2252229_name-MILLER


In [None]:
# Every life in era_df whose family appears in boundless_families (the filter is on family only)
lineage_df = era_df.loc[era_df['family'].isin(boundless_families)].reset_index(drop=True)
lineage_df.head()

In [None]:
# %Z only prints a zone name for timezone-aware datetimes. Convert the Unix
# timestamps explicitly in UTC so the zone is shown and the printed range does
# not depend on the timezone of the machine running the notebook.
t_fmt = '%Y-%m-%dT%H:%M:%S %Z'

# Earliest birth across all analyzed lineages
start_t = np.min(lineage_df['tBirth'])
start_date = datetime.datetime.fromtimestamp(start_t, tz=datetime.timezone.utc).strftime(t_fmt)

# Latest death across all analyzed lineages
end_t = np.max(lineage_df['tDeath'])
end_date = datetime.datetime.fromtimestamp(end_t, tz=datetime.timezone.utc).strftime(t_fmt)

print('Analyzing %i lineages' % len(boundless_families))
print('First lineage starts at: %s' % start_date)
print('Lineages end at: %s' % end_date)

## Compute summary statistics

### Family size and life expectancy

In [None]:
# Per-family summary of the `age` column: its mean and the number of non-null entries
life_expectancy = (
    lineage_df
    .groupby('family')['age']
    .agg(['mean', 'count'])
    .reset_index()
)

# Distribution of the per-family mean age
ax = sns.distplot(life_expectancy['mean'])
ax.set(xlabel = 'Life expectancy by family')

In [None]:
# Distribution of the per-family 'count' column computed in the previous cell
ax = sns.distplot(life_expectancy['count'])
ax.set(xlabel = 'Total family size')

In [None]:
# Most frequent family size (mode of the per-family counts)
scipy.stats.mode(life_expectancy['count'])

In [None]:
# Joint distribution of per-family mean age (x) and family size (y);
# points are drawn mostly transparent (alpha = 0.1) so overlapping points stay distinguishable
g = sns.jointplot(data=life_expectancy, x = 'mean', y = 'count', alpha = 0.1)
g.set_axis_labels(xlabel='Life expectancy', ylabel='Family size')

### Living population size over time

In [None]:
# Living population of each family, sampled at every birth time in that family.
# A member counts as alive at time ti when tBirth <= ti < tDeath.
#
# The per-family frame is deliberately NOT called `fam_df`: that name holds the
# playerID -> family table loaded earlier, and rebinding it here would break a
# re-run of the merge cell. Grouping once also avoids rescanning the whole
# lineage table for every family.
living_list = []
for fam, members in notebook.tqdm(lineage_df.groupby('family')):
    births = members['tBirth'].values
    deaths = members['tDeath'].values
    for ti in births:
        # Count members born at or before ti that had not yet died at ti
        pop = np.sum((births <= ti) & (deaths > ti))
        living_list.append((fam, ti, pop))

In [None]:
# One row per (family, birth time), ordered by family and then by time
living_df = (
    pd.DataFrame(living_list, columns = ['family', 't', 'population'])
    .sort_values(by=['family', 't'], ascending=True)
    .reset_index(drop=True)
)

# t0 is the first (earliest, given the sort above) sample time within each family
living_df['t0'] = living_df.groupby('family')['t'].transform('first')

# Time since t0, converted from seconds to hours
living_df['t_elapsed'] = living_df['t'].sub(living_df['t0']).div(60).div(60)
living_df.head()

Plot a subset (to-do: sample by quintile)

In [None]:
# Fixed seed so the same ten families are drawn on every run
np.random.seed(526)
random_families = np.random.choice(boundless_families, size=10, replace=False)
random_subset = living_df[living_df['family'].isin(random_families)]

# One population trajectory per sampled family. The legend is switched off through
# the public `legend` argument rather than by reaching into the private `g._legend`.
g = sns.relplot(data=random_subset, x='t_elapsed', y='population', hue='family', kind='line',
                 height=6, aspect=2, alpha = 0.5, legend=False)
g.set(xlabel = 'Time elapsed (hours)', ylabel = 'Population size');

Maximum population size

In [None]:
# Largest simultaneous living population observed in each family
max_pop = living_df.groupby('family')['population'].max().reset_index()

ax = sns.distplot(max_pop['population'])
ax.set(xlabel = 'Maximum living population size')

In [None]:
# Most frequent value of the per-family maximum population
scipy.stats.mode(max_pop['population'])

### Chain length

Helper: Read JSON files

In [None]:
def open_graph(f):
    """Read the JSON file at path `f` and build a networkx graph from its node-link data."""
    with open(f) as json_file:
        return nx.json_graph.node_link_graph(json.load(json_file))

Find longest chain in family graphs

In [None]:
# For each family, record the number of nodes on the longest path of its graph
chain_list = []

for f in notebook.tqdm(boundless_families):
    fam_graph = open_graph('outputs/families/families_%s.json' % f)
    longest_path = nx.algorithms.dag_longest_path(fam_graph)
    chain_list.append((f, len(longest_path)))

In [None]:
# Table of (family, longest chain length) pairs
chain_df = pd.DataFrame.from_records(chain_list, columns=['family', 'chain'])
chain_df.head()

In [None]:
# Most frequent longest-chain length across families
scipy.stats.mode(chain_df['chain'])

In [None]:
# Distribution of longest-chain lengths (number of nodes on the longest path per family)
ax = sns.distplot(chain_df['chain'])
ax.set(xlabel = '# of generations')

## Modeling success

Criterion for "success": Reaching age 14 (viability fitness)

In [None]:
# One row per life: adult = 1 when the recorded age is at least 14, else 0
mortality_df = lineage_df[['family', 'playerID']].copy()
mortality_df['adult'] = (lineage_df['age'] >= 14).astype(int)

# Per family: 'sum' = number of adults, 'count' = number of lives
mortality_summ = mortality_df.groupby('family')['adult'].agg(['sum', 'count']).reset_index()
# The `np.int` alias was removed in NumPy 1.24; the builtin `int` is its drop-in replacement
mortality_summ['sum'] = mortality_summ['sum'].astype(int)
mortality_summ.head()

Compute beta distribution for each family

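Prior: symmetric Beta, with the pseudo-counts used in the code below ($\alpha_0 = \beta_0 = 3$; note that a uniform prior would instead be $\alpha_0 = \beta_0 = 1$)
$$
\theta \sim \mathrm{Beta}(\alpha_0, \beta_0) \\
\alpha_0 = \beta_0 = 3
$$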

Posterior-sufficient statistics:
$$
\theta | D \sim \mathrm{Beta}(\alpha_0 + k, \beta_0 + N - k) \\ 
a = \alpha_0 + k \\
b = \beta_0 + N - k
$$

In [None]:
def beta_mean(row):
    """Mean of the Beta distribution whose shape parameters are row['a'] and row['b']."""
    shape_a, shape_b = row['a'], row['b']
    return scipy.stats.beta.mean(shape_a, shape_b)
def beta_var(row):
    """Variance of the Beta distribution whose shape parameters are row['a'] and row['b']."""
    shape_a, shape_b = row['a'], row['b']
    return scipy.stats.beta.var(shape_a, shape_b)

# Pseudo-counts of the Beta prior on the probability of reaching adulthood.
# NOTE(review): Beta(3, 3) is not a uniform prior (that would be 1 and 1) — confirm the intended value.
ALPHA_0 = 3
BETA_0 = 3

# Posterior Beta(a, b): a = alpha_0 + k adults, b = beta_0 + (N - k) non-adults
mortality_summ['a'] = ALPHA_0 + mortality_summ['sum']
mortality_summ['b'] = BETA_0 + mortality_summ['count'] - mortality_summ['sum']
mortality_summ['beta_mean'] = mortality_summ.apply(beta_mean, axis=1)
mortality_summ['beta_var'] = mortality_summ.apply(beta_var, axis=1)

# Signal-to-noise ratio is the mean over the standard deviation (mu/sigma),
# so divide by the square root of the variance rather than by the variance itself
mortality_summ['snr'] = mortality_summ['beta_mean']/np.sqrt(mortality_summ['beta_var'])

# Posterior mean viability scaled by the number of lives in the family
mortality_summ['weighted_size'] = mortality_summ['beta_mean']*mortality_summ['count']
mortality_summ.to_csv('outputs/family_fitness.tsv', sep='\t', index=False)
mortality_summ.head()

### Compare candidate success measures

Beta mean:

In [None]:
# Distribution of the posterior mean viability across families.
# Raw string: '\m' is not a valid escape sequence in a normal string literal.
ax = sns.distplot(mortality_summ['beta_mean'])
ax.set(xlabel = r'Mean viability ($\mu$)')

Distribution of SNR:

In [None]:
# Distribution of the 'snr' column across families.
# Raw string: '\m' and '\s' are not valid escape sequences in a normal string literal.
ax = sns.distplot(mortality_summ['snr'])
ax.set(xlabel=r'Signal-to-noise ratio ($\mu/\sigma$)')

Distribution of weighted size:

In [None]:
# Distribution of the 'weighted_size' column across families.
# Raw string: '\m' is not a valid escape sequence in a normal string literal.
ax = sns.distplot(mortality_summ['weighted_size'])
ax.set(xlabel=r'Weighted family size ($\mu N$)')

Distribution of # adults:

In [None]:
# Distribution of the per-family 'sum' column (number of lives flagged as adult)
ax = sns.distplot(mortality_summ['sum'])
ax.set(xlabel='# of adults')

### Plot representative families

Split data into quartiles:

In [None]:
# Column of mortality_summ used as the success metric
success = 'sum'

# Quartile index (0-3) of each family on the success metric
mortality_summ['quantile'] = pd.qcut(x=mortality_summ[success], q=4, labels=False)

What are the quantiles?

In [None]:
# Upper edges of the four quartiles of the success metric.
# alphap = betap = 1 selects linear interpolation, the same rule pd.qcut uses to
# place its bin edges; the mquantiles default (0.4, 0.4) can give slightly
# different cut points than the bins the families were actually assigned to.
success_q = scipy.stats.mstats.mquantiles(mortality_summ[success], prob=[0.25, 0.5, 0.75, 1],
                                          alphap=1, betap=1)
success_q

Plot representative family trees from each quartile:

In [None]:
np.random.seed(526)
representative_families = np.array([np.random.choice(group['family'], 10) 
                                    for name,group in mortality_summ.groupby('quantile')])
rep_list = np.array(representative_families)
rep_list = rep_list.flatten()

rep_info = mortality_summ.copy()
rep_info = rep_info[rep_info['family'].isin(rep_list)]
rep_info = rep_info.reset_index(drop=True)
rep_info = rep_info.sort_values('quantile')
rep_info.to_csv('plots/fitness_quantiles/selected_families.tsv', sep='\t', index=None)
rep_info.head()

In [None]:
# playerID -> adult flag lookup, built once instead of once per plotted family
adult_by_player = dict(zip(mortality_df['playerID'], mortality_df['adult']))

# Draw one family tree per selected family; files are named by quartile (Q1-Q4) and family
for quant in notebook.tqdm(range(len(representative_families))):
    for f in notebook.tqdm(representative_families[quant]):
        fam_file = 'outputs/families/families_%s.json' % f
        out_file = 'plots/fitness_quantiles/families_Q%i_%s.png' % (quant+1, f)

        fam_graph = open_graph(fam_file)

        # Figure size (based on graphviz layout): the extent of the layout
        # coordinates along each axis, divided by 150
        # NOTE(review): fam.dot is written on every iteration but not read back in this cell
        nx.nx_agraph.write_dot(fam_graph,'fam.dot')
        pos = graphviz_layout(fam_graph, prog='dot')
        coords = np.array(list(pos.values()))
        w = (coords[:, 0].max() - coords[:, 0].min())/150
        h = (coords[:, 1].max() - coords[:, 1].min())/150

        # Adjust for 2-member families: enforce a minimum figure size of 2 x 2
        w = max(w, 2)
        h = max(h, 2)

        # Node color (based on whether individuals reached maturity);
        # graph node ids are converted to int to match the playerID keys
        fam_nodes = [int(n) for n in fam_graph.nodes]
        fam_color = ['#4ab1ff' if adult_by_player[n] == 1 else '#cccccc' for n in fam_nodes]

        # Use a fresh figure per family (rather than reusing a fixed figure number)
        # so the requested size always applies, and close it to free memory
        fig = plt.figure(figsize=(w,h))
        nx.draw(fam_graph, pos, with_labels=False, arrows=True, node_color=fam_color)
        fig.savefig(out_file, transparent=True)
        plt.close(fig)

## Plots for talk

Where are our representative families along this distribution?

In [None]:
# log10 of the number of adults per family and of the quartile cut points.
# Families with zero adults give log10(0) = -inf: compute without the
# divide-by-zero warning, and keep non-finite values out of the plot below.
with np.errstate(divide='ignore'):
    mortality_summ['log_n'] = np.log10(mortality_summ['sum'])
    q_log = np.log10(np.asarray(success_q, dtype=float))

plt.figure(figsize=(12,4))

# Dashed vertical line at each (finite) quartile cut point
for q in q_log[np.isfinite(q_log)]:
    plt.axvline(q, color='#aaaaaa', linestyle='--')

# Histogram of the finite log counts only; -inf would make the bin range undefined
finite_log_n = mortality_summ.loc[np.isfinite(mortality_summ['log_n']), 'log_n']
ax = sns.distplot(finite_log_n,bins=10)
ax.set_xlim(left=0)

# Ticks at integer powers of ten, labelled 10^0 ... 10^4
ax.set_xticks(range(5))
labels = ['$10^{%i}$' % t for t in ax.get_xticks()]
ax.set(xlabel='# of adults', xticklabels=labels)
sns.despine()