# Split data into epochs

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import notebook

import sys
sys.path.append('..')
from utils import gsearch, str_extract, int_extract, to_date

Data files:

In [2]:
map_dir = '../../data/publicMapChangeData/bigserver2.onehouronelife.com/'

# Map seed change logs, ordered by path
seed_files = sorted(gsearch(map_dir, '*time_mapSeed.txt'))

# Seed change times: the run of digits directly before "time" in each path
# (presumably a Unix timestamp, given that to_date renders it as a date)
seed_times = [int_extract('([0-9]+)(?=time)', seed_path) for seed_path in seed_files]

# One formatted seed change time per line
print(*map(to_date, seed_times), sep='\n')

2019-11-16 04:14:33
2019-11-18 13:41:43
2019-12-10 23:31:11
2020-01-06 16:22:00
2020-01-06 18:52:27
2020-01-22 12:18:39
2020-01-27 12:08:16
2020-02-17 19:18:59
2020-03-07 23:48:23
2020-03-12 21:04:44
2020-03-28 20:08:31
2020-03-29 16:12:50
2020-03-30 17:24:41
2020-04-17 19:37:36
2020-10-29 14:54:07
2020-12-19 16:01:14
2020-12-28 10:47:31


## Check: Are these epochs separate from one another? 

Note: This section violates the pipeline order (it depends on outputs from `2_demographics`). It will be moved later.

Get times at which each family started

In [3]:
# One row per family. The `eve-<digits>` and `time-<digits>` fields embedded in
# the family name are pulled out into their own columns, and only families whose
# start time is at or after the first seed change are kept.
eve_df = (
    pd.read_csv('../2_demographics/outputs/family_fitness.tsv', sep='\t')
    .assign(
        eve=lambda fams: fams['family'].apply(lambda name: int_extract('(?<=eve-)[0-9]+', name)),
        start_t=lambda fams: fams['family'].apply(lambda name: int_extract('(?<=time-)[0-9]+', name)),
    )
    .loc[:, ['family', 'eve', 'start_t']]
    .loc[lambda fams: fams['start_t'] >= seed_times[0]]
)

print(eve_df.shape)
eve_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../2_demographics/outputs/family_fitness.tsv'

Select only families that started after the new map log format was introduced
(to-do: in the future, it would be better to build this into the data wrangling pipeline)

In [None]:
# Inner join on `family`: keeps only generation rows for families present in eve_df
fam_df = (
    pd.read_csv('../2_demographics/outputs/family_generations.tsv', sep='\t')
    .merge(eve_df, on='family')
)

print(fam_df.shape)
fam_df.head()

Load all lifelogs

In [4]:
# Tab-separated lifelog table; the file's first column becomes the index
life_df = pd.read_csv(
    '../2_demographics/outputs/all_lifelogs_compact.tsv',
    sep='\t',
    index_col=0,
)
life_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../2_demographics/outputs/all_lifelogs_compact.tsv'

Return only lifelogs for members of families that passed the inclusion criteria

In [None]:
# Inner join on `avatar`: lifelog rows without a match in fam_df are dropped
fam_life = life_df.merge(fam_df, on='avatar')

# Shapes before and after the join, one per line
print(life_df.shape, fam_life.shape, sep='\n')
fam_life.head()

In [None]:
# Earliest birth and latest death across all retained lifelogs
print('First birth: %s' % to_date(fam_life['tBirth'].min()))
print('Last death: %s' % to_date(fam_life['tDeath'].max()))

Assign each family member's birth and death to the most recent epoch that started at or before that time

In [None]:
def closest_epoch(t):
    """Return the formatted start of the epoch that time `t` falls in.

    The epoch is the latest entry of the module-level `seed_times` that is
    less than or equal to `t`.

    Parameters
    ----------
    t : number
        Time on the same scale as the entries of `seed_times`.

    Returns
    -------
    Whatever `to_date` returns for the selected seed change time.

    Raises
    ------
    ValueError
        If no entry of `seed_times` is <= `t` (this includes a NaN `t`,
        for which every comparison is False).
    """
    # Among seed times that are <= t, the nearest one is simply the largest
    epoch_start = max((t_s for t_s in seed_times if t_s <= t), default=None)
    if epoch_start is None:
        raise ValueError('No map seed change at or before t=%r' % (t,))
    return to_date(epoch_start)

In [None]:
# Keep only rows with a finite death time, then take an explicit copy of the
# filtered frame. Copying after the filter (not before) means the column
# assignments below write to an independent frame and do not trigger
# SettingWithCopyWarning.
life_epochs = fam_life.loc[np.isfinite(fam_life['tDeath'])].copy()

# Epoch (formatted seed change time) in effect at birth and at death
life_epochs['eBirth'] = life_epochs['tBirth'].apply(closest_epoch)
life_epochs['eDeath'] = life_epochs['tDeath'].apply(closest_epoch)
life_epochs.head()

Does each individual live within the span of a single epoch?

In [None]:
# Rows whose birth epoch and death epoch differ
epoch_crossers = life_epochs.loc[life_epochs['eBirth'].ne(life_epochs['eDeath'])]
print(epoch_crossers.shape)
epoch_crossers.head()

Are any boundary crossings particularly common?

In [None]:
# Number of non-null avatars for every (birth epoch, death epoch) pair
common_crossings = (
    epoch_crossers
    .groupby(['eBirth', 'eDeath'])['avatar']
    .count()
    .reset_index()
)
common_crossings

Are there regular periods in which families wink in and out?

In [None]:
# Span of each family: earliest birth and latest death among its members
fam_span = life_epochs.groupby('family').agg({'tBirth': 'min', 'tDeath': 'max'}).reset_index()
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
fam_span['tDeath'] = fam_span['tDeath'].astype(int)

# Time range
first_fam = np.min(fam_span.tBirth)
last_fam = np.max(fam_span.tDeath)

# Living families: at each of 10000 evenly spaced sample times, count the
# families whose span covers that time (tBirth <= t < tDeath)
t_vec = np.linspace(first_fam, last_fam+2, 10000)
living_families = []
for t in notebook.tqdm(t_vec):
    living_families.append(np.sum((fam_span['tBirth'] <= t) & (fam_span['tDeath'] > t)))

living_families = np.array(living_families)

In [None]:
# Element-wise day formatter. `otypes` is set explicitly because np.vectorize
# otherwise infers the output type by calling the function on the first element,
# which raises ValueError on a size-0 input (i.e. when no extinction is found).
date_vec = np.vectorize(lambda t: to_date(t, fmt='%Y-%m-%d'), otypes=[str])

# Sample times at which no family was alive, and the distinct days they fall on
extinction_times = t_vec[living_families == 0]
extinction_days = np.unique(date_vec(extinction_times))

extinction_days

In [None]:
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(t_vec, living_families)

# axvline() takes a single scalar x and no format string (its second positional
# argument is ymin), so it cannot draw an array of markers. vlines() draws them
# all at once; with the x-axis transform, y=0..1 is in axes coordinates, so each
# line spans the full plot height as axvline would.
ax.vlines(
    extinction_times, 0, 1,
    transform=ax.get_xaxis_transform(),
    colors='r', linestyles='--',
)
ax.set(xlabel='Time', ylabel='Living families')

In [None]:
# Seed change times again, one formatted date per line, for comparison with the results above
print(*map(to_date, seed_times), sep='\n')