# Detecting discoveries
Natalia Vélez, July 2020

In [1]:
%matplotlib inline

import os, re, glob
import pandas as pd
import numpy as np

from os.path import join as opj

from gini import gini
from statsmodels.distributions.empirical_distribution import ECDF

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import notebook

sns.set_style("white")
sns.set_context("talk")

## Prepare data

Helper: Extract timestamp from filenames

In [2]:
def file_start(f):
    t0 = re.search('((?<=start-)|(?<=time-))[0-9]+', f).group(0)
    return int(t0)

Family labels:

In [18]:
fam_file = '../2_demographics/outputs/family_playerID.tsv'
fam_df = pd.read_csv(fam_file, sep='\t', index_col=0)
fam_df = fam_df.rename(columns={'playerID':'player_id'})
fam_df['fam_start'] = fam_df['family'].apply(file_start)
fam_df.head()

  mask |= (ar1 == a)


Unnamed: 0,player_id,family,fam_start
0,3080084,time-1592284232_eve-3080067_name-PICKLE,1592284232
1,3080114,time-1592284232_eve-3080067_name-PICKLE,1592284232
2,3080111,time-1592284232_eve-3080067_name-PICKLE,1592284232
3,3080108,time-1592284232_eve-3080067_name-PICKLE,1592284232
4,3080104,time-1592284232_eve-3080067_name-PICKLE,1592284232


Family fitness:

In [4]:
fit_file = '../2_demographics/outputs/family_fitness.tsv'
fit_df = pd.read_csv(fit_file, sep='\t', index_col=None)
print('Analyzing %i families' % fit_df.shape[0])
fit_df.head()

Analyzing 3084 families


Unnamed: 0,family,sum,count,a,b,beta_mean,beta_var,snr,weighted_size
0,time-1573261529_eve-2252167_name-VIERNES,1,2,4,4,0.5,0.027778,18.0,1.0
1,time-1573261796_eve-2252178_name-BELAND,2,8,5,9,0.357143,0.015306,23.333333,2.857143
2,time-1573261810_eve-2252180_name-BRAND,4,11,7,10,0.411765,0.013456,30.6,4.529412
3,time-1573261816_eve-2252182_name-GERMAN,4,13,7,12,0.368421,0.011634,31.666667,4.789474
4,time-1573261826_eve-2252186_name-LOLI,4,18,7,17,0.291667,0.008264,35.294118,5.25


Object depth:

In [72]:
depth_file = '../../tech_tree/transition.csv'
depth_df = pd.read_csv(depth_file)
depth_df = depth_df.rename(columns={'id': 'object_id', 'num_ingredients': 'depth'})
depth_df = depth_df[['object_id', 'depth']]
depth_df.head()

Unnamed: 0,object_id,depth
0,11,0
1,19,0
2,30,0
3,31,1
4,32,0


Find map change files:

In [5]:
gsearch = lambda *args: glob.glob(opj(*args))
map_dir = 'outputs/maplog/'

map_files = gsearch(map_dir, '*.tsv')
map_files.sort()

print('Found %i files' % len(map_files))
print(*map_files[:10], sep='\n')

Found 236 files
outputs/maplog/maplog_release-284_start-1573895672.tsv
outputs/maplog/maplog_release-284_start-1573982073.tsv
outputs/maplog/maplog_release-284_start-1574068473.tsv
outputs/maplog/maplog_release-285_start-1574102503.tsv
outputs/maplog/maplog_release-287_start-1574151678.tsv
outputs/maplog/maplog_release-287_start-1574238079.tsv
outputs/maplog/maplog_release-287_start-1574324479.tsv
outputs/maplog/maplog_release-287_start-1574410879.tsv
outputs/maplog/maplog_release-287_start-1574497279.tsv
outputs/maplog/maplog_release-289_start-1574552311.tsv


Find map seed changes:

In [6]:
seed_file = 'outputs/seed_changes.txt'
with open(seed_file, 'r') as handle:
    seed_data = handle.read().splitlines()

seed_changes = np.array([int(s) for s in seed_data])
seed_changes = np.sort(seed_changes)

print(seed_changes)

[1573895673 1574102503 1576038671 1578345720 1578354747 1579713519
 1580144896 1581985139 1583642903 1584061484 1585440511 1585512770
 1585603481 1587166656]


Find seed file corresponding to timestamp:

In [7]:
def find_seed(tstamp):
    
    lag = tstamp - seed_changes
    seeds = seed_changes[lag >= 0]
    if len(seeds):
        seed = seeds[-1]
    else: # Special: First log file
        seed = seed_changes[0]
            
    return seed

Group mapchange files by world seed:

In [8]:
file_df = pd.DataFrame(map_files, columns=['file'])
file_df['tstamp'] = file_df.file.str.extract('(?<=start-)([0-9]+)')
file_df['tstamp'] = file_df['tstamp'].astype(np.int)
file_df['seed'] = file_df.tstamp.apply(find_seed)
file_df = file_df.sort_values('tstamp')
file_df['seed_start'] = file_df.groupby('seed')['tstamp'].transform('min')

file_df.head()

Unnamed: 0,file,tstamp,seed,seed_start
0,outputs/maplog/maplog_release-284_start-157389...,1573895672,1573895673,1573895672
1,outputs/maplog/maplog_release-284_start-157398...,1573982073,1573895673,1573895672
2,outputs/maplog/maplog_release-284_start-157406...,1574068473,1573895673,1573895672
3,outputs/maplog/maplog_release-285_start-157410...,1574102503,1574102503,1574102503
4,outputs/maplog/maplog_release-287_start-157415...,1574151678,1574102503,1574102503


## Identify discoveries

Helper function: Clean up individual map change files

In [21]:
def process_maplog(f):
    s_df = pd.read_csv(f, sep='\t', index_col=None)
    
    # Fix timestamps to start of world seed
    t_log = file_start(f)
    s_df['t_epoch'] = s_df['t_elapsed'] + t_log

    # Player events only
    s_df = s_df[s_df['player_id'] > 0]

    # Parse object IDs, removing special identifiers
    s_df['object_id'] = s_df.object_id.str.replace('(^f|v[0-9]+|u[0-9]+)', '')
    s_df['object_id'] = s_df['object_id'].astype(np.int)

    # Only interactions with valid objects
    s_df = s_df[(s_df['object_id'] > 0) & (s_df['object_id'] < 5000)]

    # Tag players by family
    s_df = pd.merge(s_df, fam_df, on='player_id')
    s_df['t_fam'] = s_df['t_epoch'] - s_df['fam_start'] # t=0 at Eve birth

    return s_df

Helper function: Identify discoveries

In [76]:
discoveries

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t_fam,x,y,player_id
seed,family,object_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1573895673,time-1573895755_eve-2276905_name-ZABICKI,30,343.95,-5124.0,-1391.0,2276906.0
1573895673,time-1573895755_eve-2276905_name-ZABICKI,31,502.98,-5108.0,-1400.0,2276914.0
1573895673,time-1573895755_eve-2276905_name-ZABICKI,33,1416.72,-5115.0,-1410.0,2276930.0
1573895673,time-1573895755_eve-2276905_name-ZABICKI,34,737.11,-5109.0,-1381.0,2276909.0
1573895673,time-1573895755_eve-2276905_name-ZABICKI,45,704.62,-5106.0,-1419.0,2276909.0
1573895673,...,...,...,...,...,...
1573895673,time-1574097243_eve-2286236_name-WALUS,3051,400.26,-12540.0,-806.0,2286236.0
1573895673,time-1574097243_eve-2286236_name-WALUS,3176,3390.52,-12504.0,-738.0,2286326.0
1573895673,time-1574097243_eve-2286236_name-WALUS,3177,4318.23,-12569.0,-691.0,2286400.0
1573895673,time-1574097243_eve-2286236_name-WALUS,3179,3385.79,-12505.0,-738.0,2286326.0


In [78]:
#def id_discoveries(maplog):
maplog=seed_df.copy()   

# Prepare dataframe
discoveries = maplog.copy()
discoveries = discoveries[['seed','t_fam','family', 'object_id', 'x', 'y', 'player_id']]
discoveries = discoveries.sort_values('t_fam')

# Find the first time an object appears in family's repertoire
discoveries = discoveries.groupby(['seed', 'family', 'object_id']).first()
discoveries = discoveries.reset_index()
discoveries = pd.merge(discoveries, depth_df)
discoveries = discoveries.sort_values(['family', 't_fam'])

discoveries.head()

# # Return a count of # discoveries for each family member
# n_discoveries = discoveries.groupby(['family','player_id'])['object_id']
# n_discoveries = n_discoveries.agg('count').reset_index()
# n_discoveries = n_discoveries.rename(columns={'object_id': 'n'})

# # Fill in missing family members (no discoveries)
# log_fams = np.unique(discoveries['family'])
# all_fam = fam_df.copy()
# all_fam = all_fam[['family', 'player_id']]
# all_fam = all_fam[all_fam['family'].isin(log_fams)].reset_index(drop=True)

# n_discoveries_full = pd.merge(all_fam, n_discoveries, how='outer')
# n_discoveries_full['n'] = n_discoveries_full['n'].fillna(0).astype(int)
# n_discoveries_full.head()

# return discoveries, n_discoveries_full

Unnamed: 0,seed,family,object_id,t_fam,x,y,player_id,depth
1732,1573895673,time-1573895755_eve-2276905_name-ZABICKI,236,44.21,-5138.0,-1395.0,2276905.0,52
978,1573895673,time-1573895755_eve-2276905_name-ZABICKI,134,46.09,-5135.0,-1396.0,2276905.0,9
5316,1573895673,time-1573895755_eve-2276905_name-ZABICKI,2873,89.11,-5125.0,-1393.0,2276905.0,214
5165,1573895673,time-1573895755_eve-2276905_name-ZABICKI,2742,92.11,-5123.0,-1392.0,2276905.0,116
5314,1573895673,time-1573895755_eve-2276905_name-ZABICKI,2861,94.54,-5122.0,-1392.0,2276905.0,260


Main loop:

In [74]:
discovery_list = []
n_discovery_list = []

s = seed_changes[0]
# for s in seed_changes:

# Identify all logs with the same world seed
seed_logs = file_df[file_df['seed'] == s].copy()
seed_fs = seed_logs['file'].values

# Add all logs associated with the same world seed to dataframe
seed_list = [process_maplog(f) for f in seed_fs]
seed_df = pd.concat(seed_list).reset_index(drop=True)
seed_df['seed'] = s

# Identify discoveries by family
# = first time a family member interacts with an object, by world seed
seed_disc, seed_n_disc = id_discoveries(seed_df)
seed_n_disc['seed'] = s
seed_n_disc.head()

Unnamed: 0,family,player_id,n,seed
0,time-1574097243_eve-2286236_name-WALUS,2286247,0,1573895673
1,time-1574097243_eve-2286236_name-WALUS,2286245,0,1573895673
2,time-1574097243_eve-2286236_name-WALUS,2286242,27,1573895673
3,time-1574097243_eve-2286236_name-WALUS,2286238,4,1573895673
4,time-1574097243_eve-2286236_name-WALUS,2286344,0,1573895673


Unnamed: 0,family,player_id,n
0,time-1574097243_eve-2286236_name-WALUS,2286247,0
1,time-1574097243_eve-2286236_name-WALUS,2286245,0
2,time-1574097243_eve-2286236_name-WALUS,2286242,27
3,time-1574097243_eve-2286236_name-WALUS,2286238,4
4,time-1574097243_eve-2286236_name-WALUS,2286344,0


In [69]:
seed_n_disc.shape

(3186, 3)