# Find missing avatars
Natalia Vélez, April 2021

In [1]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import notebook

import sys
sys.path.append('..')
from utils import gsearch

## Source 1: Original data

Find original data files:

In [2]:
# Root folder holding the raw lifelog dumps for this server
data_dir = '../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/'

# '*y.txt' keeps only files whose names end in "y.txt"; the daily logs listed
# below all end in a weekday name.
# NOTE(review): presumably this is meant to skip other .txt files that live in
# the same folder -- confirm against the directory contents.
data_files = gsearch(data_dir, '*y.txt')
data_files.sort()  # lexical order; file names start with the date

n_data_files = len(data_files)
print('Found %i lifelog files' % n_data_files)
print(*data_files[:10], sep='\n')

Found 799 lifelog files
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_29_Tuesday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_30_Wednesday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_31_Thursday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_01_Friday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_02_Saturday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_03_Sunday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_04_Monday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_05_Tuesday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_06_Wednesday.txt
../../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_07_Thursday

Get list of avatar IDs from original data:

In [3]:
# Collect the avatar ID of every birth event found in the raw lifelog files.
original_avatars = []

for f in notebook.tqdm(data_files):
    # Read file
    with open(f, 'r') as handle:
        f_data = handle.read().splitlines()

    # Keep birth events (rows starting with 'B').
    # startswith() is used instead of d[0] so that an empty line in a log file
    # is skipped rather than raising IndexError.
    birth_rows = [d.split(' ') for d in f_data if d.startswith('B')]

    # Add avatar IDs to list: rows are space-delimited and the ID is the third field
    original_avatars.extend(int(row[2]) for row in birth_rows)

original_avatars.sort()

print('Found %i avatars in original lifelogs' % len(original_avatars))
print(*original_avatars[:10], sep='\n')
print('...')

  0%|          | 0/799 [00:00<?, ?it/s]

Found 3834737 avatars in original lifelogs
2
3
4
5
6
7
8
9
10
11
...


## Source 2: Activity matrix

Find label files:

In [4]:
# Folder containing the activity-matrix inputs; each "*labels.txt" file is
# read further down as a list of avatar IDs, one per line.
label_dir = '../3_technology/outputs/activity_in/'

label_files = gsearch(label_dir, '*labels.txt')
label_files.sort()  # lexical order of the file paths

n_label_files = len(label_files)
print('Found %i label files' % n_label_files)
print(*label_files[:10], sep='\n')
print('...')

Found 269 label files
../3_technology/outputs/activity_in/activity_in_release-284_start-1573895672_labels.txt
../3_technology/outputs/activity_in/activity_in_release-284_start-1573982073_labels.txt
../3_technology/outputs/activity_in/activity_in_release-284_start-1574068473_labels.txt
../3_technology/outputs/activity_in/activity_in_release-285_start-1574102503_labels.txt
../3_technology/outputs/activity_in/activity_in_release-287_start-1574151678_labels.txt
../3_technology/outputs/activity_in/activity_in_release-287_start-1574238079_labels.txt
../3_technology/outputs/activity_in/activity_in_release-287_start-1574324479_labels.txt
../3_technology/outputs/activity_in/activity_in_release-287_start-1574410879_labels.txt
../3_technology/outputs/activity_in/activity_in_release-287_start-1574497279_labels.txt
../3_technology/outputs/activity_in/activity_in_release-289_start-1574552311_labels.txt
...


Get list of avatar IDs from label files:

In [5]:
# Gather every avatar ID listed in the label files (one integer per line).
all_label_ids = []

for label_path in notebook.tqdm(label_files):
    with open(label_path, 'r') as handle:
        rows = handle.read().splitlines()
    all_label_ids.extend(int(row) for row in rows)

# np.unique drops duplicates and returns the IDs as a sorted array
label_avatars = np.unique(all_label_ids)

print('Found %i avatars in activity matrix labels' % len(label_avatars))
print(*label_avatars[:10], sep='\n')
print('...')

  0%|          | 0/269 [00:00<?, ?it/s]

Found 763682 avatars in activity matrix labels
2
24
2276905
2276906
2276907
2276908
2276909
2276910
2276911
2276912
...


Are there any avatars in the activity matrix who are missing from the lifelogs?

In [6]:
# Avatar IDs that appear in the activity-matrix labels but have no birth event
# in the original lifelogs.
missing_labels = np.setdiff1d(label_avatars, original_avatars)
print('%i missing avatars' % len(missing_labels))

# Show the ID range covered by the missing avatars. min()/max() raise
# ValueError on an empty array, so the range is only printed when at least one
# avatar is missing (keeps the notebook runnable once nothing is missing).
if len(missing_labels) > 0:
    print(missing_labels.min())
    print(missing_labels.max())

57958 missing avatars
2276981
4107654


In [7]:
# Percentage of labelled avatars that have no birth event in the lifelogs
n_missing = len(missing_labels)
n_labelled = len(label_avatars)
n_missing / n_labelled * 100

7.589284545137898

## Source 3: Wrangled lifelogs

Load lifelogs after wrangling:

In [13]:
# Wrangled lifelog events written by the demographics step
wrangled_csv = '../2_demographics/outputs/lifelogs_bigserver2_data.csv'
wrangled_lifelogs = pd.read_csv(wrangled_csv)

print(wrangled_lifelogs.shape)
wrangled_lifelogs.head()

(7659514, 13)


Unnamed: 0,server,release,era,event,timestamp,avatar,player,age,sex,location,parent,cause_of_death,killer
0,bigserver2,194.0,arc,B,1548804597,2,a51edcb77a3900d53adc61d394876c5ca7417486,,F,"(50, -7)",noParent,,
1,bigserver2,194.0,arc,D,1548807206,2,a51edcb77a3900d53adc61d394876c5ca7417486,57.49,F,"(64, -8)",,hunger,
2,bigserver2,194.0,arc,B,1548814222,3,ba474919bfbe67b14ec6e6fd05c19f383152b1b2,,F,"(1453, -436)",noParent,,
3,bigserver2,194.0,arc,B,1548814226,4,0c1781b6944db9e58b5d71adfe64af09fedad796,,M,"(1453, -436)",3,,
4,bigserver2,194.0,arc,B,1548814227,5,77a67da50b453a570d1e3311a56cd9da89187cd2,,F,"(1366, -395)",noParent,,


List of avatars from birth and death events:

In [9]:
# Split the event log by event type ('B' = birth, 'D' = death) and keep only
# the avatar ID column as a NumPy array.
is_birth = wrangled_lifelogs['event'] == 'B'
is_death = wrangled_lifelogs['event'] == 'D'

births = wrangled_lifelogs.loc[is_birth, 'avatar'].values
deaths = wrangled_lifelogs.loc[is_death, 'avatar'].values

print('Found %i births, %i deaths' % (len(births), len(deaths)))

Found 3834737 births, 3824777 deaths


In [10]:
# Avatars with a birth event but no matching death event
births_without_death = np.setdiff1d(births, deaths)
births_without_death

array([    221,    2850,    2851, ..., 4109235, 4109236, 4109237])

In [11]:
# Avatars with a death event but no matching birth event
deaths_without_birth = np.setdiff1d(deaths, births)
deaths_without_birth

array([], dtype=int64)

## Source 4: Compact lifelogs

In [14]:
# Compact lifelogs, read from this notebook's outputs folder
compact_csv = 'outputs/all_lifelogs_compact.csv'
compact_lifelogs = pd.read_csv(compact_csv)

print(compact_lifelogs.shape)
compact_lifelogs.head()

(3834737, 16)


Unnamed: 0,player,avatar,tBirth,parent,birth,sex,tDeath,death,age,cause_of_death,birthX,birthY,deathX,deathY,first,last
0,79d28bc07c1f45c6602d5d84f506e36a795751dc,4109237,1617695981,4109188,[-16083 279],F,,[],,,-16083,279,,,,
1,3da17539b16d6fe6911aefc57388bae9d1303e22,4109236,1617695903,4109184,[-15482 374],F,,[],,,-15482,374,,,,
2,b14977d2ebf15c9c0d643378aacedb42e8a43757,4109235,1617695831,4109188,[-16095 276],F,,[],,,-16095,276,,,,
3,2da3db708a04acdf8f3e52486e4cfbd820cfb766,4109234,1617695804,4109214,[-16807 79],M,1617696000.0,[-16807 79],0.03,disconnect,-16807,79,-16807.0,79.0,,
4,3da17539b16d6fe6911aefc57388bae9d1303e22,4109233,1617695734,4109211,[-16219 -46],M,1617696000.0,[-16354 -121],2.61,hunger,-16219,-46,-16354.0,-121.0,,


Are any avatars in the wrangled lifelogs, but not the compact lifelogs?

In [16]:
# Avatars born in the wrangled lifelogs that have no row in the compact lifelogs
compact_avatars = compact_lifelogs['avatar'].values
np.setdiff1d(births, compact_avatars)

array([], dtype=int64)

## Source 5: Families

In [17]:
# Avatar-to-family assignments, read from this notebook's outputs folder
families_csv = 'outputs/family_playerID.csv'
families = pd.read_csv(families_csv)

print(families.shape)
families.head()

(3834737, 2)


Unnamed: 0,avatar,family
0,4109234,time-1617694919_eve-4109214_name-(missing)
1,4109230,time-1617694919_eve-4109214_name-(missing)
2,4109225,time-1617694919_eve-4109214_name-(missing)
3,4109222,time-1617694919_eve-4109214_name-(missing)
4,4109217,time-1617694919_eve-4109214_name-(missing)


Was anyone not assigned a family?

In [26]:
# Row positions whose `family` value is missing (NaN/None).
# pd.isnull is an alias of pd.isna, so a single check is sufficient; an empty
# index array means every row in `families` has a family label.
print(np.where(pd.isna(families.family)))

(array([], dtype=int64),)
(array([], dtype=int64),)
