# Find missing avatars
Natalia Vélez, April 2021

In [2]:
import pymongo
import pandas as pd
import numpy as np

## Which avatars are in the database?

Connect to database:

In [4]:
# Connect to database
# The key file supplies the username on its first line and the password on
# its second line (see how `creds` is indexed below).
keyfile = '../6_database/credentials.key'
with open(keyfile, "r") as key_handle:
    # Read inside a context manager so the credentials file is closed
    # immediately instead of staying open for the life of the kernel.
    creds = key_handle.read().splitlines()
myclient = pymongo.MongoClient('134.76.24.75', username=creds[0], password=creds[1], authSource='ohol')
print(myclient)
# Handle to the `ohol` database, followed by a listing of its collections
ohol = myclient.ohol
print(ohol.list_collection_names())

MongoClient(host=['134.76.24.75:27017'], document_class=dict, tz_aware=False, connect=True, authsource='ohol')
['tfidf_matrix.files', 'maplogs', 'item_embeddings', 'tech_tree', 'lifelogs', 'objects', 'expanded_transitions', 'transitions', 'tfidf_matrix.chunks', 'avatar_embeddings', 'activity_matrix.files', 'activity_matrix.chunks', 'cleaned_job_matrix.chunks', 'cleaned_job_matrix.files', 'activity_labels', 'categories']


Get list of avatars from lifelogs:

In [23]:
# Query every lifelog document, projecting only the `avatar` field, and
# flatten the returned documents into a plain list of their values.
avatar_only = {'avatar':1, '_id': 0}
life_avatars = [
    value
    for doc in ohol.lifelogs.find({}, avatar_only)
    for value in doc.values()
]

# Report how many avatars were found and the range of their IDs
n_life_avatars = len(life_avatars)
print('Found %i avatars in lifelogs' % n_life_avatars)
print('%i--%i' % (min(life_avatars), max(life_avatars)))

Found 1830190 avatars in lifelogs
3--4108849


Get list of avatars from activity matrix:

In [24]:
# Query the `activity_labels` collection, projecting only the `avatars`
# field; the avatar list is taken from the first document returned.
activity_docs = list(ohol.activity_labels.find({}, {'avatars': 1, '_id': 0}))
first_doc = activity_docs[0]
activity_avatars = first_doc['avatars']

# Report how many avatars were found and the range of their IDs
print('Found %i avatars in activity matrix' % len(activity_avatars))
id_range = (min(activity_avatars), max(activity_avatars))
print('%i--%i' % id_range)

Found 763682 avatars in activity matrix
2--4107655


Find avatars that are in the activity matrix but missing from the lifelogs:

In [25]:
# Avatar IDs that appear in the activity matrix but not in the lifelogs,
# in ascending order.
missing_from_lifelogs = np.sort(np.setdiff1d(activity_avatars, life_avatars))
n_missing = len(missing_from_lifelogs)

# Preview the ten smallest and ten largest missing IDs, one per line
first_ten = missing_from_lifelogs[:10]
last_ten = missing_from_lifelogs[-10:]

print('Found %i avatars who are in the activity matrix, but not in the lifelogs' % n_missing)
print('\n'.join(str(avatar) for avatar in first_ten))
print('...')
print('\n'.join(str(avatar) for avatar in last_ten))

Found 392869 avatars who are in the activity matrix, but not in the lifelogs
2
24
2276907
2276908
2276910
2276915
2276916
2276924
2276933
2276956
...
4107635
4107636
4107637
4107638
4107640
4107642
4107645
4107647
4107654
4107655


## Which avatars were in the original data?

Load original lifelogs:

In [42]:
# Read the original lifelogs table and work out which of the avatars missing
# from the database lifelogs are absent from this file as well.
lifelogs_path = 'outputs/all_lifelogs_compact.tsv'
original_lifelogs = pd.read_csv(lifelogs_path, sep='\t', index_col=0)
original_avatars = original_lifelogs['avatar'].values
missing_in_original = np.setdiff1d(missing_from_lifelogs, original_avatars)

# Note: the first count is one entry per row of the table (the column is
# not de-duplicated here).
n_original = len(original_avatars)
n_missing_in_original = len(missing_in_original)
print('%i avatars in original lifelogs' % n_original)
print('%i missing avatars were also missing in original lifelogs' % n_missing_in_original)

  mask |= (ar1 == a)


2226610 avatars in original lifelogs
247520 missing avatars were also missing in original lifelogs


In [43]:
# Display the avatar IDs that are missing from the database lifelogs and are
# also absent from the original lifelogs file.
missing_in_original

array([      2,      24, 2276907, ..., 4107647, 4107654, 4107655])

In [None]:
# Number of avatars that are in the activity matrix but not in the lifelogs.
# (The cell previously referenced the undefined name `missing_from`, which
# raises NameError; `missing_from_lifelogs` is the only variable in this
# notebook with that prefix.)
len(missing_from_lifelogs)

Load original families:

In [33]:
# Read the family assignments and check how many of the missing avatars
# were assigned to a family.
# NOTE(review): `missing_after_upload` is not assigned in any cell visible in
# this notebook, so this cell fails on a fresh kernel -- confirm how it was
# computed and add that cell above.
families_path = 'outputs/family_playerID.tsv'
original_families = pd.read_csv(families_path, sep='\t', index_col = 0)
family_avatars = original_families['avatar'].values
missing_with_family = np.intersect1d(family_avatars, missing_after_upload)

n_family_avatars = len(family_avatars)
n_missing_with_family = len(missing_with_family)
print('%i avatars assigned to families' % n_family_avatars)
print('...including %i of the missing avatars' % n_missing_with_family)

1830190 avatars assigned to families
...including 0 of the missing avatars


  mask |= (ar1 == a)


In [35]:
# Display the last ten entries of `missing_after_upload`.
# NOTE(review): `missing_after_upload` is not assigned in any cell visible in
# this notebook -- confirm where it comes from before relying on this output.
missing_after_upload[-10:]

array([4108806, 4108810, 4108813, 4108815, 4108830, 4108831, 4108833,
       4108837, 4108842, 4108848])

In [44]:
# Look up a single avatar in the original lifelogs. 4107655 is the largest
# avatar ID reported for the activity matrix and the last ID printed among
# the avatars missing from the lifelogs.
is_target_avatar = original_lifelogs['avatar'] == 4107655
original_lifelogs[is_target_avatar]

Unnamed: 0,player,avatar,tBirth,parent,birth,tDeath,death,age,cause_of_death,birthX,birthY,deathX,deathY,first,last
