# Wrangling lifelog data (July 2019–May 2020)
Natalia Vélez, last updated June 2020

In this notebook:

* Load, clean up lifelog data from the entire history of the game (updated!)
* Split data by version
* Split data by era (arc, rift, boundless world)
* Prepare inputs for subsequent analyses (census, family trees, migration patterns)

In subsequent analyses, we will focus on data from the Boundless World era (November 2019–present)

In [1]:
%matplotlib inline

import os, re, glob, random, datetime
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join as opj
from tqdm import notebook
from ast import literal_eval as make_tuple

sns.set_context('paper')

Helper functions:

In [2]:
gsearch = lambda *args: glob.glob(opj(*args))
str_extract = lambda pattern, s: re.search(pattern, s).group(0)

## Parse version history

In future analyses, we'll want to split data by release; different releases of the game will have different items, mechanics, etc. To do that, we'll parse the version history here and get the start and end dates for each release.

In [3]:
ver_file = '../1_download/outputs/version_history.tsv'

# Load file
ver = pd.read_csv(ver_file, sep='\t')
ver.head()

Unnamed: 0,release,timestamp
0,0,1490908000.0
1,16,1492207000.0
2,17,1492226000.0
3,19,1492472000.0
4,20,1495232000.0


`find_version`: Helper function. Takes a filename as input, finds the corresponding release.

In [4]:
def find_version(file):
    file_date = date_extract(file)

    tmp_ver = ver.copy()
    tmp_ver['lag'] = file_date - tmp_ver['timestamp']
    tmp_ver = tmp_ver[tmp_ver['lag'] >= 0]

    file_ver = tmp_ver.loc[tmp_ver['lag'].idxmin()].release
    
    return file_ver

`find_era`: Find era associated with release

In [5]:
def find_era(release):
    if release < 252:
        r = 'arc'
    elif (d >= 252) & (d < 281):
        r = 'rift'
    elif d >= 281:
        r = 'boundless'
        
    return r

## Clean up data

We first need to filter files by date, to pick out files within the range we're interested in. (This might be a roundabout way of doing it—suggestions welcome.)

`date_extract`: Helper function. Takes a basename as a string (e.g., '2019_03March_23_Saturday.txt') and returns an integer representation of the date (e.g., 20190323).

In [6]:
def date_extract(s):
    
    date_regex = '([0-9]{4})_([0-9]{2})[A-Za-z]+_([0-9]{2})'
    date_search = re.findall(date_regex, s)
    date_str = ''.join(date_search[0])
    date_dt = datetime.datetime.strptime(date_str, '%Y%m%d')
    date_tstamp = date_dt.timestamp()
    
    return date_tstamp

List all files:

In [7]:
data_dir = '../data'
all_files = gsearch(data_dir, 'publicLifeLogData', 'lifeLog*', '2*y.txt')
all_files.sort()
print(*all_files[:10], sep='\n')

../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_25_Friday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_26_Saturday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_27_Sunday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_28_Monday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_29_Tuesday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_30_Wednesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_29_Tuesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_30_Wednesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_31_Thursday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_01_Friday.txt


Extract dates:

In [8]:
all_dates = [date_extract(f) for f in all_files]
print(*all_dates[:10], sep='\n')

1548403200.0
1548489600.0
1548576000.0
1548662400.0
1548748800.0
1548835200.0
1548748800.0
1548835200.0
1548921600.0
1549008000.0


(outdated) Filter files within range:

In [None]:
# start = datetime.datetime(2019, 7, 30, 0, 0).timestamp()
# end = datetime.datetime(2020, 5, 29, 0, 0).timestamp()

# # Check if date is within range
# between_dates = lambda d: (d >= start) & (d <= end)

# file_df = pd.DataFrame({'file': all_files,
#                         'date': all_dates})
# file_df['include'] = file_df.date.apply(between_dates)
# file_df = file_df[file_df.include]

# file_df.head()

In [9]:
# data_files = file_df['file'].values
data_files = all_files
print('%i files found' % len(data_files))
print(*[os.path.basename(f) for f in data_files[:20]], sep='\n')

11848 files found
2019_01January_25_Friday.txt
2019_01January_26_Saturday.txt
2019_01January_27_Sunday.txt
2019_01January_28_Monday.txt
2019_01January_29_Tuesday.txt
2019_01January_30_Wednesday.txt
2019_01January_29_Tuesday.txt
2019_01January_30_Wednesday.txt
2019_01January_31_Thursday.txt
2019_02February_01_Friday.txt
2019_02February_02_Saturday.txt
2019_02February_03_Sunday.txt
2019_02February_04_Monday.txt
2019_02February_05_Tuesday.txt
2019_02February_06_Wednesday.txt
2019_02February_07_Thursday.txt
2019_02February_08_Friday.txt
2019_02February_09_Saturday.txt
2019_02February_10_Sunday.txt
2019_02February_11_Monday.txt


Load all files:

In [12]:
data_list = []
empty_files = []
for f in notebook.tqdm(data_files):
    try:
        tmp_server = str_extract('(?<=lifeLog_)[a-zA-Z0-9]+', f)
        tmp_ver = find_version(f)
        tmp_era = find_era(tmp_ver)
        
        tmp_d = pd.read_csv(f, sep =' ', header=None)
        tmp_d.insert(0, 'server', tmp_server)
        tmp_d.insert(1, 'release', tmp_ver)
        tmp_d.insert(2, 'era', tmp_era)
        data_list.append(tmp_d)
    except:
        empty_files.append(f)

HBox(children=(FloatProgress(value=0.0, max=11848.0), HTML(value='')))




In [13]:
raw_data = pd.concat(data_list)
raw_data.head()

Unnamed: 0,server,release,era,0,1,2,3,4,5,6,7,8
0,bigserver1,194.0,arc,B,1548458068,2.0,989df9f12fbd84ae3abdd1c50cdcf6245fd79a79,F,"(50,-7)",noParent,pop=1,chain=1
1,bigserver1,194.0,arc,B,1548458071,3.0,750a099096bc2c7244bd0a48ab92585a3c414644,M,"(50,-7)",parent=2,pop=2,chain=2
2,bigserver1,194.0,arc,B,1548458074,4.0,cf5135550e971c587e4fa57eadef593d031971f3,F,"(-43,26)",noParent,pop=3,chain=1
3,bigserver1,194.0,arc,B,1548458075,5.0,5f79fe6c7b7f7f8be2900bacb7f64fa1036197de,F,"(-43,26)",parent=4,pop=4,chain=2
4,bigserver1,194.0,arc,B,1548458076,6.0,1ba18eb436c2cc1d0cf1a2a5c850abf4434cc587,F,"(-14,-48)",noParent,pop=5,chain=1


Deaths:

In [14]:
death_raw = raw_data[raw_data.iloc[:,3] == 'D'].copy().reset_index(drop=True)
death_raw.head()

Unnamed: 0,server,release,era,0,1,2,3,4,5,6,7,8
0,bigserver1,194.0,arc,D,1548458078,4.0,cf5135550e971c587e4fa57eadef593d031971f3,age=14.08,F,"(-43,26)",disconnect,pop=4
1,bigserver1,194.0,arc,D,1548458090,10.0,cf5135550e971c587e4fa57eadef593d031971f3,age=0.12,M,"(7,51)",disconnect,pop=10
2,bigserver1,194.0,arc,D,1548458097,14.0,cf5135550e971c587e4fa57eadef593d031971f3,age=0.05,M,"(50,-7)",disconnect,pop=10
3,bigserver1,194.0,arc,D,1548458098,13.0,16de33d89e8a16656e1404a4d56b4b0a579e1583,age=14.14,F,"(50,-7)",disconnect,pop=10
4,bigserver1,194.0,arc,D,1548458102,5.0,5f79fe6c7b7f7f8be2900bacb7f64fa1036197de,age=0.45,F,"(-59,-11)",hunger,pop=9


Births:

In [15]:
birth_raw = raw_data[raw_data.iloc[:,3] == 'B'].copy().reset_index(drop=True)
birth_raw.head()

Unnamed: 0,server,release,era,0,1,2,3,4,5,6,7,8
0,bigserver1,194.0,arc,B,1548458068,2.0,989df9f12fbd84ae3abdd1c50cdcf6245fd79a79,F,"(50,-7)",noParent,pop=1,chain=1
1,bigserver1,194.0,arc,B,1548458071,3.0,750a099096bc2c7244bd0a48ab92585a3c414644,M,"(50,-7)",parent=2,pop=2,chain=2
2,bigserver1,194.0,arc,B,1548458074,4.0,cf5135550e971c587e4fa57eadef593d031971f3,F,"(-43,26)",noParent,pop=3,chain=1
3,bigserver1,194.0,arc,B,1548458075,5.0,5f79fe6c7b7f7f8be2900bacb7f64fa1036197de,F,"(-43,26)",parent=4,pop=4,chain=2
4,bigserver1,194.0,arc,B,1548458076,6.0,1ba18eb436c2cc1d0cf1a2a5c850abf4434cc587,F,"(-14,-48)",noParent,pop=5,chain=1


### Clean up data

In [23]:
shared_header = ['server', 'release', 'era',
                 'event', 'timestamp', 'playerID',
                 'hash', 'age', 'sex', 'parent', 'location',
                 'cause_of_death', 'pop', 'chain', 'killer']

#### Deaths

In [24]:
death_data = death_raw.copy()
# Insert missing columns
death_data.insert(9, 'parent', np.nan)
death_data.insert(13, 'chain', np.nan)
death_data.insert(14, 'killer', np.nan)

death_data.columns = shared_header                

death_data.head()

Unnamed: 0,server,release,era,event,timestamp,playerID,hash,age,sex,parent,location,cause_of_death,pop,chain,killer
0,bigserver1,194.0,arc,D,1548458078,4.0,cf5135550e971c587e4fa57eadef593d031971f3,age=14.08,F,,"(-43,26)",disconnect,pop=4,,
1,bigserver1,194.0,arc,D,1548458090,10.0,cf5135550e971c587e4fa57eadef593d031971f3,age=0.12,M,,"(7,51)",disconnect,pop=10,,
2,bigserver1,194.0,arc,D,1548458097,14.0,cf5135550e971c587e4fa57eadef593d031971f3,age=0.05,M,,"(50,-7)",disconnect,pop=10,,
3,bigserver1,194.0,arc,D,1548458098,13.0,16de33d89e8a16656e1404a4d56b4b0a579e1583,age=14.14,F,,"(50,-7)",disconnect,pop=10,,
4,bigserver1,194.0,arc,D,1548458102,5.0,5f79fe6c7b7f7f8be2900bacb7f64fa1036197de,age=0.45,F,,"(-59,-11)",hunger,pop=9,,


In [22]:
row

server            bigserver2
release                  209
era                      arc
event                      D
timestamp         1551928834
pllayerID                NaN
hash                     NaN
age                      NaN
sex                      NaN
parent                   NaN
location                 NaN
cause_of_death           NaN
pop                      NaN
chain                    NaN
killer                   NaN
Name: 540906, dtype: object

In [20]:
for i, row in notebook.tqdm(death_data.iterrows(), total=death_data.shape[0]):
    age = str_extract('(?<=age=)\d+\.\d+', row['age'])
    age = float(age)
    
    pop = str_extract('(?<=pop=)[0-9]+', row['pop'])
    pop = int(pop)
    
    death_data.at[i, 'age'] = age
    death_data.at[i, 'pop'] = pop

HBox(children=(FloatProgress(value=0.0, max=6296028.0), HTML(value='')))

TypeError: expected string or bytes-like object

Clean up IDs,  locations, causes of death

In [None]:
# Unique ID: Player ID + hash
death_data['uniqueID'] = death_data.apply(lambda row: '%s_%s_%s' % (row['playerID'], row['server'], row['release']),
                                          axis=1)

# Locations
death_data['location'] = death_data['location'].apply(make_tuple)

# Check for murdered players
murderers = death_data['cause_of_death'].str.extract(r'(?<=killer_)([0-9]+)', expand=False)
death_data['cause_of_death'] = death_data['cause_of_death'].str.replace("killer_[0-9]+", "murdered")
death_data['killer'] = murderers

print(death_data['cause_of_death'].unique())
print(death_data['killer'].unique()[:10])

death_data.head()

#### Births

In [None]:
birth_data = birth_raw.copy()
# Insert missing columns
birth_data.insert(6, 'uniqueID', np.nan)
birth_data.insert(7, 'age', np.nan)
birth_data.insert(11, 'cause_of_death', np.nan)
birth_data.insert(12, 'killer', np.nan)
birth_data.columns = shared_header

for i, row in tqdm_notebook(birth_data.iterrows(), total=birth_data.shape[0]):    
    pop = str_extract('(?<=pop=)[0-9]+', row['pop'])
    pop = int(pop)
    
    chain = str_extract('(?<=chain=)[0-9]+', row['chain'])
    chain = int(chain)
    
    birth_data.at[i, 'pop'] = pop
    birth_data.at[i, 'chain'] = chain

birth_data.head()

Clean up IDs, locations, parents:

In [None]:
birth_data['uniqueID'] = birth_data.apply(lambda row: '%s_%s_%s' % (row['playerID'], row['server'], row['release']), 
                                          axis=1)
birth_data['location'] = birth_data['location'].apply(make_tuple)
birth_data['parent'] = birth_data['parent'].str.extract(r'(?<=parent=)([0-9]+)')

birth_data.head()

#### Save outputs

In [None]:
lifelog_data = pd.concat([death_data, birth_data])
lifelog_data = lifelog_data.sort_values(by=['server', 'timestamp'])
lifelog_data = lifelog_data.reset_index(drop=True)

lifelog_data.head()

In [None]:
lifelog_data.to_csv('outputs/lifelogs_201907-202005_data.tsv', sep='\t', index=True)