# Wrangling lifelog data (January 2019–June 2020)
Natalia Vélez, last updated June 2020

In this notebook:

* Load and clean up lifelog data from the entire history of the game (updated!)
* Split data by version
* Split data by era (arc, rift, boundless world)
* Prepare inputs for subsequent analyses (census, family trees, migration patterns)

In subsequent analyses, we will focus on data from the Boundless World era (November 2019–present).

In [1]:
%matplotlib inline

import os, re, glob, random, datetime
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join as opj
from tqdm import notebook
from ast import literal_eval as make_tuple

# Set seaborn's plotting context to 'paper'
sns.set_context('paper')

Helper functions:

In [2]:
def gsearch(*args):
    """Join the given path components and return the list of glob matches."""
    return glob.glob(opj(*args))


def str_extract(pattern, s):
    """Return the text of the first match of `pattern` in `s`.

    `re.search` returns None when nothing matches, so a miss surfaces as an
    AttributeError from `.group(0)`; a non-string `s` raises TypeError.
    """
    return re.search(pattern, s).group(0)

## Parse version history

In future analyses, we'll want to split data by release; different releases of the game will have different items, mechanics, etc. To do that, we'll parse the version history here and get the start and end dates for each release.

In [3]:
# Tab-separated table of releases and their timestamps
ver_file = '../1_download/outputs/version_history.tsv'

# Load file
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index that was saved along with the data — confirm against the download step.
ver = pd.read_csv(ver_file, sep='\t')
ver.head()

Unnamed: 0,release,timestamp
0,1,1483052000.0
1,5,1483472000.0
2,8,1484065000.0
3,14,1484961000.0
4,16,1492207000.0


`find_version`: Helper function. Takes a filename as input and returns the most recent release whose timestamp falls on or before the date in the filename.

In [4]:
def find_version(file):
    """Return the release that was current on the date encoded in `file`.

    The date is parsed from the filename with `date_extract` and compared with
    the `timestamp` column of the module-level `ver` table. Among the releases
    whose timestamp is not later than the file's date, the one with the
    smallest gap is returned.
    """
    file_date = date_extract(file)

    # Gap between the file's date and every release, on a copy of the table
    candidates = ver.assign(lag=file_date - ver['timestamp'])

    # Releases that came out after the file's date cannot apply to it
    released = candidates[candidates['lag'] >= 0]

    closest = released['lag'].idxmin()
    return released.loc[closest].release

`find_era`: Find era associated with release

In [5]:
def find_era(release, rift_start=252, boundless_start=280):
    """Map a release number to the name of the era it belongs to.

    Parameters
    ----------
    release : int or float
        Release number to classify.
    rift_start : int, optional
        First release counted as 'rift' (default 252).
    boundless_start : int, optional
        First release counted as 'boundless' (default 280).

    Returns
    -------
    str
        'arc' for releases below `rift_start`, 'rift' for releases from
        `rift_start` up to (not including) `boundless_start`, and 'boundless'
        from `boundless_start` onwards.

    Raises
    ------
    ValueError
        If `release` cannot be ordered against the thresholds (e.g. NaN, for
        which every comparison is False).
    """
    if release < rift_start:
        return 'arc'
    if release < boundless_start:
        return 'rift'
    if release >= boundless_start:
        return 'boundless'

    # Only reachable when none of the comparisons hold, e.g. for NaN
    raise ValueError('Cannot assign an era to release %r' % (release,))

## Clean up data

We first need to filter files by date, to pick out files within the range we're interested in. (This might be a roundabout way of doing it—suggestions welcome.)

`date_extract`: Helper function. Takes a filename or path containing a date (e.g., '2019_03March_23_Saturday.txt') and returns that date as a Unix timestamp, i.e., a float number of seconds (see the output of the "Extract dates" cell below for examples).

In [6]:
def date_extract(s):
    """Convert the date embedded in a lifelog filename to a timestamp.

    `s` must contain a date written like '2019_03March_23' (four-digit year,
    two-digit month followed by the month name, two-digit day). Only the first
    such date in `s` is used; an IndexError is raised when there is none.

    The parsed datetime is naive, so `timestamp()` interprets it in the local
    time zone of the machine running the notebook.
    """
    year, month, day = re.findall('([0-9]{4})_([0-9]{2})[A-Za-z]+_([0-9]{2})', s)[0]
    parsed = datetime.datetime.strptime(year + month + day, '%Y%m%d')

    return parsed.timestamp()

List all files:

In [7]:
data_dir = '../data'

# Collect the bigserver2 log files whose names start with '2' and end in 'y.txt',
# in sorted order
all_files = sorted(gsearch(data_dir, 'publicLifeLogData', 'lifeLog_bigserver2*', '2*y.txt'))

print(len(all_files))
print('\n'.join(all_files[:10]))

505
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_29_Tuesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_30_Wednesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_31_Thursday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_01_Friday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_02_Saturday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_03_Sunday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_04_Monday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_05_Tuesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_06_Wednesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_07_Thursday.txt


Extract dates:

In [8]:
# One timestamp per file, in the same order as `all_files`
all_dates = list(map(date_extract, all_files))
print('\n'.join(str(stamp) for stamp in all_dates[:10]))

1548748800.0
1548835200.0
1548921600.0
1549008000.0
1549094400.0
1549180800.0
1549267200.0
1549353600.0
1549440000.0
1549526400.0


(outdated) Filter files within range. The date filter is no longer applied, so every file listed above is used:

In [11]:
# The earlier selection (data_files = file_df['file'].values) is disabled;
# every file found above is used as-is.
data_files = all_files

print('%i files found' % len(data_files))
print('\n'.join(map(os.path.basename, data_files[-20:])))

505 files found
2020_05May_28_Thursday.txt
2020_05May_29_Friday.txt
2020_05May_30_Saturday.txt
2020_05May_31_Sunday.txt
2020_06June_01_Monday.txt
2020_06June_02_Tuesday.txt
2020_06June_03_Wednesday.txt
2020_06June_04_Thursday.txt
2020_06June_05_Friday.txt
2020_06June_06_Saturday.txt
2020_06June_07_Sunday.txt
2020_06June_08_Monday.txt
2020_06June_09_Tuesday.txt
2020_06June_10_Wednesday.txt
2020_06June_11_Thursday.txt
2020_06June_12_Friday.txt
2020_06June_13_Saturday.txt
2020_06June_14_Sunday.txt
2020_06June_15_Monday.txt
2020_06June_16_Tuesday.txt


Load all files:

In [20]:
# NOTE(review): leftover debugging cell. `tmp_name_file` is not defined anywhere
# in the visible notebook, so this fails on a fresh kernel. The recorded output
# shows pandas raising EmptyDataError for a file with no columns to parse.
pd.read_csv(tmp_name_file, sep=' ', header=None)

EmptyDataError: No columns to parse from file

In [None]:
# Read every log file into its own frame, tagged with server, release and era.
data_list = []
empty_files = []  # files pandas could not parse because they contain no data

for f in notebook.tqdm(data_files):
    try:
        tmp_d = pd.read_csv(f, sep=' ', header=None)
    except pd.errors.EmptyDataError:
        # Remember the file and keep going instead of aborting the whole load
        empty_files.append(f)
        continue

    # Server name is the alphanumeric run that follows 'lifeLog_' in the path
    tmp_server = str_extract('(?<=lifeLog_)[a-zA-Z0-9]+', f)
    tmp_ver = find_version(f)
    tmp_era = find_era(tmp_ver)

    # Prepend file-level metadata so every row still carries it after concatenation
    tmp_d.insert(0, 'server', tmp_server)
    tmp_d.insert(1, 'release', tmp_ver)
    tmp_d.insert(2, 'era', tmp_era)
    data_list.append(tmp_d)

print('%i empty files skipped' % len(empty_files))

In [None]:
# Stack the per-file frames row-wise into a single table
raw_data = pd.concat(data_list, axis=0)
raw_data.tail(5)

Deaths:

In [None]:
# Keep the rows whose fourth column (the first field read from the log) is 'D'
death_raw = (
    raw_data.loc[raw_data.iloc[:, 3].eq('D')]
    .copy()
    .reset_index(drop=True)
)
death_raw.head()

Births:

In [None]:
# Keep the rows whose fourth column (the first field read from the log) is 'B'
birth_raw = (
    raw_data.loc[raw_data.iloc[:, 3].eq('B')]
    .copy()
    .reset_index(drop=True)
)
birth_raw.head()

### Clean up data

In [None]:
# Column layout shared by the death and birth tables
shared_header = [
    # Metadata added while loading each file
    'server', 'release', 'era',
    # Event fields
    'event', 'timestamp', 'playerID', 'hash',
    'age', 'sex', 'location', 'parent',
    'cause_of_death', 'pop', 'chain', 'killer',
]

#### Deaths

In [None]:
death_data = death_raw.copy()

# Death rows lack these columns; add them as NaN placeholders at the positions
# they occupy in `shared_header`
for position, column in [(10, 'parent'), (13, 'chain'), (14, 'killer')]:
    death_data.insert(position, column, np.nan)

death_data.columns = shared_header

# Drop rows that have no location
death_data = death_data.dropna(subset=['location'])

death_data.head()

In [None]:
# Replace the raw 'age=...' and 'pop=...' fields with their numeric values.
# Raw strings keep `\d` and `\.` as regex escapes rather than (invalid) Python
# string escapes.
for i, row in notebook.tqdm(death_data.iterrows(), total=death_data.shape[0]):
    try:
        age = float(str_extract(r'(?<=age=)\d+\.\d+', row['age']))
        pop = int(str_extract(r'(?<=pop=)[0-9]+', row['pop']))
    except TypeError:
        # re.search rejects non-string fields (e.g. NaN); leave such rows untouched
        continue

    # Write back only when both fields parsed, so a row is never half-updated
    death_data.at[i, 'age'] = age
    death_data.at[i, 'pop'] = pop

Clean up IDs, locations, causes of death:

In [None]:
# Locations: evaluate each location string as a Python literal
# (assumes the strings are well-formed tuples — TODO confirm for all rows)
death_data['location'] = death_data['location'].apply(make_tuple)

# Check for murdered players: pull the killer's ID out of 'killer_<id>' first,
# then collapse those causes of death into the single label 'murdered'
murderers = death_data['cause_of_death'].str.extract(r'(?<=killer_)([0-9]+)', expand=False)

# regex=True is passed explicitly: without it, pandas versions that default to
# literal matching would leave 'killer_<id>' values unchanged
death_data['cause_of_death'] = death_data['cause_of_death'].str.replace(
    r'killer_[0-9]+', 'murdered', regex=True)
death_data['killer'] = murderers

print(death_data['cause_of_death'].unique())
print(death_data['killer'].unique()[:10])

death_data.head()

#### Births

In [None]:
birth_data = birth_raw.copy()

# Birth rows lack these columns; add them as NaN placeholders at the positions
# they occupy in `shared_header`
for position, column in [(7, 'age'), (11, 'cause_of_death'), (14, 'killer')]:
    birth_data.insert(position, column, np.nan)

birth_data.columns = shared_header

# Drop rows that have no location
birth_data = birth_data.dropna(subset=['location'])

birth_data.head()

In [None]:
# Replace the raw 'pop=...' and 'chain=...' fields with their integer values
for idx, record in notebook.tqdm(birth_data.iterrows(), total=len(birth_data)):
    try:
        pop_value = int(str_extract('(?<=pop=)[0-9]+', record['pop']))
        chain_value = int(str_extract('(?<=chain=)[0-9]+', record['chain']))

        birth_data.at[idx, 'pop'] = pop_value
        birth_data.at[idx, 'chain'] = chain_value
    except TypeError:
        # Rows that raise TypeError are left as they are
        continue

birth_data.head()

Clean up IDs, locations, parents (these steps are currently commented out, so the cell below only previews the data):

In [None]:
# NOTE(review): every cleaning step in this cell is commented out, so the only
# statement that runs is the `birth_data.head()` preview at the bottom; this cell
# leaves `birth_data['location']` and `birth_data['parent']` untouched.
# If the steps are re-enabled, note that `.int` on the last commented line is not
# a pandas Series attribute — presumably `.astype(int)` was intended (TODO confirm).

# Fix messed-up tuples
# birth_data['location'] = np.where(birth_data['location'].str.strip().str[-1] == ')',
#                                   birth_data['location'],
#                                   birth_data['location'] + ')')

# # Then proceed
# birth_data['location'] = birth_data['location'].apply(make_tuple)
# birth_data['parent'] = np.where(birth_data['parent'] == 'noParent', 
#                                 -1,
#                                 birth_data['parent'].str.extract(r'(?<=parent=)([0-9]+)'))
# birth_data['parent'] = birth_data['parent'].int
birth_data.head()

#### Save outputs

In [None]:
# Stack deaths and births, order the events by server and timestamp, and
# renumber the rows
lifelog_data = (
    pd.concat([death_data, birth_data])
    .sort_values(by=['server', 'timestamp'])
    .reset_index(drop=True)
)

lifelog_data.head()

Split by eras and save:

In [None]:
# Write one TSV per (server, era) combination that has data.
eras = ['arc', 'rift', 'boundless']

# Servers present in the loaded files (parsed from the 'lifeLog_<server>' part of each path)
all_servers = [str_extract('(?<=lifeLog_)[a-zA-Z0-9]+', f) for f in data_files]
servers = np.unique(all_servers)

# to_csv does not create missing directories, so make sure the target exists
os.makedirs('outputs', exist_ok=True)

for e in notebook.tqdm(eras):
    for s in servers:
        era_data = lifelog_data[(lifelog_data['era'] == e) & (lifelog_data['server'] == s)]
        if era_data.empty:
            print('No %s data for %s; nothing written' % (e, s))
            continue

        era_data = era_data.reset_index(drop=True)
        # For server 'bigserver2' this yields the same filenames as before
        era_fname = 'outputs/lifelogs_%s_%s_data.tsv' % (s, e)
        era_data.to_csv(era_fname, sep='\t', index=True)

In [None]:
# Sanity check: which eras are present in the combined table
np.unique(lifelog_data['era'])

In [None]:
# Preview the last rows of the combined table
lifelog_data.tail(5)