# Wrangling lifelog data (July 2019–May 2020)
Natalia Vélez, last updated June 2020

In this notebook:

* Load, clean up lifelog data from 2019-07-30 to *2020-05-29* (updated!)
* Split data by version
* Prepare inputs for subsequent analyses (census, family trees, migration patterns)

The period studied spans the start of the MapChange logs to the most recent data download (versions 251–340)

In [1]:
%matplotlib inline

import os, re, glob, random, datetime
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from os.path import join as opj
from tqdm import tqdm_notebook
from ast import literal_eval as make_tuple

sns.set_context('paper')

Helper functions:

In [2]:
gsearch = lambda *args: glob.glob(opj(*args))
str_extract = lambda pattern, s: re.search(pattern, s).group(0)

## Parse version history

In future analyses, we'll want to split data by release; different releases of the game will have different items, mechanics, etc. To do that, we'll parse the version history here and get the start and end dates for each release.

In [3]:
ver_file = '../1_download/outputs/version_history.tsv'

# Load file
ver = pd.read_csv(ver_file, sep='\t')
ver.head()

Unnamed: 0,release,timestamp
0,0,1490908000.0
1,16,1492207000.0
2,17,1492226000.0
3,19,1492472000.0
4,20,1495232000.0


`find_version`: Helper function. Takes a filename as input, finds the corresponding release.

In [4]:
def find_version(file):
    file_date = date_extract(file)

    tmp_ver = ver.copy()
    tmp_ver['lag'] = file_date - tmp_ver['timestamp']
    tmp_ver = tmp_ver[tmp_ver['lag'] >= 0]

    file_ver = tmp_ver.loc[tmp_ver['lag'].idxmin()].release
    
    return file_ver

## Clean up data

We first need to filter files by date, to pick out files within the range we're interested in. (This might be a roundabout way of doing it—suggestions welcome.)

`date_extract`: Helper function. Takes a basename as a string (e.g., '2019_03March_23_Saturday.txt') and returns an integer representation of the date (e.g., 20190323).

In [5]:
def date_extract(s):
    
    date_regex = '([0-9]{4})_([0-9]{2})[A-Za-z]+_([0-9]{2})'
    date_search = re.findall(date_regex, s)
    date_str = ''.join(date_search[0])
    date_dt = datetime.datetime.strptime(date_str, '%Y%m%d')
    date_tstamp = date_dt.timestamp()
    
    return date_tstamp

List all files:

In [6]:
data_dir = '../data'
all_files = gsearch(data_dir, 'publicLifeLogData', 'lifeLog*', '2*y.txt')
all_files.sort()
print(*all_files[:10], sep='\n')

../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_25_Friday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_26_Saturday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_27_Sunday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_28_Monday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_29_Tuesday.txt
../data/publicLifeLogData/lifeLog_bigserver1.onehouronelife.com/2019_01January_30_Wednesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_29_Tuesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_30_Wednesday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_01January_31_Thursday.txt
../data/publicLifeLogData/lifeLog_bigserver2.onehouronelife.com/2019_02February_01_Friday.txt


Extract dates:

In [7]:
all_dates = [date_extract(f) for f in all_files]
print(*all_dates[:10], sep='\n')

1548403200.0
1548489600.0
1548576000.0
1548662400.0
1548748800.0
1548835200.0
1548748800.0
1548835200.0
1548921600.0
1549008000.0


Filter files within range:

In [8]:
start = datetime.datetime(2019, 7, 30, 0, 0).timestamp()
end = datetime.datetime(2020, 5, 29, 0, 0).timestamp()

# Check if date is within range
between_dates = lambda d: (d >= start) & (d <= end)

file_df = pd.DataFrame({'file': all_files,
                        'date': all_dates})
file_df['include'] = file_df.date.apply(between_dates)
file_df = file_df[file_df.include]

file_df.head()

Unnamed: 0,file,date,include
188,../data/publicLifeLogData/lifeLog_bigserver2.o...,1564470000.0,True
189,../data/publicLifeLogData/lifeLog_bigserver2.o...,1564556000.0,True
190,../data/publicLifeLogData/lifeLog_bigserver2.o...,1564643000.0,True
191,../data/publicLifeLogData/lifeLog_bigserver2.o...,1564729000.0,True
192,../data/publicLifeLogData/lifeLog_bigserver2.o...,1564816000.0,True


In [9]:
data_files = file_df['file'].values
print('%i files found' % len(data_files))
print(*[os.path.basename(f) for f in data_files[:20]], sep='\n')

4349 files found
2019_07July_30_Tuesday.txt
2019_07July_31_Wednesday.txt
2019_08August_01_Thursday.txt
2019_08August_02_Friday.txt
2019_08August_03_Saturday.txt
2019_08August_04_Sunday.txt
2019_08August_05_Monday.txt
2019_08August_06_Tuesday.txt
2019_08August_07_Wednesday.txt
2019_08August_08_Thursday.txt
2019_08August_09_Friday.txt
2019_08August_10_Saturday.txt
2019_08August_11_Sunday.txt
2019_08August_12_Monday.txt
2019_08August_13_Tuesday.txt
2019_08August_14_Wednesday.txt
2019_08August_15_Thursday.txt
2019_08August_16_Friday.txt
2019_08August_17_Saturday.txt
2019_08August_18_Sunday.txt


Load all files:

In [10]:
data_list = []
empty_files = []
for f in tqdm_notebook(data_files):
    try:
        tmp_server = str_extract('(?<=lifeLog_)[a-zA-Z0-9]+', f)
        tmp_ver = find_version(f)
        
        tmp_d = pd.read_csv(f, sep =' ', header=None)
        tmp_d.insert(0, 'server', tmp_server)
        tmp_d.insert(0, 'release', tmp_ver)
        data_list.append(tmp_d)
    except:
        empty_files.append(f)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=4349.0), HTML(value='')))




In [11]:
raw_data = pd.concat(data_list)
raw_data.head()

Unnamed: 0,release,server,0,1,2,3,4,5,6,7,8
0,254.0,bigserver2,D,1564444828,1842380,583a6de73718f5dd0ddf388f4e68dc060b15e6df,age=1.81,F,"(48,94)",hunger,pop=82
1,254.0,bigserver2,D,1564444830,1842301,520ddb069aa77dcb202dd0310a8852e8fddc58dd,age=8.00,M,"(40,350)",hunger,pop=82
2,254.0,bigserver2,B,1564444832,1842398,583a6de73718f5dd0ddf388f4e68dc060b15e6df,F,"(-325,209)",parent=1842145,pop=84,chain=3
3,254.0,bigserver2,B,1564444836,1842399,520ddb069aa77dcb202dd0310a8852e8fddc58dd,M,"(67,-266)",parent=1842397,pop=83,chain=2
4,254.0,bigserver2,D,1564444859,1842311,dfd85ac03c4dd577352484b023d19d521b592696,age=7.76,M,"(-200,-123)",hunger,pop=82


Deaths:

In [12]:
death_raw = raw_data[raw_data.iloc[:,2] == 'D'].copy().reset_index(drop=True)
death_raw.head()

Unnamed: 0,release,server,0,1,2,3,4,5,6,7,8
0,254.0,bigserver2,D,1564444828,1842380,583a6de73718f5dd0ddf388f4e68dc060b15e6df,age=1.81,F,"(48,94)",hunger,pop=82
1,254.0,bigserver2,D,1564444830,1842301,520ddb069aa77dcb202dd0310a8852e8fddc58dd,age=8.00,M,"(40,350)",hunger,pop=82
2,254.0,bigserver2,D,1564444859,1842311,dfd85ac03c4dd577352484b023d19d521b592696,age=7.76,M,"(-200,-123)",hunger,pop=82
3,254.0,bigserver2,D,1564444861,1842145,f766576701306189b31a049054c0840b39f84c51,age=18.89,F,"(-322,242)",hunger,pop=82
4,254.0,bigserver2,D,1564444863,1842359,9381dc0dab74b62eee67d3dc275f5c61ea51bcd0,age=4.23,M,"(-308,258)",hunger,pop=82


Births:

In [13]:
birth_raw = raw_data[raw_data.iloc[:,2] == 'B'].copy().reset_index(drop=True)
birth_raw.head()

Unnamed: 0,release,server,0,1,2,3,4,5,6,7,8
0,254.0,bigserver2,B,1564444832,1842398,583a6de73718f5dd0ddf388f4e68dc060b15e6df,F,"(-325,209)",parent=1842145,pop=84,chain=3
1,254.0,bigserver2,B,1564444836,1842399,520ddb069aa77dcb202dd0310a8852e8fddc58dd,M,"(67,-266)",parent=1842397,pop=83,chain=2
2,254.0,bigserver2,B,1564444863,1842400,dfd85ac03c4dd577352484b023d19d521b592696,M,"(16,-57)",parent=1842304,pop=84,chain=2
3,254.0,bigserver2,B,1564444864,1842401,fd6e14532b361175d5203c50a22d20fe951c89f0,M,"(146,49)",parent=1842336,pop=84,chain=2
4,254.0,bigserver2,B,1564444866,1842402,f766576701306189b31a049054c0840b39f84c51,F,"(68,64)",parent=1842334,pop=84,chain=2


### Clean up data

In [14]:
shared_header = ['release', 'server', 'event', 'timestamp', 'playerID', 'hash', 'uniqueID', 'age', 'sex',
                 'location', 'parent', 'cause_of_death', 'killer', 'pop', 'chain']

#### Deaths

In [15]:
death_data = death_raw.copy()
# Insert missing columns
death_data.insert(6, 'uniqueID', np.nan)
death_data.insert(10, 'parent', np.nan)
death_data.insert(13, 'chain', np.nan)
death_data.insert(12, 'killer', np.nan)
death_data.columns = shared_header

for i, row in tqdm_notebook(death_data.iterrows(), total=death_data.shape[0]):
    age = str_extract('(?<=age=)\d+\.\d+', row['age'])
    age = float(age)
    
    pop = str_extract('(?<=pop=)[0-9]+', row['pop'])
    pop = int(pop)
    
    death_data.at[i, 'age'] = age
    death_data.at[i, 'pop'] = pop

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=1191532.0), HTML(value='')))




Clean up IDs,  locations, causes of death

In [16]:
# Unique ID: Player ID + hash
death_data['uniqueID'] = death_data.apply(lambda row: '%s_%s_%s' % (row['playerID'], row['server'], row['release']),
                                          axis=1)

# Locations
death_data['location'] = death_data['location'].apply(make_tuple)

# Check for murdered players
murderers = death_data['cause_of_death'].str.extract(r'(?<=killer_)([0-9]+)', expand=False)
death_data['cause_of_death'] = death_data['cause_of_death'].str.replace("killer_[0-9]+", "murdered")
death_data['killer'] = murderers

print(death_data['cause_of_death'].unique())
print(death_data['killer'].unique()[:10])

death_data.head()

['hunger' 'disconnect' 'murdered' 'oldAge']
[nan '1841694' '1841943' '1841921' '1842167' '1841831' '1842476' '1842228'
 '1842430' '1842343']


Unnamed: 0,release,server,event,timestamp,playerID,hash,uniqueID,age,sex,location,parent,cause_of_death,killer,pop,chain
0,254.0,bigserver2,D,1564444828,1842380,583a6de73718f5dd0ddf388f4e68dc060b15e6df,1842380_bigserver2_254.0,1.81,F,"(48, 94)",,hunger,,82,
1,254.0,bigserver2,D,1564444830,1842301,520ddb069aa77dcb202dd0310a8852e8fddc58dd,1842301_bigserver2_254.0,8.0,M,"(40, 350)",,hunger,,82,
2,254.0,bigserver2,D,1564444859,1842311,dfd85ac03c4dd577352484b023d19d521b592696,1842311_bigserver2_254.0,7.76,M,"(-200, -123)",,hunger,,82,
3,254.0,bigserver2,D,1564444861,1842145,f766576701306189b31a049054c0840b39f84c51,1842145_bigserver2_254.0,18.89,F,"(-322, 242)",,hunger,,82,
4,254.0,bigserver2,D,1564444863,1842359,9381dc0dab74b62eee67d3dc275f5c61ea51bcd0,1842359_bigserver2_254.0,4.23,M,"(-308, 258)",,hunger,,82,


#### Births

In [17]:
birth_data = birth_raw.copy()
# Insert missing columns
birth_data.insert(6, 'uniqueID', np.nan)
birth_data.insert(7, 'age', np.nan)
birth_data.insert(11, 'cause_of_death', np.nan)
birth_data.insert(12, 'killer', np.nan)
birth_data.columns = shared_header

for i, row in tqdm_notebook(birth_data.iterrows(), total=birth_data.shape[0]):    
    pop = str_extract('(?<=pop=)[0-9]+', row['pop'])
    pop = int(pop)
    
    chain = str_extract('(?<=chain=)[0-9]+', row['chain'])
    chain = int(chain)
    
    birth_data.at[i, 'pop'] = pop
    birth_data.at[i, 'chain'] = chain

birth_data.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=1193101.0), HTML(value='')))




Unnamed: 0,release,server,event,timestamp,playerID,hash,uniqueID,age,sex,location,parent,cause_of_death,killer,pop,chain
0,254.0,bigserver2,B,1564444832,1842398,583a6de73718f5dd0ddf388f4e68dc060b15e6df,,,F,"(-325,209)",parent=1842145,,,84,3
1,254.0,bigserver2,B,1564444836,1842399,520ddb069aa77dcb202dd0310a8852e8fddc58dd,,,M,"(67,-266)",parent=1842397,,,83,2
2,254.0,bigserver2,B,1564444863,1842400,dfd85ac03c4dd577352484b023d19d521b592696,,,M,"(16,-57)",parent=1842304,,,84,2
3,254.0,bigserver2,B,1564444864,1842401,fd6e14532b361175d5203c50a22d20fe951c89f0,,,M,"(146,49)",parent=1842336,,,84,2
4,254.0,bigserver2,B,1564444866,1842402,f766576701306189b31a049054c0840b39f84c51,,,F,"(68,64)",parent=1842334,,,84,2


Clean up IDs, locations, parents:

In [18]:
birth_data['uniqueID'] = birth_data.apply(lambda row: '%s_%s_%s' % (row['playerID'], row['server'], row['release']), 
                                          axis=1)
birth_data['location'] = birth_data['location'].apply(make_tuple)
birth_data['parent'] = birth_data['parent'].str.extract(r'(?<=parent=)([0-9]+)')

birth_data.head()

Unnamed: 0,release,server,event,timestamp,playerID,hash,uniqueID,age,sex,location,parent,cause_of_death,killer,pop,chain
0,254.0,bigserver2,B,1564444832,1842398,583a6de73718f5dd0ddf388f4e68dc060b15e6df,1842398_bigserver2_254.0,,F,"(-325, 209)",1842145,,,84,3
1,254.0,bigserver2,B,1564444836,1842399,520ddb069aa77dcb202dd0310a8852e8fddc58dd,1842399_bigserver2_254.0,,M,"(67, -266)",1842397,,,83,2
2,254.0,bigserver2,B,1564444863,1842400,dfd85ac03c4dd577352484b023d19d521b592696,1842400_bigserver2_254.0,,M,"(16, -57)",1842304,,,84,2
3,254.0,bigserver2,B,1564444864,1842401,fd6e14532b361175d5203c50a22d20fe951c89f0,1842401_bigserver2_254.0,,M,"(146, 49)",1842336,,,84,2
4,254.0,bigserver2,B,1564444866,1842402,f766576701306189b31a049054c0840b39f84c51,1842402_bigserver2_254.0,,F,"(68, 64)",1842334,,,84,2


#### Save outputs

In [19]:
lifelog_data = pd.concat([death_data, birth_data])
lifelog_data = lifelog_data.sort_values(by=['server', 'timestamp'])
lifelog_data = lifelog_data.reset_index(drop=True)

lifelog_data.head()

Unnamed: 0,release,server,event,timestamp,playerID,hash,uniqueID,age,sex,location,parent,cause_of_death,killer,pop,chain
0,254.0,bigserver2,D,1564444828,1842380,583a6de73718f5dd0ddf388f4e68dc060b15e6df,1842380_bigserver2_254.0,1.81,F,"(48, 94)",,hunger,,82,
1,254.0,bigserver2,D,1564444830,1842301,520ddb069aa77dcb202dd0310a8852e8fddc58dd,1842301_bigserver2_254.0,8.0,M,"(40, 350)",,hunger,,82,
2,254.0,bigserver2,B,1564444832,1842398,583a6de73718f5dd0ddf388f4e68dc060b15e6df,1842398_bigserver2_254.0,,F,"(-325, 209)",1842145.0,,,84,3.0
3,254.0,bigserver2,B,1564444836,1842399,520ddb069aa77dcb202dd0310a8852e8fddc58dd,1842399_bigserver2_254.0,,M,"(67, -266)",1842397.0,,,83,2.0
4,254.0,bigserver2,D,1564444859,1842311,dfd85ac03c4dd577352484b023d19d521b592696,1842311_bigserver2_254.0,7.76,M,"(-200, -123)",,hunger,,82,


In [20]:
lifelog_data.to_csv('outputs/lifelogs_201907-202005_data.tsv', sep='\t', index=True)