# Activity matrices
Natalia Vélez, June 2020

In [1]:
import glob
import os

import numpy as np
import pandas as pd
from tqdm import notebook

Read features:

In [2]:
# Text file listing one feature (item ID) per line
feature_f = 'outputs/activity_features.txt'
with open(feature_f, 'r') as handle:
    feature_text = handle.read()

# Keep the IDs as strings, in file order
features = feature_text.splitlines()

print('Found %i unique items' % len(features))
print(*features[:50], sep='\t')

Found 2805 unique items
30	31	32	33	34	35	39	40	45	48	49	53	54	55	57	58	59	61	62	63	64	65	66	67	68	69	70	71	72	73	74	75	77	78	79	80	82	83	85	86	87	92	96	99	100	101	103	104	105	106


Find map change files:

In [3]:
# Every tab-separated map log in the maplog output folder
# (glob returns them in arbitrary, filesystem-dependent order)
map_files = glob.glob('outputs/maplog/*.tsv')

# Preview the first few paths, one per line
print('\n'.join(map_files[:5]))

outputs/maplog/maplog_release-300_start-1578345719.tsv
outputs/maplog/maplog_release-330_start-1588118683.tsv
outputs/maplog/maplog_release-314_start-1583729303.tsv
outputs/maplog/maplog_release-330_start-1588032283.tsv
outputs/maplog/maplog_release-304_start-1580317696.tsv


## Wrangling function
**TODO:** This wrangling should be done in 3_1 instead; it lives here for now because I kept going back and forth about how much information to keep in the wrangled dataframes.

In [4]:
def map_crosstab(f, categories=None):
    """Build a player-by-object count matrix from a single map log.

    Parameters
    ----------
    f : str or file-like
        Tab-separated map log, passed straight to ``pd.read_csv``. It must
        contain ``player_id`` and ``object_id`` columns.
    categories : list of str, optional
        Object IDs (as strings) that become the columns of the matrix.
        Defaults to the notebook-level ``features`` list.

    Returns
    -------
    pandas.DataFrame
        Cross-tabulation of event counts indexed by player ID, with one
        column per entry in ``categories`` (zero-count columns are kept).
        Object IDs absent from ``categories`` become missing values when the
        column is made categorical.
    """
    # Thresholds for collapsing high object IDs into one bucket.
    # NOTE(review): presumably IDs above 5000 are non-standard objects and
    # 9999 is a catch-all code -- confirm against the feature list.
    max_regular_id = 5000
    overflow_id = 9999

    if categories is None:
        categories = features

    # Force object IDs to text: a log holding only plain numeric IDs would
    # otherwise be inferred as integers, which breaks both the '0' comparison
    # and the .str accessor below.
    map_df = pd.read_csv(f, sep='\t', dtype={'object_id': str})

    # Trim invalid events
    has_player = map_df['player_id'] > 0    # No player attached to event
    has_object = map_df['object_id'] != '0'  # Interact w/ empty square
    # Copy so the column assignments below never write into a view
    map_df = map_df.loc[has_player & has_object, ['object_id', 'player_id']].copy()

    # Clean up modifiers: leading "f" and trailing "u<digits>"/"v<digits>".
    # regex=True is required; pandas >= 2.0 treats the pattern literally
    # by default, which would leave the modifiers in place.
    object_ids = map_df['object_id'].str.replace(
        r'^(f)|([u-v][0-9]+$)', '', regex=True
    )

    # Convert to number, collapse large IDs, then back to text so the values
    # line up with the string categories
    object_ids = object_ids.astype(np.int64)
    object_ids = object_ids.mask(object_ids > max_regular_id, overflow_id)
    object_ids = object_ids.astype(str)

    # Make objectID and playerID into categorical variables
    map_df['object_id'] = pd.Categorical(object_ids, categories=categories)
    map_df['player_id'] = pd.Categorical(map_df['player_id'].astype(np.int64))

    # Co-occurrence matrix; dropna=False keeps categories with no events
    map_df = map_df.reset_index(drop=True)
    map_occ = pd.crosstab(map_df.player_id, map_df.object_id, dropna=False)
    return map_occ

In [5]:
# Write one player-by-object matrix per map log, mirroring the maplog
# file layout under a "jobmatrix" name.
for f in notebook.tqdm(map_files):
    crosstab = map_crosstab(f)
    out_file = f.replace('maplog', 'jobmatrix')
    # The target folder is derived from the input path and may not exist yet
    # on a fresh checkout; create it so to_csv does not fail.
    os.makedirs(os.path.dirname(out_file) or '.', exist_ok=True)
    crosstab.to_csv(out_file, sep='\t', index_label='playerID')

HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))

KeyboardInterrupt: 