# Expand transitions
Natalia Velez, April 2021

In this file, we:
   1. Parse the transition files
   2. Expand category transitions into transitions between individual objects, with a separate workflow for pattern transitions
   3. Upload expanded transitions to database

In [2]:
import os, re, glob, json
from os.path import join as opj
import numpy as np
import pandas as pd
import json
import networkx as nx 
import pymongo
import pprint
from tqdm import notebook
from itertools import cycle

# Embed html-formatted text
# (Used to do QA on transitions)
# Import from the public IPython.display module; the IPython.core.display path
# is an internal location whose use for these names is deprecated
from IPython.display import display, HTML

def embed(s):
    '''Render the HTML-formatted string `s` as rich output in the notebook.'''
    return display(HTML(s))

## Load data from database

Connect:

In [3]:
keyfile = '../6_database/credentials.key'

# Read the credentials file (first line: username, second line: password).
# The context manager closes the file handle as soon as it has been read.
with open(keyfile, "r") as f:
    creds = f.read().splitlines()

#Connection string
myclient = pymongo.MongoClient('134.76.24.75', username=creds[0], password=creds[1], authSource='ohol')
print(myclient)

# Handle to the `ohol` database, used by all of the loading cells below
ohol = myclient.ohol
print(ohol)

MongoClient(host=['134.76.24.75:27017'], document_class=dict, tz_aware=False, connect=True, authsource='ohol')
Database(MongoClient(host=['134.76.24.75:27017'], document_class=dict, tz_aware=False, connect=True, authsource='ohol'), 'ohol')


Load objects:

In [4]:
# Fetch every document in the `objects` collection and tabulate it
objects = pd.DataFrame(list(ohol.objects.find()))

print('Loading %i objects' % len(objects))
objects.head()

Loading 4161 objects


Unnamed: 0,_id,id,name,containable,containSize,vertSlotRot,permanent,minPickupAge,heldInHand,blocksWalking,...,floorHugging,slotsLocked,noFlip,sideAccess,creationSoundForce,invisCont,slotPos,vert,spritesDrawnBehind,spritesAdditiveBlend
0,60750c9e68dcd62afd70767e,11,Skin Tone A &B &C &D &E &F,0,1.0,0.0,0,3,0,0,...,,,,,,,,,,
1,60750c9e68dcd62afd70767f,19,Female001 D,0,1.0,0.0,0,3,0,0,...,0.0,,,,,,,,,
2,60750c9e68dcd62afd707680,30,Wild Gooseberry Bush,0,1.0,0.0,1,3,0,0,...,0.0,,,,,,,,,
3,60750c9e68dcd62afd707681,31,Gooseberry,1,1.0,0.0,0,3,1,0,...,0.0,0.0,,,,,,,,
4,60750c9e68dcd62afd707682,32,Big Hard Rock,0,1.0,0.0,1,3,0,0,...,0.0,,,,,,,,,


Load categories:

In [5]:
# Fetch every document in the `categories` collection and tabulate it
categories = pd.DataFrame(list(ohol.categories.find()))

print('Loading %i categories' % len(categories))
categories.head()

Loading 269 categories


Unnamed: 0,_id,parentID,numObjects,probabilistic,pattern,children,probs
0,60750cbd68dcd62afd7086bf,316,3,False,True,"[2505, 2517, 3002]",
1,60750cbd68dcd62afd7086c0,319,3,False,True,"[2506, 2522, 3003]",
2,60750cbd68dcd62afd7086c1,320,3,False,True,"[2510, 2520, 3004]",
3,60750cbd68dcd62afd7086c2,321,3,False,True,"[2507, 2524, 3007]",
4,60750cbd68dcd62afd7086c3,322,3,False,True,"[2508, 2523, 3008]",


Load transitions:

In [6]:
# The four ID columns that define a transition, plus derived column names
trans_cols = ['origActor', 'origTarget', 'newActor', 'newTarget']
cat_cols = ['%s_cat' % col for col in trans_cols]
pattern_cols = ['%s_pattern' % col for col in trans_cols]

# Fetch all transitions and keep only the four ID columns
transitions = pd.DataFrame(list(ohol.transitions.find()))[trans_cols]

print('Loading %i transitions' % len(transitions))
transitions.head()

Loading 4847 transitions


Unnamed: 0,origActor,origTarget,newActor,newTarget
0,67,2970,0,2966
1,235,1890,1892,0
2,441,445,441,444
3,0,2671,2539,2665
4,239,335,327,291


## Expand transitions between categories

### Helper functions

Useful functions for debugging:

In [7]:
def obj_name_elem(o):
    '''
    Look up the name of a single object ID in `objects`.
    The IDs 0 and -1 are reported as 'empty' without a lookup.
    
    Input: int (object ID)
    Output: str
    '''
    if o not in [0, -1]:
        return objects.loc[objects.id == o, 'name'].values[0]
    return 'empty'
# Element-wise version that also accepts arrays of IDs
obj_name = np.vectorize(obj_name_elem)

def display_transition(row):
    '''
    Render one transition as color-coded HTML:
    (origActor, origTarget) -> (newActor, newTarget), with object names in place of IDs
    
    Input:
    row: pd.Series (must contain fields: origActor, origTarget, newActor, newTarget;
         any other fields, e.g. 'prob', are ignored)
    '''
    # Get item names. Only the four transition columns hold object IDs; looking up
    # other fields (e.g. a boolean 'prob' flag) as if they were IDs can fail
    names = row[trans_cols].apply(obj_name)
    
    # Generate HTML tags (actors and targets get different highlight colors)
    tag = '<mark style="background-color: %s;">%s</mark>'
    actor_color = '#91d8f2'
    target_color = '#ffbf6b'
    out = '(%s,%s) &#x2192; (%s,%s)' % (tag % (actor_color, names['origActor']),
                                        tag % (target_color, names['origTarget']),
                                        tag % (actor_color, names['newActor']),
                                        tag % (target_color, names['newTarget']))
    
    embed(out)

In [8]:
# Sanity check: render one raw transition with object names instead of IDs
display_transition(transitions.loc[1])

Helper functions: Identify categories

In [9]:
# Helper function: Is this object ID the parent ID of a category?
cat_names = categories.parentID.values
def is_cat_elem(objID):
    '''True if objID appears among the category parent IDs'''
    return objID in cat_names
is_cat = np.vectorize(is_cat_elem)

# Helper function: Is this category a "pattern" category?
# (Pattern categories are parsed differently from regular categories - see below)
pattern_cats = categories.loc[categories.pattern, 'parentID'].values
def is_pattern_elem(cat):
    '''True if cat appears among the parent IDs of pattern categories'''
    return cat in pattern_cats
is_pattern = np.vectorize(is_pattern_elem)

In [10]:
# Some examples: a plain object, a regular category, and a pattern category
for example_id in (33, 903, 3831):
    print(obj_name(example_id))
    print('Category?: %s' % is_cat_elem(example_id))
    print('Pattern?: %s\n' % is_pattern_elem(example_id))

Stone
Category?: False
Pattern?: False

@ Tillable Row
Category?: True
Pattern?: False

@ Non-Dug track
Category?: True
Pattern?: True



Helper functions to read out information about categories:

In [11]:
def is_real_obj(cat):
    '''
    Does this ID name a real object, as opposed to a generic category label?
    Generic labels are recognized by a name starting with "@"
    e.g., @ Tillable row --> False
    
    Input: int (must match an entry in objects.id)
    Output: bool
    '''
    parent_name = objects.loc[objects.id == cat, 'name'].values[0]
    return parent_name[0] != '@'

def cat_children(cat, pattern=False):
    '''
    Return the object IDs that a single ID stands for
    
    Inputs:
    cat: int (object ID; may or may not be the parent ID of a category)
    pattern: bool; if True and the category's parent is a real object (its name
             does not start with "@"), the parent ID is prepended to the children
    
    Output:
    - if cat is a category: its `children` entry from `categories`
      (a list with cat prepended when the pattern rule above applies)
    - otherwise: a one-element numpy array containing cat itself
    '''
    if is_cat(cat):
        children = categories[categories.parentID == cat]['children'].values[0]
        # Logical `and` (not bitwise `&`): is_real_obj is only looked up when
        # pattern expansion was actually requested
        if pattern and is_real_obj(cat):
            return [cat] + list(children)
        else:
            return children
    else:
        return np.array([cat])
    
def n_children_elem(cat):
    '''
    Number of entries returned by cat_children for this ID
    (the IDs 0 and -1 count as having no children)
    '''
    return 0 if cat in [0, -1] else len(cat_children(cat))
# Element-wise version that also accepts arrays of IDs
n_children = np.vectorize(n_children_elem)

In [12]:
# Some examples: inspect one regular category and list its children by name
example_cat = 903
print(obj_name(example_cat))
print('Real object? %s' % is_real_obj(example_cat))
print('%i children:' % n_children_elem(example_cat))
print('\n'.join(str(obj_name(o)) for o in cat_children(example_cat)))

@ Tillable Row
Real object? False
2 children:
Shallow Tilled Row - groundOnly +biomeBlock4
Fertile Soil Pile


Put together combinations of different categories:

In [13]:
def expand_pairs(arr1, arr2):
    '''
    Pair up the elements of two sequences (e.g., old and new targets).
    If either sequence has exactly one element, that element is repeated to
    match the other sequence; otherwise elements are paired position by
    position, stopping at the end of the shorter sequence.
    
    Inputs:
    arr1, arr2: lists
    
    Outputs:
    pairs: list of tuples
    '''
    if len(arr2) == 1:
        left, right = arr1, cycle(arr2)
    elif len(arr1) == 1:
        left, right = cycle(arr1), arr2
    else:
        left, right = arr1, arr2
    
    return list(zip(left, right))

In [14]:
# Some examples: a one-element list is paired with every element of the other list
expand_pairs(['scrambled_egg'], ['bacon', 'sausage', 'ham'])

[('scrambled_egg', 'bacon'),
 ('scrambled_egg', 'sausage'),
 ('scrambled_egg', 'ham')]

### Expand non-pattern categories

In [15]:
def expand_transitions(row):
    '''
    Takes a row from the original transitions dataset, expands any transitions containing category labels,
    and returns the set of child transitions

    Inputs:
    row: pd.Series object (must contain fields: origActor, newActor, origTarget, newTarget;
         any other fields are ignored)
    
    Outputs:
    ex_transitions: pd.DataFrame with one row per expanded transition
                    (columns: origActor, origTarget, newActor, newTarget)
    '''
    # Expand category labels into lists of children. Restrict to the four
    # transition columns so that extra fields on the row (e.g., flag columns)
    # are never treated as object IDs
    ex_inputs = row[trans_cols].apply(cat_children)

    # Match up children to make expanded transitions:
    # old/new actors are paired with each other, as are old/new targets
    actors = expand_pairs(ex_inputs['origActor'], ex_inputs['newActor'])
    targets = expand_pairs(ex_inputs['origTarget'], ex_inputs['newTarget'])

    # Every actor pair is combined with every target pair
    ex_transitions = pd.DataFrame([(oa, ot, na, nt)
                                   for oa,na in actors
                                   for ot,nt in targets],
                                 columns = trans_cols)
    
    return ex_transitions

In [16]:
# Show one transition before and after regular category expansion
nonpattern_transition = transitions.loc[5]
display_transition(nonpattern_transition)

nonpattern_expanded = expand_transitions(nonpattern_transition)

print('After expansion:')
for _,row in nonpattern_expanded.iterrows():
    display_transition(row)

After expansion:


### Expand **pattern** categories

According to [this comment](https://github.com/JustinLove/onetech/blob/mapping/process/src/TransitionImporter.js) in the onetech repository, pattern categories need to be parsed differently:
```
  // Pattern categories work differently than regular categories:
  // 1. The parentID is an actual object and should stick around
  // 2. A transition is only considered if all matching pattern
  //    categories have the same number of objectIDs
  // 3. For each objectID, a new transition is created which maps
  //    each other pattern category objectID to the new object 
```

After digging through examples of particular transitions, I found three exceptions to this comment:

1. Some parent items are generic category labels. Category objects start with the "@" character (e.g., "@ High Heat Source", "@ Tillable Row") and do not correspond to any one item. In these cases, we don't add the parent IDs to the expanded transitions.
2. Some transitions violate (2), but the parent IDs alone specify a valid transition. These invalid transitions are often superseded by a transition between specific items (e.g., yellow paint + sheet metal --> empty bucket). In the main loop, we'll check for pattern transitions that introduce contradictions and will filter those out.

e.g., (<strong>1904</strong>, 4652) --> (659, 4675)<br />
(<strong>bucket of {blue, yellow, black, green} paint</strong>, stack of sheet metal) --> (empty bucket, stack of blue sheet metal)

3. Some transitions violate (2) because the new actor/target are empty. These transitions should still be added.

e.g., (-1, <strong>1769</strong>) --> (0, 0)<br />
(decay, <strong>dead {German Shepherd, Collie, Beagle, Pit Bull, ...}</strong>) --> (empty, empty) :(((


In [68]:
def expand_pattern_transitions(row):
    '''
    Takes a row from the transitions dataset that contains pattern categories
    and returns the set of child transitions
    
    Inputs:
    row: pd.Series object (must contain fields: origActor, newActor, origTarget, newTarget;
         any other fields are ignored)
    
    Outputs:
    trans_df: pd.DataFrame with columns origActor, origTarget, newActor, newTarget:
      - one row per expanded transition, if all pattern categories in the row
        have the same number of children
      - otherwise the original transition as a single row, if every pattern
        category's parent is a real object
      - otherwise no rows (the transition is discarded)
    '''
    trans_in = row[trans_cols]
    
    # Check for valid categories
    row_cats = trans_in.apply(is_pattern)
    cat_lengths = trans_in.apply(n_children)
    valid_pattern = cat_lengths[row_cats].nunique() == 1
    real_parents = trans_in[row_cats].apply(is_real_obj).all()

    # If it's a real pattern: Expand out
    if valid_pattern: 
        # Pattern entries become their list of children; all other entries are
        # wrapped in a one-element array
        ex_inputs = np.where(row_cats,
                             trans_in.apply(lambda c: cat_children(c, pattern=True)),
                             trans_in.apply(lambda c: np.array([c])))
        ex_inputs = pd.Series(ex_inputs, index=trans_cols)

    # If it's not and the parent items are a real item: Treat as a single transition    
    elif real_parents: 
        ex_inputs = trans_in.copy()

    # Otherwise: Just toss it
    else:
        # Return an empty frame directly: exploding a row of empty lists would
        # yield one all-NaN row instead of no rows
        return pd.DataFrame(columns=trans_cols)

    # Convert into dataframe and expand out
    trans_df = pd.DataFrame(ex_inputs).T
    trans_df = trans_df.apply(lambda col: col.explode(), axis=0).reset_index(drop=True)
    
    return trans_df

In [18]:
# Show one transition before and after pattern category expansion
pattern_transition = transitions.loc[258]
display_transition(pattern_transition)

pattern_expanded = expand_pattern_transitions(pattern_transition)

print('After expansion:')
for _,row in pattern_expanded.iterrows():
    display_transition(row)

After expansion:


### Main loop: Assemble dataframe of expanded transitions

Helper function: Is this category probabilistic? (only probabilistic categories can have multiple outcomes)

In [29]:
def is_probabilistic_elem(cat):
    '''
    Value of the `probabilistic` field for this category;
    False for any ID that is not a category
    '''
    if not is_cat(cat):
        return False
    return categories.loc[categories.parentID == cat, 'probabilistic'].values[0]
    
# Element-wise version that also accepts arrays of IDs
is_probabilistic = np.vectorize(is_probabilistic_elem)

print(obj_name(1966))
print(is_probabilistic_elem(1966))

Random Die Roll
True


Rearrange transitions in priority order:

Single objects --> Non-pattern categories --> Pattern categories

In [32]:
ordered_transitions = transitions.copy()

# Compute every flag from the four transition columns only, so that flag columns
# added on earlier lines are never themselves looked up as object IDs
trans_ids = ordered_transitions[trans_cols]
ordered_transitions['prob'] = trans_ids.apply(is_probabilistic).apply(any, axis=1)
ordered_transitions['category'] = trans_ids.apply(is_cat).apply(any, axis=1)*1
ordered_transitions['pattern'] = trans_ids.apply(is_pattern).apply(any, axis=1)*1

# Priority: 0 = no category in the row, 1 = category but no pattern category,
# 2 = at least one pattern category (pattern categories are also categories)
ordered_transitions['priority'] = ordered_transitions['category'] + ordered_transitions['pattern']
ordered_transitions = ordered_transitions.sort_values(by='priority').reset_index()

ordered_transitions

Unnamed: 0,index,origActor,origTarget,newActor,newTarget,prob,category,pattern,priority
0,0,67,2970,0,2966,False,0,0,0
1,3106,4435,-1,4431,4434,False,0,0,0
2,3107,467,468,467,469,False,0,0,0
3,3108,2531,557,0,2688,False,0,0,0
4,3110,2256,67,0,2264,False,0,0,0
...,...,...,...,...,...,...,...,...,...
4842,880,-1,1018,0,1019,False,1,1,2
4843,3810,323,-1,239,321,False,1,1,2
4844,1471,394,1207,394,1208,False,1,1,2
4845,3102,964,1015,964,1016,False,1,1,2


origActor     3863
origTarget    1884
newActor       659
newTarget     3865
Name: 4, dtype: object

Main loop:

In [73]:
n_original = ordered_transitions.shape[0]
print('Original transitions: %i' % n_original)

# Expand each transition with the workflow that matches its flags
expanded_list = []
for idx, row in notebook.tqdm(ordered_transitions.iterrows(), total=n_original):
    if row['pattern'] == 1:
        # Rows with a pattern category use the pattern workflow
        row_df = expand_pattern_transitions(row)
    elif row['category'] == 1:
        # Rows with only regular categories use the regular workflow
        row_df = expand_transitions(row)
    else:
        # Rows without categories pass through as a one-row dataframe
        row_df = pd.DataFrame(row[trans_cols]).T
        
    # Carry the parent transition's probabilistic flag over to its children
    row_df['prob'] = row['prob']
    expanded_list.append(row_df)

# Stack the per-transition results into one dataframe
expanded_all = pd.concat(expanded_list)
expanded_size_orig = len(expanded_all)

# Collapse exact duplicate rows
expanded_df = expanded_all.drop_duplicates().reset_index(drop=True)
expanded_size_final = len(expanded_df)

n_removed = expanded_size_orig - expanded_size_final
print('Original expanded transitions: %i' % expanded_size_orig)
print('Removed %i duplicate transitions' % n_removed)
print('Final expanded # of transitions: %i' % expanded_size_final)

Original transitions: 4847


  0%|          | 0/4847 [00:00<?, ?it/s]

Original expanded transitions: 20419
Removed 783 duplicate transitions
Final expanded of transitions: 19636


In [89]:
# Find contradictory transitions: count the rows for each
# (origActor, origTarget, prob) combination, then keep the non-probabilistic
# combinations that occur more than once
n_products = (expanded_df
              .groupby(['origActor','origTarget', 'prob'])
              .size()
              .reset_index(name='counts'))
multiple_outcomes = n_products.counts > 1
is_contradictory = n_products[(~n_products.prob) & multiple_outcomes]

Unnamed: 0,origActor,origTarget,prob,counts
43,-1,142,False,2
44,-1,143,False,2
111,-1,420,False,2
132,-1,531,False,2
133,-1,532,False,2
...,...,...,...,...
18522,4539,4391,False,2
18535,4539,4592,False,2
18548,4540,4616,False,2
18564,4569,4616,False,2


In [96]:
# Inspect one contradictory input pair: every expanded transition with
# origActor -1 and origTarget 420
same_inputs = (expanded_df.origActor == -1) & (expanded_df.origTarget == 420)
for _, r in expanded_df[same_inputs].iterrows():
    display_transition(r)

Upload expanded transitions to database

In [60]:
# Debugging: inspect the row before the one at `idx`
# NOTE(review): `idx` is presumably the loop variable left over from a run of the
# main loop (`for idx, row in ...`) — this cell fails on a fresh kernel; confirm
ordered_transitions.loc[idx-1]

index          1688
origActor       778
origTarget      764
newActor       1132
newTarget      1385
prob          False
category          1
pattern           1
priority          2
Name: 4786, dtype: object

In [61]:
# Debugging: inspect the row at `idx`
# NOTE(review): `idx` is presumably the loop variable left over from a run of the
# main loop (`for idx, row in ...`) — this cell fails on a fresh kernel; confirm
ordered_transitions.loc[idx]

index          1563
origActor        59
origTarget     2226
newActor          0
newTarget      2220
prob          False
category          1
pattern           1
priority          2
Name: 4787, dtype: object

In [62]:
# Debugging: re-run the pattern expansion on `row`
# NOTE(review): `row` is presumably left over from an earlier loop. The saved output
# of this cell is a ValueError, but its execution count is lower than that of the
# cell defining expand_pattern_transitions, so the error may predate the current
# version of the function — confirm, then clear or delete this cell
expand_pattern_transitions(row)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# NOTE(review): `expanded_transitions` is not assigned anywhere in the visible
# notebook (the main loop builds `expanded_list` and `expanded_df`) — confirm which
# name was intended; as written this cell would raise a NameError
pd.DataFrame(expanded_transitions, columns=trans_cols)

## Search through Tech tree

Find naturally-occurring objects -- these are the roots of the Tech tree:

In [None]:
# Keep the id and name of every object with a positive `mapChance`
natural_obj_df = objects[objects['mapChance'] > 0][['id', 'name']]
natural_objs = natural_obj_df.id

# Report how many objects seed the tree (print the message; calling len() on it
# only computed the length of the string and discarded it)
print('Seeding tree with %i naturally-occurring objects' % len(natural_objs))
natural_obj_df

Helper function: Find all transitions where a given object is an ingredient

In [None]:
# Initialize with natural objects
# (both names are bound to the same `natural_objs` object, not to copies)
ingredients = previously_visited = natural_objs

# o = 33
# transitions[(transitions.origActor == o) | (transitions.origTarget == o)]


## QA