In [1]:
import numpy as np
import json
import pandas as pd

In [2]:
NUMBER_OF_POKEMON = 250
NUMBER_OF_MOVES = 800

# For N pokmon, there are N -1 different parent nodes that can be constructed. 
poke_gen = np.zeros((NUMBER_OF_POKEMON * 2 - 1, NUMBER_OF_MOVES))
poke_gen.shape

(499, 800)

In [3]:
'''
Read each pokemon data field and add the move data into the poke_gen matrix
'''
for poke_id in range(1, NUMBER_OF_POKEMON, + 1):
    with open('../data/%s.json' % poke_id, 'r') as poke_data_file:
        data = json.load(poke_data_file)

        moves = data['moves']
        for move in moves:
            move_id = int(move['move']['url'].split('/')[-2])
            poke_gen[poke_id - 1][move_id - 1] = 1

poke_gen.shape

(499, 800)

In [4]:
def compute_genetic_diff(poke_gen, a, b):
    '''
    Compute the genetic difference between two Pokemon.
    :param poke_gen np.array: a pokemon move matrix, where the row (i) represent pokemon with id=(i + 1) 
    :param a int: the index of the first pokemon
    :param b int: the index of the second pokemon
    :return: The genetic difference
    :rtype: float
    '''
    return np.abs(poke_gen[a] - poke_gen[b]).sum()

def compute_avg_gene(poke_gen, a, b):
    '''
    Compute the average genome between two Pokemon.
    :param poke_gen np.array: a pokemon move matrix, where the row (i) represent pokemon with id=(i + 1) 
    :param a int: the index of the first pokemon
    :param b int: the index of the second pokemon
    :return: The average genome
    :rtype: np.array
    '''
    return (poke_gen[a] + poke_gen[b]) / 2

import heapq

def build_phy_tree(poke_gen):
    '''
    Construct a phylogentic tree given N pokemon in a matrix. The algorithm assumes that each row contains
    a pokemon's genetic information (i.e moves)
    :param poke_gen np.array: a pokemon move matrix, where the row (i) represent pokemon with id=(i + 1)
    :return: A tree represented in matrix form where a generated common ancestor node links to two other nodes in the tree.
                and the index of the maximum parent node
    :rtype: tuple(np.array, int)
    '''
    distance_heap = []    
    # initial distance calculations
    for i in range(NUMBER_OF_POKEMON):
        for j in range(i + 1, NUMBER_OF_POKEMON):
            dist = compute_genetic_diff(poke_gen, i, j)
            heapq.heappush(distance_heap, (dist, i, j))

        
    max_parent_index = NUMBER_OF_POKEMON
    
    # the array that will hold the parent hood of different pokemons
    children = np.zeros((poke_gen.shape[0], 2)).astype(int) - 1
    # pokemon that have already have a common parent
    completed = set()
    while len(distance_heap) != 0:
        dist, i, j = heapq.heappop(distance_heap)
        if i not in completed and j not in completed:
            # convert children indices to their ID counterpart
            children[max_parent_index, 0] = i 
            children[max_parent_index, 1] = j 
            poke_gen[max_parent_index, :] = compute_avg_gene(poke_gen, i, j)
            completed.add(i)
            completed.add(j)
            
            for index in range(max_parent_index):
                if index not in completed:
                    p_dist = compute_genetic_diff(poke_gen, index, max_parent_index)
                    heapq.heappush(distance_heap, (p_dist, index, max_parent_index))
            
            max_parent_index += 1
            
    return children, max_parent_index
        
    

In [5]:
ptree = build_phy_tree(poke_gen)[0]
ptree

array([[ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,  -1],
       [ -1,

## Serializaing Tree matrix into Hierarchical JSON form

We can walk through the matrix backwards until we recover a tree structure

In [12]:
def serialize_tree(ptree):
    '''
    Convert the matrix representation of the tree into dict form.
    :param poke_gen np.array: a pokemon move matrix, where the row (i) represent pokemon with id=(i + 1)
    return: A recursive tree dict.
    rtype: dict
    '''
    pokemon_information = []

    for poke_id in range(1, NUMBER_OF_POKEMON + 1):
        with open('../data/%s.json' % poke_id, 'r') as poke_data_file:
            data = json.load(poke_data_file)
            pokemon_name = data['name']
            pokemon_information.append({'name': pokemon_name, 'id': poke_id})
    
    def _form_tree(node_index):
        if node_index < NUMBER_OF_POKEMON:
            # it's a regular pokemon, save information in dict.
            return pokemon_information[node_index]
    
        else:
            # parent node
            children = ptree[node_index]
            c1 = children[0]
            c2 = children[1]
            return {
                'id': int(node_index + 1),
                'children': [_form_tree(c1), _form_tree(c2)],
            }
        
    return _form_tree(ptree.shape[0] -2)

In [13]:
st = serialize_tree(ptree)
st

{'id': 498,
 'children': [{'name': 'mewtwo', 'id': 150},
  {'id': 497,
   'children': [{'id': 357,
     'children': [{'name': 'charizard', 'id': 6},
      {'id': 272,
       'children': [{'name': 'charmander', 'id': 4},
        {'name': 'charmeleon', 'id': 5}]}]},
    {'id': 496,
     'children': [{'name': 'delibird', 'id': 225},
      {'id': 495,
       'children': [{'id': 491,
         'children': [{'name': 'gligar', 'id': 207},
          {'name': 'sneasel', 'id': 215}]},
        {'id': 494,
         'children': [{'id': 492,
           'children': [{'id': 487,
             'children': [{'name': 'corsola', 'id': 222},
              {'id': 479,
               'children': [{'name': 'mantine', 'id': 226},
                {'id': 347,
                 'children': [{'name': 'wooper', 'id': 194},
                  {'name': 'quagsire', 'id': 195}]}]}]},
            {'id': 490,
             'children': [{'id': 488,
               'children': [{'id': 474,
                 'children': [{'id': 45

In [14]:
with open('../data/tree_data.json', 'w') as f:
    f.write(json.dumps(st))

In [15]:
## Generate the data needed for the web app to properly query for pokemon information.

In [17]:
pokemon_info = np.zeros((NUMBER_OF_POKEMON, 4)).astype(str)
for poke_id in range(1, NUMBER_OF_POKEMON + 1):
    with open('../data/%s.json' % poke_id, 'r') as f:
        poke_json = json.loads(f.read())
        poke_name = poke_json['name']
        
        types = poke_json['types']
        
        type1 = poke_json['types'][0]['type']['name']
        type2 = poke_json['types'][1]['type']['name'] if len(types) > 1 else ''
        pokemon_info[poke_id - 1, 0] = poke_id
        pokemon_info[poke_id - 1, 1] = poke_name
        pokemon_info[poke_id - 1, 2] = type1
        pokemon_info[poke_id - 1, 3] = type2
        

pokemon_info = np.hstack((pokemon_info, poke_gen[0: NUMBER_OF_POKEMON, :]))
pokemon_info

array([['1', 'bulbasaur', 'grass', ..., '0.0', '0.0', '0.0'],
       ['2', 'ivysaur', 'grass', ..., '0.0', '0.0', '0.0'],
       ['3', 'venusaur', 'grass', ..., '0.0', '0.0', '0.0'],
       ...,
       ['248', 'tyranitar', 'rock', ..., '0.0', '0.0', '0.0'],
       ['249', 'lugia', 'psychic', ..., '0.0', '0.0', '0.0'],
       ['250', 'ho-oh', 'fire', ..., '0.0', '0.0', '0.0']], dtype='<U32')

In [18]:
pd.DataFrame(data=pokemon_info).to_csv('pokemon_all_info.csv', header=None)