In [None]:
import numpy as np
import random
# import to do training
from tpg.trainer import Trainer
# import to run an agent (always needed)
from tpg.agent import Agent
# faster training
import multiprocessing as mp
# visual tools
from IPython.display import clear_output
import time
import matplotlib.pyplot as plt
# for writing
import csv
from datetime import date

In [None]:
class GridWorld_fig11:
    """Deterministic grid world with walls and a short-term repeat penalty.

    States are ``(row, col)`` tuples. Action 0 decreases the row (north),
    1 increases the row (south), 2 increases the column (east) and any
    other value decreases the column (west).

    Rewards handed out by ``step``:
      * -0.01 for a legal move,
      * -1 for a move that leaves the grid or hits a wall (agent stays put),
      * an extra -1 when the resulting state is among the most recent
        ``memory_limit`` visited states,
      * +100 (and termination) when the resulting state is ``win_state``.

    Args:
        rows: number of grid rows.
        cols: number of grid columns.
        win_state: ``(row, col)`` of the goal cell.
        start_state: ``(row, col)`` the agent is placed on at reset.
        walls: optional iterable of ``(row, col)`` cells that cannot be
            entered. Defaults to the layout this class was written for.
    """

    # Cells that can never be entered in the default layout.
    DEFAULT_WALLS = ((1, 1), (2, 0), (2, 1), (1, 3), (2, 3), (2, 4))

    def __init__(self, rows, cols, win_state, start_state, walls=None):
        # ring buffer of the most recently visited states
        self.memory = []
        # slot of ``memory`` that is overwritten next once the buffer is full
        self.memory_position = 0
        self.memory_limit = 20
        self.rows = rows
        self.cols = cols
        self.start_state = start_state
        self.win_state = win_state
        chosen_walls = self.DEFAULT_WALLS if walls is None else walls
        # normalise to tuples so membership tests match the tuple states
        self.walls = frozenset(tuple(cell) for cell in chosen_walls)
        self.current_state = self.start_state

    def sample_action(self):
        """Return one of the actions 0..3 uniformly at random."""
        rand = random.uniform(0, 1)
        # Quartile of the draw; min() maps the rand == 1.0 endpoint onto 3.
        return min(int(rand * 4), 3)

    def reset(self):
        """Put the agent back on the start cell and forget visited states.

        Clearing the memory keeps the repeat-state penalty from leaking from
        one episode into the next.
        """
        self.current_state = self.start_state
        self.memory = []
        self.memory_position = 0
        return self.current_state

    # just reset for now...
    def close(self):
        """Reset the environment; always returns 1."""
        self.reset()
        return 1

    def check_win(self):
        """Return True when the agent stands on the win state."""
        return self.current_state == self.win_state

    def _remember(self, state):
        """Record ``state`` so that only the last ``memory_limit`` are kept."""
        if self.memory_limit <= 0:
            return
        if len(self.memory) < self.memory_limit:
            self.memory.append(state)
        else:
            # buffer is full: overwrite the oldest entry and advance the cursor
            self.memory[self.memory_position] = state
            self.memory_position = (self.memory_position + 1) % self.memory_limit

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, terminate)``."""
        row = self.current_state[0]
        col = self.current_state[1]
        # north
        if action == 0:
            target = (row - 1, col)
        # south
        elif action == 1:
            target = (row + 1, col)
        # east
        elif action == 2:
            target = (row, col + 1)
        # west (also the fallback for any other action value)
        else:
            target = (row, col - 1)

        terminate = False
        reward = 0
        # a move is legal when it stays on the grid and does not enter a wall
        in_bounds = (0 <= target[0] <= self.rows - 1) and (0 <= target[1] <= self.cols - 1)
        if in_bounds and target not in self.walls:
            self.current_state = target
            reward -= 0.01
        else:
            reward -= 1

        # punish repeat states within the last ``memory_limit`` states; a
        # blocked move leaves the agent on a remembered cell and is punished too
        if self.current_state in self.memory:
            reward -= 1

        if self.check_win():
            reward += 100
            terminate = True

        self._remember(self.current_state)

        return self.current_state, reward, terminate

In [None]:
def runAgent(args):
    """Evaluate one agent on the grid world and record its score.

    Args:
        args: sequence ``(agent, envName, scoreList, numEpisodes, numFrames,
            gen)``. ``numEpisodes`` is the number of times the game is
            repeated, ``numFrames`` the step budget per episode. ``gen`` is
            accepted but not used here.

    Side effects:
        Appends ``(agent.team.id, agent.team.outcomes)`` to ``scoreList``
        exactly once, and (unless the task was already done) rewards the
        agent with its mean episode score under ``envName``.

    Returns:
        The list of states visited in the last episode if that episode
        reached the win state, otherwise ``0`` (including when the agent is
        skipped because it already completed the task).
    """
    agent, envName, scoreList, numEpisodes, numFrames, _gen = args[:6]

    # skip if task already done by agent
    if agent.taskDone(envName):
        print('Agent #' + str(agent.agentNum) + ' can skip.')
        scoreList.append((agent.team.id, agent.team.outcomes))
        # same "no winning path" value as the normal exit below
        return 0

    env = GridWorld_fig11(5, 5, (0, 4), (4, 0))

    scoreTotal = 0
    # defined up front so the final check works even with zero episodes/frames
    states = []
    isDone = False
    for ep in range(numEpisodes):
        state = env.reset()
        scoreEp = 0
        states = []
        isDone = False
        for _ in range(numFrames): # frame loop
            # action selection
            act = agent.act(state)

            state, reward, isDone = env.step(act)
            states.append(state)
            scoreEp += reward

            # win
            if isDone:
                print('win!')
                print(states)
                break

        print('Agent #' + str(agent.agentNum) +
              ' | Ep #' + str(ep) + ' | Score: ' + str(scoreEp))

        scoreTotal += scoreEp

    # mean score over the episodes that were actually played
    if numEpisodes > 0:
        scoreTotal /= numEpisodes
    env.close()
    agent.reward(scoreTotal, envName)
    scoreList.append((agent.team.id, agent.team.outcomes))
    if isDone:
        return states
    else:
        return 0

In [None]:
# NOTES:
# pActAtom (called pAtomic in earlier notes) is 1.0 so that, per the original
# author's intent, teams are single nodes: starting off, teams don't
# reference other teams.
# If maxTeamSize is set, we can try to maximize the use of actually useful
# learners; this might help with the consistency of success...

# Training budget, named once so the evaluation call below stays readable.
NUM_GENERATIONS = 500
EPISODES_PER_AGENT = 10
FRAMES_PER_EPISODE = 100

trainer = Trainer(actions=4, teamPopSize=50, pActAtom=1.0,
                  nRegisters=4, initMaxActProgSize=48, gap=0.5)

envName = 'fig11'
allScores = []   # one (min, max, average) fitness tuple per generation
champions = []   # winning path of the best-scoring winner per generation ([] if none)
for gen in range(NUM_GENERATIONS):
    scoreList = []

    agents = trainer.getAgents()

    best_score = float('-inf')
    curr_champion = []
    for agent in agents:
        run = runAgent([agent, envName, scoreList,
                        EPISODES_PER_AGENT, FRAMES_PER_EPISODE, gen])
        # runAgent appends exactly one entry per call, so the newest entry
        # belongs to this agent
        agent_score = scoreList[-1][1][envName]
        # ``run`` is a non-empty list of states only for a winning run; the
        # truthiness test also rejects the 0 / None "no path" results
        if run and agent_score > best_score:
            best_score = agent_score
            curr_champion = run
            print(run)

    champions.append(curr_champion)

    # apply scores, must do this when multiprocessing
    # because agents can't refer to trainer
    teams = trainer.applyScores(scoreList)

    trainer.evolve(tasks=[envName]) # go into next gen

    # fitness statistics of the generation that was just evaluated
    scoreStats = trainer.fitnessStats

    allScores.append((scoreStats['min'], scoreStats['max'], scoreStats['average']))

    print('Gen: ' + str(gen))
    print('Results so far: ' + str(allScores))

print('Results:\nMin, Max, Avg')
for min_score, max_score, avg_score in allScores:
    print(min_score, max_score, avg_score)

In [None]:
# Max fitness per generation. The x-axis is derived from the recorded history
# rather than a fixed generation count, so the plot still works when training
# was interrupted or run for a different number of generations.
y = [score[1] for score in allScores]
x = list(range(len(y)))
plt.xlabel('Generation')
plt.ylabel('Max Score')
plt.plot(x, y)
plt.show()

In [None]:
# Average fitness per generation. The x-axis is derived from the recorded
# history rather than a fixed generation count, so the plot still works when
# training was interrupted or run for a different number of generations.
y = [score[2] for score in allScores]
x = list(range(len(y)))
plt.xlabel('Generation')
plt.ylabel('Average Score')
plt.plot(x, y)
plt.show()