# qtpg

### imports

In [201]:
import numpy as np
import random

### Downing fig 11 GridWorld 

#### GridWorld is based on fig 11 from Downing's "Reinforced Genetic Programming" paper
#### As shown below, it is a 5x5 maze
![DowningFig11](downing_fig_11.png)
#### The current implementation involves the following rewards per action:
#####  +100 to score upon reaching the goal
#####  -0.01 to score per action
#####  -1 to score upon repeating a state that was repeated n amount of times (n currently is 20)
#####  -1 to score upon hitting a wall or going out of bounds

In [260]:
# will use Downing fig 11 for testing on this
class GridWorld:
    def __init__ (self, rows, cols, win_state, start_state):
        self.memory = []
        self.memory_position = 0
        self.memory_limit = 20
        self.rows = rows
        self.cols = cols
        self.start_state = start_state
        self.win_state = win_state
        self.current_state = self.start_state
        
    def sample_action (self):
        rand = random.uniform(0, 1)
        if (rand >= 0) and (rand < 0.25):
            return 0
        elif (rand >= 0.25) and (rand < 0.5):
            return 1
        elif (rand >= 0.5) and (rand < 0.75):
            return 2
        else:
            return 3
        
    def reset (self):
        self.current_state = self.start_state
        return self.current_state
        
    # just reset for now...
    def close (self):
        self.current_state = self.start_state
        return 1
    
    def check_win (self):
        if self.current_state == self.win_state:
            return True
        return False
    
    def step (self, action):
        # north
        if action == 0:
            next = (self.current_state[0] - 1, self.current_state[1])
        # south
        elif action == 1:
            next = (self.current_state[0] + 1, self.current_state[1])
        # east
        elif action == 2:
            next = (self.current_state[0], self.current_state[1] + 1)
        # west
        else:
            next = (self.current_state[0], self.current_state[1] - 1)

        terminate = False
        reward = 0
        # check if move is legal
        if (next[0] >= 0 and next[0] <= (self.rows-1)) and (next[1] >= 0 and next[1] <= (self.cols-1)):            
            illegal = 0
            if (next == (1, 2)) or (next == (1, 3)) or (next == (2, 2)) or (next == (2, 3)):
                illegal = 1
                    
            if (illegal == 0):
                self.current_state = next
                reward += 0.01
            else:
                #reward -= 1
                reward += 0
        else:
            #reward -= 1
            reward += 0
            
        # punish repeat states within last 20 states
        if self.current_state in self.memory:
            #reward -= 1
            reward += 0
        
        if self.check_win():
            reward += 100
            terminate = True
        
        # add new state to memory
        if len(self.memory) <= self.memory_limit:
            (self.memory).append(self.current_state)
        # after memory is full, begin overriding it
        else:
            if self.memory_position < self.memory_limit:
                self.memory[self.memory_position] = self.current_state
                self.memory_position += 1
            else:
                self.memory_position = 0
                self.memory[self.memory_position] = self.current_state
        
        return self.current_state, reward, terminate

### Reinforcement Learning Functions

In [261]:
class q_table: 
    def __init__ (self):
        self.q = []
    
    def create (self, agents):
        for agent in agents:
            team = agent.team
            for learner in team.learners:
                (self.q).append({'team': str(team.id), 'learner': str(learner.id), 'action': learner.actionObj.actionCode, 'q': 0})
    
    def update (self, team_id, learner_id, action, q_value):
        (self.q).append({'team': str(team_id), 'learner': str(learner_id), 'action': action, 'q': q_value})
    
    def display (self):
        for entry in self.q:
            print(entry)

In [262]:
def get_learners (team):
    print('Getting learners for team: ' + str(team.id))
    return team.learners

In [263]:
def evaluate (team, state, epsilon, q_table):
    #learners = get_learners(team)
    #top_bid = 0
    top_learner = None
    action = None   

    # get best learner
    actVars = {'frameNum':-1}
    valid_learners = [lrnr for lrnr in team.learners if lrnr.isActionAtomic()]
    top_learner = max(valid_learners, key=lambda lrnr: lrnr.bid(state, actVars=actVars))

#     for learner in learners:
#         bid = learner.bid(state)
#         if (bid > top_bid):
#             top_bid = bid
#             top_learner = learner 

    if top_learner == None:
        print('No top learner found!')
        return None, 0
    else:
        # e greedy action selection
        e_prob = random.uniform(0, 1)

        actions = []
        top_q = 0
        top_action = None
        for entry in q_table.q:
#             print('Entry team id: ' + entry['team'])
#             print('Team id: ' + team.id)
#             print('Entry learner id: ' + entry['learner'])
#             print('Top learner id: ' + top_learner.id)
#             print(' ')
#             print(' ')
            if (entry['team'] == str(team.id)) and (entry['learner'] == str(top_learner.id)):
                actions.append(entry['action'])
                #print('Action: ' + str(entry['action']))
                if entry['q'] > top_q:
                    top_q = entry['q']
                    top_action = entry['action']
        
        #print('Actions: ' + str(len(actions)))

        if e_prob < epsilon:
            if len(actions) == 1:
                action = actions[0]
            else:
                rand_action = random.randint(0, len(actions)-1)
                action = actions[rand_action]
        else:
            # select action with highest q value from top learner's actions
            action = top_action
    
    return top_learner, action

In [264]:
def update (q_table, team, next_learner, action, learner, reward):
    alpha = 0.1
    discount = 0.1
    
    # find the greatest q value out of possible actions for learner t+1
    second_max_q = 0
    for second_learner in q_table.q:
        if second_learner['team'] == team.id and second_learner['learner'] == next_learner.id:
            if second_learner['q'] > second_max_q:
                second_max_q = second_learner['q']
    
    # find the current learner and q update
    for first_learner in q_table.q:
        if first_learner['team'] == team.id and first_learner['learner'] == learner.id and first_learner['action'] == action:
            # equation 1 from tpg pdf
            first_learner['q'] += alpha * (reward + (discount * second_max_q) - first_learner['q'])

In [269]:
def evaluate_fitness (q_table, team, env):
    epsilon = 0.2 # where should I define this??
    l_t, a_t = evaluate(team, env.current_state, epsilon, q_table)
    t = 0
    t_max = 10000
    total_reward = 0
    while t < t_max:
        s_next, reward, isDone = env.step(a_t)
        #print(reward)
        total_reward += reward
        if isDone:
            print('done!')
            return total_reward
        l_next, a_next = evaluate(team, env.current_state, epsilon, q_table)
        if l_t.id != l_next.id:
            update(q_table, team, l_next, a_t, l_t, reward)
        a_t = a_next
        l_t = l_next
        t = t + 1
    return total_reward

### TPG

In [270]:
# uncomment and run only to update local branch of tpg
# current local branch [June 4 2021]: new-tpg 
# pip install ../PyTPG/.

In [271]:
# tpg imports
# import to do training
from tpg.trainer import Trainer
# import to run an agent (always needed)
from tpg.agent import Agent
# visual tools
from IPython.display import clear_output
import time
import matplotlib.pyplot as plt
# for writing
import csv
from datetime import date

In [272]:
trainer = Trainer(actions=4, teamPopSize=50, pActAtom=1.0, 
                      nRegisters=4, initMaxActProgSize=48, gap=0.5)
table = q_table()

envName = 'GridWorld'
env = GridWorld(5, 5, (0, 4), (4, 0))

scoreList = []

for gen in range(500):
    agents = trainer.getAgents()
    
    if gen == 0:
        table.create(agents)
    
    for agent in agents:
        team = agent.team
        #for team in agent.teams:
        env.reset()
        fitness = evaluate_fitness(table, team, env)
        scoreList.append(fitness)
        
    # evolution :)
    print(scoreList)
    teams = trainer.applyScores(scoreList)
    trainer.evolve(envName=[envName])
    scoreStats = trainer.fitnessStats
    
    # evolve q table

[0, 0, 40.46000000000052, 0, 0, 0, 0, 0, 0, 41.01000000000041, 0.04, 0, 41.38000000000034, 0.04, 0, 0, 0, 0, 0, 40.640000000000484, 0, 0, 39.15000000000078, 0.04, 0, 0.04, 0, 0, 39.86000000000064, 0, 0, 0, 0, 40.5600000000005, 0.04, 0, 0, 0.04, 0, 0, 0, 0, 0, 0, 39.67000000000068, 0, 0.04, 40.840000000000444, 0, 0]


TypeError: 'int' object is not subscriptable

In [None]:
trainer = Trainer(actions=4, teamPopSize=50, pActAtom=1.0, 
                      nRegisters=4, initMaxActProgSize=48, gap=0.5)

agents = trainer.getAgents()

env = GridWorld(5, 5, (4, 0), (0, 4))

action = env.sample_action()
state, reward, isDone = env.step(action)

for agent in agents:
    team = agent.team
#     for team in agent.team:
    for learner in team.learners:
        temp = team.id
        print(temp)
        print(learner.id)
        print(str(learner.actionObj.actionCode))
        print(learner.getAction((1, 1), False))
        #actVars = None
        #print(learner.bid((1, 1), actVars = actVars))
        print('-----------')
    print('HEYHEYHEY STARTSTARTSTART')
    valid_learners = [lrnr for lrnr in team.learners if lrnr.isActionAtomic()]
    print('VALID LERNERS')
    print(valid_learners)
    actVars = {'frameNum':-1}
    print(actVars['frameNum'])
    top_learner = max(valid_learners, key=lambda lrnr: lrnr.bid(state, actVars=actVars))
    print(top_learner)
    print('HEYHEYHEY   ENDENDENZD')