In [1]:
# Import statements.
import numpy as np
import random as rand
import torch
import math
import matplotlib.pyplot as plt
from ExperimentManager import Experiment
from torch import nn
import torch.nn.functional as F
import torch.distributions as tdist
%matplotlib inline

In [2]:
# Create the experiment manager. `manager` is read as a module-level global by
# POPULATION.run_population, which calls manager.print(...) and manager.save().
# NOTE(review): presumably the arguments are (output directory, experiment name, print callback),
# and the call prompts for the description shown below - confirm against ExperimentManager.
manager = Experiment.start_experiment('experimentsRNN/', 'experiment', print)

Please enter a brief description of this experiment:
Set end activation to relu, Hypers: (40, 1, agent_hypers, 1/4, 1/4, 1000)


In [3]:
# Generates network weights.
def generate_weights(starting_size, ending_size, weights_needed):
    """Return `weights_needed` integer layer widths spaced evenly between two sizes.

    The interval from `starting_size` to `ending_size` is divided into
    `weights_needed + 1` equal steps; the returned widths are the interior
    points (both endpoints excluded), each truncated toward zero with int().
    """
    step = (starting_size - ending_size) / (weights_needed + 1)
    return [int(starting_size - (step * k)) for k in range(1, weights_needed + 1)]

In [4]:
# RNN policy map.
class POLICY_MAP(nn.Module):
    """Recurrent policy network mapping a sequence of states to per-step action scores.

    Each state passes through the pre-hidden linear layers (sin activation), has the
    previous hidden vector added to it, then passes through the post-hidden linear
    layers (sin activation). The result is both the next hidden vector and the input
    to the final `policy_out` layer.
    """

    # Constructor.
    def __init__(self, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device):
        """Build the layers on `t_device`.

        input_size: width of each input state.
        hidden_size: width of the recurrent hidden vector.
        pre_hidden_count: number of intermediate layers between input and hidden width
            (one extra layer always maps the last width to `hidden_size`).
        post_hidden_count: number of hidden_size -> hidden_size layers applied after
            the hidden vector is added.
        class_count: number of outputs of the final layer.
        t_device: torch device that holds the layers and the initial hidden vector.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.pre_hidden_count = pre_hidden_count
        self.post_hidden_count = post_hidden_count
        self.class_count = class_count
        self.t_device = t_device

        # Intermediate widths that step from input_size toward hidden_size.
        weights = generate_weights(input_size, hidden_size, pre_hidden_count)
        self.pre_hidden_layers = []
        prev_weight = input_size
        for w in weights:
            self.pre_hidden_layers.append(nn.Linear(prev_weight, w).to(t_device))
            prev_weight = w
        self.pre_hidden_layers.append(nn.Linear(prev_weight, hidden_size).to(t_device))

        # These layers must be hidden_size -> hidden_size: their output is fed back as
        # the next hidden vector and added to a hidden_size-wide activation.
        self.post_hidden_layers = []
        for _ in range(post_hidden_count):
            self.post_hidden_layers.append(nn.Linear(hidden_size, hidden_size).to(t_device))

        self.policy_out = nn.Linear(hidden_size, class_count).to(t_device)
        self.sin = torch.sin
        self.relu = F.relu
        self.softmax = nn.Softmax(dim=-1)

        # The layers above live in plain Python lists, so nn.Module does not register
        # them. This attribute is the flat parameter LIST handed to the optimizer; note
        # that it shadows nn.Module.parameters(), so it is read as `model.parameters`,
        # not called.
        self.parameters = []
        for layer in self.pre_hidden_layers + self.post_hidden_layers + [self.policy_out]:
            self.parameters += list(layer.parameters())

    # Forward propagation.
    def forward(self, states, hidden=None, train=False):
        """Run the network over `states` one step at a time.

        states: iterable of tensors, one per time step.
        hidden: hidden vector carried in from a previous call; zeros when None.
        train: when True, return only the list of raw `policy_out` outputs;
            when False, return (list of ReLU-ed outputs, final hidden vector).
        """
        outs = []
        if hidden is None:
            hidden = torch.zeros(self.hidden_size).to(self.t_device)
        for x in states:
            for layer in self.pre_hidden_layers:
                x = self.sin(layer(x))
            # Out-of-place add: same values as `x += hidden` without mutating an
            # activation that autograd may still reference.
            x = x + hidden
            for layer in self.post_hidden_layers:
                x = self.sin(layer(x))
            hidden = x
            if train:
                outs.append(self.policy_out(x))
            else:
                outs.append(self.relu(self.policy_out(x)))
        if train:
            return outs
        else:
            return outs, hidden

In [5]:
# RNN Agent that plays the game.
class AGENT:
    """A single agent: a POLICY_MAP, its Adam optimizer, and helpers to train it and play a game."""

    # Constructor.
    def __init__(self, name, learning_rate, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device, s_device):
        """Create the policy network and optimizer.

        name: label used in progress output.
        learning_rate: Adam learning rate.
        input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count:
            forwarded unchanged to POLICY_MAP.
        t_device: torch device for the network and training tensors.
        s_device: torch device the policy is moved to before an action is sampled.
        """
        self.name = name
        self.learning_rate = learning_rate
        self.class_count = class_count
        self.t_device = t_device
        self.s_device = s_device
        self.policy_map = POLICY_MAP(input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device)
        # POLICY_MAP exposes `parameters` as a list attribute, so it is passed, not called.
        self.optimizer = torch.optim.Adam(self.policy_map.parameters, lr=learning_rate)
        self.cross_loss = nn.CrossEntropyLoss()
        self.age = 0

    # Train the agent.
    # Input data should be a list of trajectories, which should be of form [[states], [actions]].
    def train(self, trajectories, unroll_depth, batch_size=32, epochs=50, extra_info=''):
        """Fit the policy to the given trajectories with a summed per-step cross-entropy loss.

        trajectories: list of [states, actions]; each holds at least `unroll_depth`
            entries. The list is shuffled IN PLACE.
        unroll_depth: number of time steps taken from every trajectory.
        batch_size: trajectories per batch. A trailing partial batch is dropped.
        epochs: passes over the batches.
        extra_info: text appended to the progress line.

        Returns the mean loss over all optimizer steps, or NaN when no step ran
        (fewer than `batch_size` trajectories, or `epochs` == 0). Increments `self.age`.
        """
        batches = []
        batch = [[[] for _ in range(unroll_depth)], [[] for _ in range(unroll_depth)]]
        rand.shuffle(trajectories)
        count = 0
        for t in trajectories:
            # Regroup by time step: batch[0][i] / batch[1][i] hold step i of every trajectory.
            for i in range(unroll_depth):
                batch[0][i].append(t[0][i])
                batch[1][i].append(t[1][i])
            count += 1
            if count >= batch_size:
                batch[0] = [torch.stack(state).to(self.t_device) for state in batch[0]]
                batch[1] = [torch.Tensor(actions).long().to(self.t_device) for actions in batch[1]]
                batches.append(batch)
                batch = [[[] for _ in range(unroll_depth)], [[] for _ in range(unroll_depth)]]
                count = 0

        losses = []
        for e in range(epochs):
            for b, (inputs, targets) in enumerate(batches):
                # Reset gradients before every step; otherwise each update would use
                # the sum of the gradients of all previous batches and epochs.
                self.optimizer.zero_grad()
                outputs = self.policy_map(inputs, train=True)
                loss = 0
                for step_out, step_target in zip(outputs, targets):
                    loss = loss + self.cross_loss(step_out, step_target)
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())
                print('\rTRAINING {} | AGE {} | BATCH {}/{} | EPOCH {}/{} | LOSS {} | {}'.format(self.name, self.age, b+1, len(batches), e+1, epochs, losses[-1], extra_info), end='')
        self.age += 1
        if not losses:
            # No optimizer step ran; avoid dividing by zero.
            return float('nan')
        return sum(losses) / len(losses)

    # Plays the game.
    def play_game(self, env, render=False, extra_info=''):
        """Play one episode in `env` and record (state tensor, action) pairs.

        env: environment whose step(action) returns (observation, reward, done, info)
            with an 'ale.lives' entry in `info`.
        render: call env.render() after every step when True.
        extra_info: text appended to the progress line.

        Returns (groups, score, step): `groups` is a list of runs, one per life (a run
        is closed whenever 'ale.lives' changes or the episode ends), `score` is the
        summed reward and `step` the number of environment steps.
        """
        done = False
        action = 0
        score = 0
        step = 0
        # NOTE(review): assumes the game starts with 4 lives; a different starting
        # count would close a one-entry run on the first step - confirm for the env used.
        lives = 4
        hidden_state = None
        groups = []
        group = []
        env.reset()
        while not done:
            observation, reward, done, info = env.step(action)
            print('\r{} | STEP {} | SCORE {} | AGE {} {}\t\t'.format(self.name, step, score, self.age, extra_info), end = '')
            score += reward
            step += 1
            # Scale the observation by 1/255 and convert it to float32.
            tensor = torch.from_numpy(observation / 255).float().to(self.t_device)
            # Inference only: without no_grad the carried hidden state would keep the
            # autograd graph of every earlier step of this life alive.
            with torch.no_grad():
                policies, hidden_state = self.policy_map([tensor], hidden_state)
            policy = policies[0].to(self.s_device)
            if min(policy) < 0 or sum(policy) == 0:
                # Not a usable categorical weighting; fall back to a uniform random action.
                action = rand.randint(0, self.class_count - 1)
            else:
                distribution = torch.distributions.categorical.Categorical(policy)
                action = int(distribution.sample())
            group.append((tensor, action))
            if info['ale.lives'] != lives or done:
                # A life ended (or the episode did): close the run and reset the recurrence.
                groups.append(group)
                action = 0
                lives = info['ale.lives']
                hidden_state = None
                group = []
            if render:
                env.render()
        return groups, score, step

In [6]:
# Population.
class POPULATION:
    """A pool of agents that play the game, after which the worst scorers imitate the best ones."""

    # Constructor.
    def __init__(self, population_size, number_of_attempts, agent_params, teach_percent, train_percent, age_cutoff):
        """Create `population_size` agents named AGENT_0, AGENT_1, ...

        population_size: number of agents.
        number_of_attempts: games each agent plays per generation (best one is kept).
        agent_params: the eight positional AGENT arguments that follow the name.
        teach_percent: fraction of top-scoring agents whose runs become training data.
        train_percent: fraction of bottom-scoring agents that are trained.
        age_cutoff: stored only; nothing in this class reads it at present.
        """
        self.population_size = population_size
        self.number_of_attempts = number_of_attempts
        self.teach_percent = teach_percent
        self.train_percent = train_percent
        self.population = []
        self.agents_created = population_size
        self.age_cutoff = age_cutoff
        self.agent_params = agent_params
        for i in range(population_size):
            agent = AGENT(f'AGENT_{i}', *agent_params[:8])
            self.population.append(agent)
        self.generation = 0

    # Cuts recorded runs into fixed-length training windows.
    def prepare_data(self, groups, unroll_depth):
        """Slice every run into sliding windows of `unroll_depth` consecutive steps.

        groups: list of runs; a run is a list of (state, action) pairs.
        Returns a list of [states, actions] windows. A window ends before index
        `index` for index in [unroll_depth, len(run)), so the final entry of a run
        never appears in a window and runs of length <= unroll_depth yield nothing.
        """
        trajectories = []
        for total_run in groups:
            for index in range(unroll_depth, len(total_run)):
                window = total_run[index-unroll_depth:index]
                states = [pair[0] for pair in window]
                actions = [pair[1] for pair in window]
                trajectories.append([states, actions])
        return trajectories

    # Runs and trains the agents.
    def run_population(self, env, unroll_depth, epochs, batch_size, render=False):
        """Play one generation, then train the lowest scorers on the top scorers' runs.

        Every agent plays `number_of_attempts` games and is ranked by its best score.
        The top `teach_percent` provide trajectories; the bottom `train_percent` are
        trained on them. Progress is reported through the module-level `manager`.
        """
        new_pop = []
        total_score = 0
        high_score = None
        low_score = None
        manager.print('BEGIN RUNNING POPULATION | GENERATION {}'.format(self.generation))
        rand.shuffle(self.population)
        for agent in self.population:
            candidate_runs = []
            for g in range(self.number_of_attempts):
                groups, score, step = agent.play_game(env, render, f'| MEMBER {len(new_pop) + 1}/{len(self.population)} | GAME {g+1}/{self.number_of_attempts}')
                total_score += score
                candidate_runs.append((groups, score, step))
                if high_score is None or high_score < score:
                    high_score = score
                if low_score is None or low_score > score:
                    low_score = score
            # Keep only the agent's best-scoring game.
            candidate_runs.sort(key = lambda x: x[1], reverse=True)
            new_pop.append((agent, candidate_runs[0][0], candidate_runs[0][1]))
        print('')
        manager.print('END RUNNING POPULATION | AVERAGE SCORE {} | LOW SCORE {} | HIGH SCORE {}'.format(total_score / (len(new_pop) * self.number_of_attempts), low_score, high_score))
        manager.save()

        new_pop.sort(key = lambda x: x[2], reverse=True)
        teach_pop = new_pop[:int(len(new_pop) * self.teach_percent)]
        # Guard the zero case explicitly: new_pop[-0:] is the WHOLE list, which would
        # train every agent when the trainee count rounds down to 0.
        train_count = int(len(new_pop) * self.train_percent)
        train_pop = new_pop[-train_count:] if train_count > 0 else []

        examples = []
        for exp in teach_pop:
            examples += exp[1]
        trajectories = self.prepare_data(examples, unroll_depth)
        manager.print('BEGIN TRAINING POPULATION')
        count = 0
        losses = []
        for train in train_pop:
            agent = train[0]
            # NOTE: replacing agents older than age_cutoff is not implemented; no
            # replace_agent method exists in this class.
            loss = agent.train(trajectories, unroll_depth, batch_size, epochs, extra_info=f'| MEMBER {count+1}/{len(train_pop)}')
            losses.append(loss)
            count += 1
        print('')
        # With no trainees there is no loss to average; report NaN instead of dividing by zero.
        avg_loss = sum(losses)/len(losses) if losses else float('nan')
        manager.print('END TRAINING POPULATION | AVG LOSS {}'.format(avg_loss))
        manager.save()
        self.generation += 1

In [7]:
# Positional AGENT arguments after the name, in order: learning_rate, input_size, hidden_size,
# pre_hidden_count, post_hidden_count, class_count, t_device, s_device.
agent_hypers = (10**-4, 128, 80, 4, 2, 14, torch.device('cpu'), torch.device('cpu'))
# POPULATION arguments, in order: population_size=40, number_of_attempts=1, agent_params,
# teach_percent=1/4, train_percent=1/4, age_cutoff=1000.
population = POPULATION(40, 1, agent_hypers, 1/4, 1/4, 1000)

In [8]:
# Environment shared by every agent in the population.
# NOTE(review): AGENT.play_game unpacks four values from env.step() and reads info['ale.lives'];
# confirm the installed gym version still provides that interface for this environment id.
import gym
env = gym.make('KungFuMaster-ram-v0')

In [None]:
# Run generations until the kernel is interrupted; there is no stopping condition.
# Positional arguments after env: unroll_depth=20, epochs=10, batch_size=256, render=True.
while True:
    population.run_population(env, 20, 10, 256, True)

BEGIN RUNNING POPULATION | GENERATION 0
AGENT_15 | STEP 894 | SCORE 0.0 | AGE 0 | MEMBER 40/40 | GAME 1/1		1			
END RUNNING POPULATION | AVERAGE SCORE 805.0 | LOW SCORE 0.0 | HIGH SCORE 6200.0
BEGIN TRAINING POPULATION
TRAINING AGENT_15 | AGE 0 | BATCH 63/63 | EPOCH 10/10 | LOSS 50.41295623779297 | | MEMBER 10/100
END TRAINING POPULATION | AVG LOSS 51.10423895578536
BEGIN RUNNING POPULATION | GENERATION 1
AGENT_25 | STEP 1463 | SCORE 0.0 | AGE 0 | MEMBER 40/40 | GAME 1/1					
END RUNNING POPULATION | AVERAGE SCORE 1132.5 | LOW SCORE 0.0 | HIGH SCORE 6600.0
BEGIN TRAINING POPULATION
TRAINING AGENT_25 | AGE 0 | BATCH 70/70 | EPOCH 10/10 | LOSS 55.759639739990234 | | MEMBER 10/10
END TRAINING POPULATION | AVG LOSS 49.95564568601336
BEGIN RUNNING POPULATION | GENERATION 2
AGENT_6 | STEP 925 | SCORE 200.0 | AGE 0 | MEMBER 40/40 | GAME 1/1					
END RUNNING POPULATION | AVERAGE SCORE 1850.0 | LOW SCORE 0.0 | HIGH SCORE 9700.0
BEGIN TRAINING POPULATION
TRAINING AGENT_0 | AGE 0 | BATCH 81/81 | 

AGENT_18 | STEP 2071 | SCORE 5600.0 | AGE 5 | MEMBER 40/40 | GAME 1/1			
END RUNNING POPULATION | AVERAGE SCORE 5855.0 | LOW SCORE 2600.0 | HIGH SCORE 9900.0
BEGIN TRAINING POPULATION
TRAINING AGENT_6 | AGE 5 | BATCH 99/99 | EPOCH 10/10 | LOSS 40.568687438964844 | | MEMBER 10/10
END TRAINING POPULATION | AVG LOSS 44.27325942569309
BEGIN RUNNING POPULATION | GENERATION 23
AGENT_2 | STEP 2218 | SCORE 5500.0 | AGE 3 | MEMBER 40/40 | GAME 1/1				
END RUNNING POPULATION | AVERAGE SCORE 5240.0 | LOW SCORE 1000.0 | HIGH SCORE 9200.0
BEGIN TRAINING POPULATION
TRAINING AGENT_39 | AGE 3 | BATCH 99/99 | EPOCH 10/10 | LOSS 43.50441360473633 | | MEMBER 10/100
END TRAINING POPULATION | AVG LOSS 43.4912950296113
BEGIN RUNNING POPULATION | GENERATION 24
AGENT_24 | STEP 2258 | SCORE 6300.0 | AGE 5 | MEMBER 40/40 | GAME 1/1				
END RUNNING POPULATION | AVERAGE SCORE 5632.5 | LOW SCORE 2100.0 | HIGH SCORE 12100.0
BEGIN TRAINING POPULATION
TRAINING AGENT_13 | AGE 4 | BATCH 95/95 | EPOCH 10/10 | LOSS 44.61