In [1]:
# Import statements.
import numpy as np
import random as rand
import torch
import math
import matplotlib.pyplot as plt
from ExperimentManager import Experiment
from torch import nn
import torch.nn.functional as F
import torch.distributions as tdist
%matplotlib inline

In [2]:
# Start an experiment run. `manager` is the module-level logger used further down
# (HYDRA.collect_and_train calls manager.print and manager.save).
# NOTE(review): the recorded output shows this call prompting for a description on
# stdin, so it presumably blocks a non-interactive "Run All" — confirm.
manager = Experiment.start_experiment('experimentsHydra/', 'experiment', print)

Please enter a brief description of this experiment:
Switched end from softmax to relu, Hypers: (10**-4, 128, 64, 4, 2, 14, torch.device('cpu'), torch.device('cpu'))


In [3]:
# Generates the hidden-layer sizes (not weight tensors) stepped evenly between two sizes.
def generate_weights(starting_size, ending_size, weights_needed):
    """Return `weights_needed` layer widths stepped evenly between two sizes.

    The gap between `starting_size` and `ending_size` is divided into
    `weights_needed + 1` equal steps. The k-th entry of the result is
    `starting_size` minus k steps, truncated toward zero by `int()`.
    Neither endpoint itself appears in the returned list.
    """
    step = (starting_size - ending_size) / (weights_needed + 1)
    return [int(starting_size - (step * k)) for k in range(1, weights_needed + 1)]

In [4]:
# Action predictor based on two embedded states.
class ACTION_MAP(nn.Module):
    """Feed-forward head that maps an embedded state to per-action outputs.

    The head is a stack of `layer_count` hidden `nn.Linear` layers (widths
    chosen by `generate_weights`) with `torch.sin` activations, followed by a
    linear output layer of width `output_size`.

    Attributes:
        hidden_layers: `nn.ModuleList` of the hidden linear layers.
        output_layer: final linear layer.
        params: flat list of every trainable parameter of this head, in
            hidden-layer order followed by the output layer. Kept so callers
            can build an optimizer from `head.params`.
    """

    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        super().__init__()
        self.t_device = t_device
        layer_sizes = generate_weights(input_size, output_size, layer_count)
        # nn.ModuleList (rather than a plain list) registers the layers with
        # this module, so parameters(), state_dict() and .to() can see them.
        self.hidden_layers = nn.ModuleList()
        prev_size = input_size
        for size in layer_sizes:
            self.hidden_layers.append(nn.Linear(prev_size, size).to(self.t_device))
            prev_size = size
        self.output_layer = nn.Linear(prev_size, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=-1)
        self.sin = torch.sin
        self.relu = F.relu
        self.params = []
        for layer in self.hidden_layers:
            self.params += list(layer.parameters())
        self.params += list(self.output_layer.parameters())

    # Forward propagate input.
    def forward(self, x, train=False):
        """Run `x` through the head.

        Args:
            x: input tensor whose last dimension is `input_size`.
            train: when True, return the raw output-layer values (the caller
                in this file feeds them to a cross-entropy loss); when False,
                return those values passed through ReLU.

        Returns:
            Tensor whose last dimension is `output_size`.
        """
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        raw = self.output_layer(x)
        if train:
            return raw
        return self.relu(raw)

In [5]:
# RNN Hydra map.
class HYDRA_MAP(nn.Module):
    """Shared recurrent trunk that feeds several ACTION_MAP heads.

    Each input state is passed through the trunk (`pre_hidden_layers`, with
    `torch.sin` after every layer), the previous hidden state is added to the
    result, and that sum is both handed to the requested heads and carried
    forward as the next hidden state.

    Attributes:
        pre_hidden_layers: `nn.ModuleList` of the trunk's linear layers.
        heads: `nn.ModuleList` of `number_of_heads` ACTION_MAP instances.
        params: flat list of the trunk's parameters only. Head parameters are
            exposed separately through each head's own `params`.
    """

    # Constructor.
    def __init__(self, number_of_heads, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device):
        super().__init__()
        self.number_of_heads = number_of_heads
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.pre_hidden_count = pre_hidden_count
        self.post_hidden_count = post_hidden_count
        self.class_count = class_count
        self.t_device = t_device
        layer_sizes = generate_weights(input_size, hidden_size, pre_hidden_count)
        # nn.ModuleList registers the sub-modules so parameters(),
        # state_dict() and .to() include them; a plain list hides them.
        self.pre_hidden_layers = nn.ModuleList()
        prev_size = input_size
        for size in layer_sizes:
            self.pre_hidden_layers.append(nn.Linear(prev_size, size).to(t_device))
            prev_size = size
        self.pre_hidden_layers.append(nn.Linear(prev_size, hidden_size).to(t_device))
        self.heads = nn.ModuleList([
            ACTION_MAP(hidden_size, class_count, post_hidden_count, t_device)
            for _ in range(number_of_heads)
        ])
        self.sin = torch.sin
        self.softmax = nn.Softmax(dim=-1)
        self.params = []
        for layer in self.pre_hidden_layers:
            self.params += list(layer.parameters())

    # Forward propagation.
    def forward(self, states, heads, hidden=None, train=False, train_rnn=True):
        """Unroll the trunk over `states` and evaluate the selected heads.

        Args:
            states: iterable of input tensors, one per time step, each with
                last dimension `input_size`.
            heads: list of indices into `self.heads` to evaluate at each step.
            hidden: hidden state to start from; a zero vector of length
                `hidden_size` is used when None.
            train: forwarded to each head (raw outputs when True, ReLU'd
                outputs when False). Also selects the return shape.
            train_rnn: when False, the trunk output is detached before it
                reaches the heads, so no gradient flows back into the trunk.

        Returns:
            `outs` when `train` is True, otherwise `(outs, hidden)`.
            `outs[i][t]` is the output of head `heads[i]` at time step `t`;
            `hidden` is the hidden state after the last step.
        """
        outs = [[] for _ in heads]
        if hidden is None:
            hidden = torch.zeros(self.hidden_size).to(self.t_device)
        for x in states:
            for layer in self.pre_hidden_layers:
                x = self.sin(layer(x))
            # Out-of-place add: never mutates a tensor autograd may have saved.
            x = x + hidden
            if not train_rnn:
                x = x.detach()
            for slot, head_index in enumerate(heads):
                outs[slot].append(self.heads[head_index](x, train=train))
            hidden = x
        if train:
            return outs
        return outs, hidden

In [6]:
# RNN Hydra that plays the game.
class HYDRA:
    """Agent that plays a gym environment with a two-headed HYDRA_MAP policy.

    It collects (state, action, reward) steps while playing, cuts them into
    fixed-length windows, and trains one head at a time with cross-entropy
    against the actions that were taken.

    Attributes:
        policy_map: the HYDRA_MAP network (built with 2 heads).
        optimizers: one Adam optimizer per head, each over the shared trunk
            parameters plus that head's parameters.
        class_count: number of discrete actions; used for the random fallback.
        age: number of completed `train_head` calls that actually trained.
    """

    # Constructor.
    def __init__(self, learning_rate, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device, s_device):
        self.learning_rate = learning_rate
        # play_game reads self.class_count for its random-action fallback.
        self.class_count = class_count
        self.t_device = t_device
        self.s_device = s_device
        self.policy_map = HYDRA_MAP(2, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device)
        self.optimizers = [
            torch.optim.Adam(self.policy_map.params + head.params, lr=learning_rate) for head in self.policy_map.heads
        ]
        self.cross_loss = nn.CrossEntropyLoss()
        self.age = 0

    # Train the agent.
    # Input data should be a list of trajectories, which should be of form [[states], [actions]].
    def train_head(self, head, train_rnn, trajectories, unroll_depth, batch_size=32, epochs=50, extra_info=''):
        """Train one head on a set of fixed-length trajectories.

        Args:
            head: index of the head (and of its optimizer) to train.
            train_rnn: forwarded to the policy map; False stops gradients
                from reaching the shared trunk.
            trajectories: list of `[states, actions]` pairs, each holding at
                least `unroll_depth` entries. States are tensors, actions are
                integer class indices. The list itself is not modified.
            unroll_depth: number of time steps taken from each trajectory.
            batch_size: trajectories per batch (the last batch may be smaller).
            epochs: number of passes over all batches.
            extra_info: text appended to the progress line.

        Returns:
            Mean of the per-batch losses over every epoch, or NaN when
            nothing was trained (no trajectories, or `epochs` is 0).
        """
        optimizer = self.optimizers[head]
        # Shuffle a copy so the caller's list keeps its order.
        shuffled = list(trajectories)
        rand.shuffle(shuffled)
        step = max(1, batch_size)
        batches = []
        for start in range(0, len(shuffled), step):
            chunk = shuffled[start:start + step]
            # One stacked tensor per time step: index i gathers step i of every trajectory.
            states = [
                torch.stack([t[0][i] for t in chunk]).to(self.t_device)
                for i in range(unroll_depth)
            ]
            actions = [
                torch.tensor([t[1][i] for t in chunk], dtype=torch.long).to(self.t_device)
                for i in range(unroll_depth)
            ]
            batches.append([states, actions])
        losses = []
        for e in range(epochs):
            for b, (inputs, targets) in enumerate(batches):
                # Clear gradients every step; otherwise they accumulate across batches.
                optimizer.zero_grad()
                outputs = self.policy_map(inputs, heads=[head], train=True, train_rnn=train_rnn)[0]
                loss = 0
                for predicted, target in zip(outputs, targets):
                    loss += self.cross_loss(predicted, target)
                loss.backward()
                optimizer.step()
                losses.append(loss.detach().cpu().numpy())
                print('\rTRAINING HEAD {} | BATCH {}/{} | EPOCH {}/{} | LOSS {} {}'.format(head, b+1, len(batches), e+1, epochs, losses[-1], extra_info), end='')
        if not losses:
            # Nothing to average; avoid dividing by zero.
            return float('nan')
        self.age += 1
        return sum(losses) / len(losses)

    # Plays the game.
    def play_game(self, env, render=False, extra_info=''):
        """Play one full game and record what happened.

        The environment is stepped with the 4-tuple `env.step` API and
        `info['ale.lives']` is read to detect a lost life. Every life forms
        its own group; the hidden state is reset between lives.

        Args:
            env: gym-style environment.
            render: call `env.render()` after every step when True.
            extra_info: text appended to the progress line.

        Returns:
            `(groups, score, step)`: `groups` is a list with one entry per
            life, each a list of `(state_tensor, action, reward)` tuples;
            `score` is the summed reward; `step` is the number of steps taken.
        """
        done = False
        action = 0
        score = 0
        step = 0
        lives = 4
        hidden_state = None
        groups = []
        group = []
        env.reset()
        tensor = None
        while not done:
            observation, reward, done, info = env.step(action)
            if render:
                env.render()
            score += reward
            print('\rSTEP {} | SCORE {} | AGE {} {}\t\t'.format(step, score, self.age, extra_info), end = '')
            if tensor is not None:
                group.append((tensor, action, reward))
            step += 1
            # Need to update to support multi-head
            tensor = torch.from_numpy(observation / 255).float().to(self.t_device)
            # Inference only: without no_grad the carried hidden state would
            # keep the autograd graph of the whole episode alive.
            with torch.no_grad():
                policies, hidden_state = self.policy_map([tensor], hidden=hidden_state, heads=[0,1])
            policy = policies[0][0].to(self.s_device) + policies[1][0].to(self.s_device)
            if policy.min() < 0 or policy.sum() == 0:
                # No usable distribution: fall back to a uniformly random action.
                action = rand.randint(0, self.class_count - 1)
            else:
                distribution = torch.distributions.categorical.Categorical(policy)
                action = int(distribution.sample())
            if info['ale.lives'] != lives or done:
                groups.append(group)
                action = 0
                tensor = None
                lives = info['ale.lives']
                hidden_state = None
                group = []
        return groups, score, step

    # Cuts each run into sliding windows of unroll_depth consecutive steps.
    def prepare_data(self, groups, unroll_depth):
        """Build training trajectories from recorded runs.

        Args:
            groups: list of runs, each a list of `(state, action, reward)`.
            unroll_depth: window length.

        Returns:
            List of `[states, actions]` pairs, one per window of
            `unroll_depth` consecutive steps, including the window that ends
            on the run's last step. Runs shorter than `unroll_depth`
            contribute nothing.
        """
        trajectories = []
        for total_run in groups:
            for end in range(unroll_depth, len(total_run) + 1):
                window = total_run[end - unroll_depth:end]
                states = [entry[0] for entry in window]
                actions = [entry[1] for entry in window]
                trajectories.append([states, actions])
        return trajectories

    # Collects data from n games and trains on it.
    def collect_and_train(self, env, number_of_games, unroll_depth, current_head, render = False, top_fraction=0.3, batch_size=256, epochs=50):
        """Play several games, keep the best-scoring ones, and train one head on them.

        Progress and results are reported through the module-level `manager`.

        Args:
            env: gym-style environment passed to `play_game`.
            number_of_games: number of games to play (must be at least 1).
            unroll_depth: window length for `prepare_data` / `train_head`.
            current_head: index of the head to train.
            render: forwarded to `play_game`.
            top_fraction: fraction of games, ranked by score, kept for
                training; at least one game is always kept.
            batch_size: forwarded to `train_head`.
            epochs: forwarded to `train_head`.

        Raises:
            ValueError: if `number_of_games` is less than 1.
        """
        if number_of_games < 1:
            raise ValueError('number_of_games must be at least 1')
        all_groups = []
        for g in range(number_of_games):
            groups, score, step = self.play_game(env, render, f'| GAME {g+1}/{number_of_games}')
            all_groups.append((groups, score, step))
        scores = [played[1] for played in all_groups]
        total_score = sum(scores)
        low_score = min(scores)
        high_score = max(scores)
        all_groups.sort(key = lambda x: x[1], reverse=True)
        # Always keep at least one game so training has data with few games.
        keep = max(1, int(top_fraction * len(all_groups)))
        final_groups = []
        for played in all_groups[:keep]:
            final_groups += played[0]
        print()
        manager.print(f'FINISHED {number_of_games} GAMES | AVERAGE SCORE {total_score/number_of_games} | LOW SCORE {low_score} | HIGH SCORE {high_score}')
        manager.save()
        trajectory = self.prepare_data(final_groups, unroll_depth)
        loss = self.train_head(current_head, True, trajectory, unroll_depth, batch_size=batch_size, epochs=epochs, extra_info='')
        print()
        manager.print(f'FINISHED TRAINING HEAD {current_head} | LOSS {loss}')
        manager.save()

In [7]:
# Positional arguments, in HYDRA.__init__ order: learning_rate, input_size, hidden_size,
# pre_hidden_count, post_hidden_count, class_count, t_device, s_device.
hydra = HYDRA(10**-4, 128, 64, 4, 2, 14, torch.device('cpu'), torch.device('cpu'))

In [8]:
# NOTE(review): this import sits apart from the notebook's main import cell; consider moving it there.
import gym
# Environment handed to hydra.collect_and_train below.
# NOTE(review): presumably the RAM-observation variant whose observation length
# matches input_size=128 used for `hydra` — confirm.
env = gym.make('KungFuMaster-ram-v0')

In [None]:
# Alternate training between the heads indefinitely; interrupt the kernel to stop.
# Each round plays 20 games with an unroll depth of 20 and rendering enabled.
heads = [0,1]
index = 0
while True:
    hydra.collect_and_train(env, 20, 20, heads[index], True)
    # Advance to the next head, wrapping back to the first after the last.
    index = (index + 1) % len(heads)

STEP 1236 | SCORE 400.0 | AGE 0 | GAME 20/20		
FINISHED 20 GAMES | AVERAGE SCORE 335.0 | LOW SCORE 0.0 | HIGH SCORE 1200.0
TRAINING HEAD 0 | BATCH 30/30 | EPOCH 50/50 | LOSS 49.744197845458984
FINISHED TRAINING HEAD 0 | LOSS 50.28081092834473
STEP 1329 | SCORE 200.0 | AGE 1 | GAME 20/20		
FINISHED 20 GAMES | AVERAGE SCORE 210.0 | LOW SCORE 0.0 | HIGH SCORE 900.0
TRAINING HEAD 1 | BATCH 32/32 | EPOCH 50/50 | LOSS 45.352684020996094
FINISHED TRAINING HEAD 1 | LOSS 47.01629961490631
STEP 1393 | SCORE 0.0 | AGE 2 | GAME 20/20				
FINISHED 20 GAMES | AVERAGE SCORE 30.0 | LOW SCORE 0.0 | HIGH SCORE 200.0
TRAINING HEAD 0 | BATCH 26/26 | EPOCH 50/50 | LOSS 39.526718139648444
FINISHED TRAINING HEAD 0 | LOSS 39.7252116394043
STEP 1129 | SCORE 0.0 | AGE 3 | GAME 20/20		
FINISHED 20 GAMES | AVERAGE SCORE 0.0 | LOW SCORE 0.0 | HIGH SCORE 0.0
TRAINING HEAD 1 | BATCH 22/22 | EPOCH 50/50 | LOSS 36.187240600585944
FINISHED TRAINING HEAD 1 | LOSS 37.33968730579723
STEP 588 | SCORE 0.0 | AGE 4 | GAME 17/