In [1]:
# Import statements.
import numpy as np
import random as rand
import torch
import math
import matplotlib.pyplot as plt
from ExperimentManager import Experiment
from torch import nn
import torch.nn.functional as F
import torch.distributions as tdist
%matplotlib inline

In [2]:
manager = Experiment.start_experiment('experimentsHydra/', 'experiment', print)

Please enter a brief description of this experiment:
Switched end to relu, Hypers: (10**-4, 128, 64, 4, 2, 14, torch.device('cpu'), torch.device('cpu'))


In [3]:
# Generates network weights.
def generate_weights(starting_size, ending_size, weights_needed):
    difference = (starting_size - ending_size) / (weights_needed + 1)
    weights = []
    for i in range(weights_needed):
        weights.append(int(starting_size - (difference * (i+1))))
    return weights

In [4]:
# Action predictor based on two embedded states.
class ACTION_MAP(nn.Module):
    
    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        super().__init__()
        weights = generate_weights(input_size, output_size, layer_count)
        prev_weight = input_size
        self.t_device = t_device
        self.hidden_layers = []
        for w in weights:
            self.hidden_layers.append(nn.Linear(prev_weight, w).to(self.t_device))
            prev_weight = w
        self.output_layer = nn.Linear(prev_weight, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=-1)
        self.sin = torch.sin
        self.relu = F.relu
        self.params = []
        for h in self.hidden_layers:
            self.params += list(h.parameters())
        self.params += list(self.output_layer.parameters())
            
    # Forward propogate input.
    def forward(self, x, train=False):
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        if train:
            return self.output_layer(x)
        else:
            return self.relu(self.output_layer(x))

In [5]:
# RNN Hydra map.
class HYDRA_MAP(nn.Module):
    
    # Constructor.
    def __init__(self, number_of_heads, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device):
        super().__init__()
        self.number_of_heads = number_of_heads
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.pre_hidden_count = pre_hidden_count
        self.post_hidden_count = post_hidden_count
        self.class_count = class_count
        self.t_device = t_device
        weights = generate_weights(input_size, hidden_size, pre_hidden_count)
        self.pre_hidden_layers = []
        prev_weight = input_size
        for w in weights:
            self.pre_hidden_layers.append(nn.Linear(prev_weight, w).to(t_device))
            prev_weight = w
        self.pre_hidden_layers.append(nn.Linear(prev_weight, hidden_size).to(t_device))
        self.heads = []
        for _ in range(number_of_heads):
            self.heads.append(ACTION_MAP(hidden_size, class_count, post_hidden_count, t_device))
        self.sin = torch.sin
        self.softmax = nn.Softmax(dim=-1)
        self.params = []
        for layer in self.pre_hidden_layers:
            self.params += list(layer.parameters())
        
    # Forward propogation.
    def forward(self, states, heads, hidden=None, train=False, train_rnn=True):
        outs = [[] for _ in range(len(heads))]
        if hidden is None:
            hidden = torch.zeros(self.hidden_size).to(self.t_device)
        for x in states:
            for layer in self.pre_hidden_layers:
                x = self.sin(layer(x))
            x += hidden
            if not train_rnn:
                x = x.detach()
            for i in range(len(heads)):
                outs[i].append(self.heads[heads[i]](x, train=train))
            hidden = x
        if train:
            return outs
        else:
            return outs, hidden

In [6]:
# RNN Hydra that plays the game.
class DUAL_HYDRA:
    
    # Constructor.
    def __init__(self, learning_rate, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device, s_device):
        self.learning_rate = learning_rate
        self.t_device = t_device
        self.s_device = s_device
        self.policy_map = HYDRA_MAP(2, input_size, hidden_size, pre_hidden_count, post_hidden_count, class_count, t_device)
        self.greedy_optimizer = torch.optim.Adam(self.policy_map.params + self.policy_map.heads[0].params, lr=learning_rate)
        self.selfless_optimizer = torch.optim.Adam(self.policy_map.heads[1].params, lr=learning_rate)
        self.cross_loss = nn.CrossEntropyLoss()
        self.age = 0
        
    # Train the agent.
    # Input data should be a list of trajectories, which should be of form [[states], [actions]].
    def train_head(self, optimizer, head, train_rnn, trajectories, unroll_depth, batch_size=32, epochs=50, extra_info=''):
        batches = []
        batch = [[[] for _ in range(unroll_depth)], [[] for _ in range(unroll_depth)]]
        rand.shuffle(trajectories)
        count = 0
        for t in trajectories:
            for i in range(unroll_depth):
                batch[0][i].append(t[0][i])
                batch[1][i].append(t[1][i])
            count += 1
            if count >= batch_size:
                batch[0] = [torch.stack(state).to(self.t_device) for state in batch[0]]
                batch[1] = [torch.Tensor(actions).long().to(self.t_device) for actions in batch[1]]
                batches.append(batch)
                batch = [[[] for _ in range(unroll_depth)], [[] for _ in range(unroll_depth)]]
                count = 0
        if count > 0:
            batch[0] = [torch.stack(state).to(self.t_device) for state in batch[0]]
            batch[1] = [torch.Tensor(actions).long().to(self.t_device) for actions in batch[1]]
            batches.append(batch)
            batch = [[[] for _ in range(unroll_depth)], [[] for _ in range(unroll_depth)]]
            count = 0
        optimizer.zero_grad()
        losses = []
        for e in range(epochs):
            for b in range(len(batches)):
                batch = batches[b]
                inputs = batch[0]
                targets = batch[1]
                outputs = self.policy_map(inputs, heads=[head], train=True, train_rnn=train_rnn)[0]
                loss = 0
                for i in range(len(outputs)):
                    loss += self.cross_loss(outputs[i], targets[i])
                loss.backward()
                optimizer.step()
                losses.append(loss.detach().cpu().numpy())
                print('\rTRAINING HEAD {} | BATCH {}/{} | EPOCH {}/{} | LOSS {}'.format(head, b+1, len(batches), e+1, epochs, losses[-1], extra_info), end='')
        self.age += 1
        return sum(losses) / len(losses)
    
    # Plays the game.
    def play_game(self, env, render=False, extra_info=''):
        done = False
        action = 0
        score = 0
        step = 0
        lives = 4
        hidden_state = None
        groups = []
        group = []
        env.reset()
        tensor = None
        while not done:
            observation, reward, done, info = env.step(action)
            if render:
                env.render()
            print('\rSTEP {} | SCORE {} | AGE {} {}\t\t'.format(step, score, self.age, extra_info), end = '')
            score += reward
            if tensor is not None:
                group.append((tensor, action, reward))
            step += 1
            # Need to update to support multi-head
            tensor = torch.Tensor.float(torch.from_numpy(observation / 255)).to(self.t_device)
            policies, hidden_state = self.policy_map([tensor], hidden=hidden_state, heads=[0,1])
            policy = policies[0][0].to(self.s_device) + policies[1][0].to(self.s_device)
            if min(policy) < 0 or sum(policy) == 0:
                action = rand.randint(0, self.class_count - 1)
            else:
                distribution = torch.distributions.categorical.Categorical(policy)
                action = int(distribution.sample())
            if info['ale.lives'] != lives or done:
                groups.append(group)
                action = 0
                tensor = None
                lives = info['ale.lives']
                hidden_state = None
                group = []
        return groups, score, step
    
    # Sorts the trajectories into ones for the greedy head and non-greedy head.
    def prepare_data(self, groups, unroll_depth):
        greedy_trajectories = []
        non_greedy_trajectories = []
        for total_run in groups:
            index = unroll_depth
            total_reward = 0
            while index < len(total_run):
                actions = []
                states = []
                for group in total_run[index-unroll_depth:index]:
                    total_reward += group[2]
                    actions.append(group[1])
                    states.append(group[0])
                if total_reward > 0:
                    greedy_trajectories.append([states, actions])
                else:
                    non_greedy_trajectories.append([states, actions])
                index += 1
        return greedy_trajectories, non_greedy_trajectories
    
    # Collects data from n games and trains on it.
    def collect_and_train(self, env, number_of_games, unroll_depth, render = False):
        all_groups = []
        total_score = 0
        low_score = None
        high_score = None
        for g in range(number_of_games):
            groups, score, step = self.play_game(env, render, f'| GAME {g+1}/{number_of_games}')
            all_groups += groups
            total_score += score
            if low_score is None or score < low_score:
                low_score = score
            if high_score is None or score > high_score:
                high_score = score
        print()
        manager.print(f'FINISHED {number_of_games} GAMES | AVERAGE SCORE {total_score/number_of_games} | LOW SCORE {low_score} | HIGH SCORE {high_score}')
        manager.save()
        greedy, selfless = self.prepare_data(all_groups, unroll_depth)
        greedy_loss = self.train_head(self.greedy_optimizer, 0, True, greedy, unroll_depth, batch_size=256, epochs=50, extra_info='')
        print()
        manager.print(f'FINISHED TRAINING GREEDY HEAD | LOSS {greedy_loss}')
        manager.save()
        selfless_loss = self.train_head(self.selfless_optimizer, 1, False, selfless, unroll_depth, batch_size=256, epochs=50, extra_info='')
        print()
        manager.print(f'FINISHED TRAINING SELFLESS HEAD | LOSS {selfless_loss}')
        manager.save()

In [7]:
hydra = DUAL_HYDRA(10**-4, 128, 64, 4, 2, 14, torch.device('cpu'), torch.device('cpu'))

In [8]:
import gym
env = gym.make('KungFuMaster-ram-v0')

In [9]:
while True:
    hydra.collect_and_train(env, 20, 20, True)

STEP 1454 | SCORE 1200.0 | AGE 0 | GAME 20/20		
FINISHED 20 GAMES | AVERAGE SCORE 640.0 | LOW SCORE 100.0 | HIGH SCORE 1300.0
TRAINING HEAD 0 | BATCH 42/42 | EPOCH 50/50 | LOSS 50.253055572509766
FINISHED TRAINING GREEDY HEAD | LOSS 51.11092952546619
TRAINING HEAD 1 | BATCH 59/59 | EPOCH 50/50 | LOSS 50.171531677246094
FINISHED TRAINING SELFLESS HEAD | LOSS 50.257630197557354
STEP 1290 | SCORE 400.0 | AGE 2 | GAME 20/20		
FINISHED 20 GAMES | AVERAGE SCORE 395.0 | LOW SCORE 0.0 | HIGH SCORE 1600.0
TRAINING HEAD 0 | BATCH 26/26 | EPOCH 50/50 | LOSS 47.178356170654325
FINISHED TRAINING GREEDY HEAD | LOSS 47.26047481536865
TRAINING HEAD 1 | BATCH 62/62 | EPOCH 50/50 | LOSS 45.789443969726566
FINISHED TRAINING SELFLESS HEAD | LOSS 46.82081136395854
STEP 1337 | SCORE 300.0 | AGE 4 | GAME 20/20			
FINISHED 20 GAMES | AVERAGE SCORE 495.0 | LOW SCORE 0.0 | HIGH SCORE 1100.0
TRAINING HEAD 0 | BATCH 32/32 | EPOCH 50/50 | LOSS 44.140087127685554
FINISHED TRAINING GREEDY HEAD | LOSS 44.142608096599

TRAINING HEAD 1 | BATCH 65/65 | EPOCH 50/50 | LOSS 42.883655548095734
FINISHED TRAINING SELFLESS HEAD | LOSS 43.9048105867826
STEP 975 | SCORE 600.0 | AGE 44 | GAME 20/20				
FINISHED 20 GAMES | AVERAGE SCORE 545.0 | LOW SCORE 200.0 | HIGH SCORE 1500.0
TRAINING HEAD 0 | BATCH 32/32 | EPOCH 50/50 | LOSS 43.975051879882816
FINISHED TRAINING GREEDY HEAD | LOSS 43.94736580371857
TRAINING HEAD 1 | BATCH 59/59 | EPOCH 50/50 | LOSS 43.786132812526956
FINISHED TRAINING SELFLESS HEAD | LOSS 43.89768683094089
STEP 1299 | SCORE 700.0 | AGE 46 | GAME 20/20			
FINISHED 20 GAMES | AVERAGE SCORE 635.0 | LOW SCORE 100.0 | HIGH SCORE 1400.0
TRAINING HEAD 0 | BATCH 42/42 | EPOCH 50/50 | LOSS 43.980480194091845
FINISHED TRAINING GREEDY HEAD | LOSS 43.95015682038807
TRAINING HEAD 1 | BATCH 53/53 | EPOCH 50/50 | LOSS 43.849441528320314
FINISHED TRAINING SELFLESS HEAD | LOSS 43.892862672625846
STEP 1071 | SCORE 200.0 | AGE 48 | GAME 20/20			
FINISHED 20 GAMES | AVERAGE SCORE 635.0 | LOW SCORE 200.0 | HIGH S

TRAINING HEAD 0 | BATCH 24/24 | EPOCH 50/50 | LOSS 43.937763214111335
FINISHED TRAINING GREEDY HEAD | LOSS 43.95064398765564
TRAINING HEAD 1 | BATCH 56/56 | EPOCH 50/50 | LOSS 43.767303466796875
FINISHED TRAINING SELFLESS HEAD | LOSS 43.90226565633501
STEP 7 | SCORE 0.0 | AGE 88 | GAME 12/20		1/20		

AttributeError: 'DUAL_HYDRA' object has no attribute 'class_count'