In [1]:
# Import statements.
import numpy as np
import random as rand
import torch
import math
import matplotlib.pyplot as plt
from ExperimentManager import Experiment
from torch import nn
import torch.nn.functional as F
import torch.distributions as tdist
%matplotlib inline

In [2]:
manager = Experiment.start_experiment('experimentsHierarchy/', 'experiment', print)

Please enter a brief description of this experiment:
Hypers: (32, 40, 128, 14, 1, (6, 1, 1, 2), 10**-3, 0.9, 1/3, torch.device('cpu'), torch.device('cpu'))


In [3]:
# Generates network weights.
def generate_weights(starting_size, ending_size, weights_needed):
    difference = (starting_size - ending_size) / (weights_needed + 1)
    weights = []
    for i in range(weights_needed):
        weights.append(int(starting_size - (difference * (i+1))))
    return weights

In [4]:
# Siamese embedding network at the heart of this model, that finds the controllable state.
class EMBEDDING_MAP(nn.Module):
    
    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        super().__init__()
        weights = generate_weights(input_size, output_size, layer_count)
        prev_weight = input_size
        self.t_device = t_device
        self.hidden_layers = []
        for w in weights:
            self.hidden_layers.append(nn.Linear(prev_weight, w).to(self.t_device))
            prev_weight = w
        self.output_layer = nn.Linear(prev_weight, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.sin = torch.sin
        self.relu = F.relu
        self.params = []
        for h in self.hidden_layers:
            self.params += list(h.parameters())
        self.params += list(self.output_layer.parameters())
            
    # Forward propogate input.
    def forward(self, x, train=False):
        for hidden in self.hidden_layers:
            x = self.relu(hidden(x))
        return self.sin(self.output_layer(x))
    

In [5]:
# Action predictor based on two embedded states.
class ACTION_MAP(nn.Module):
    
    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        super().__init__()
        weights = generate_weights(input_size, output_size, layer_count)
        prev_weight = input_size
        self.t_device = t_device
        self.hidden_layers = []
        for w in weights:
            self.hidden_layers.append(nn.Linear(prev_weight, w).to(self.t_device))
            prev_weight = w
        self.output_layer = nn.Linear(prev_weight, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=-1)
        self.sin = torch.sin
        self.relu = F.relu
        self.params = []
        for h in self.hidden_layers:
            self.params += list(h.parameters())
        self.params += list(self.output_layer.parameters())
            
    # Forward propogate input.
    def forward(self, in_1, in_2, train=False):
        #x = torch.cat([in_1, in_2], -1)
        x = in_2 - in_1
        for hidden in self.hidden_layers:
            x = self.relu(hidden(x))
        if train:
            return self.output_layer(x)
        else:
            return self.softmax(self.output_layer(x))

In [6]:
# Rating network based on two embedded states.
class RATING_MAP(nn.Module):
    
    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        super().__init__()
        weights = generate_weights(input_size, output_size, layer_count)
        prev_weight = input_size
        self.t_device = t_device
        self.hidden_layers = []
        for w in weights:
            self.hidden_layers.append(nn.Linear(prev_weight, w).to(self.t_device))
            prev_weight = w
        self.output_layer = nn.Linear(prev_weight, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.sin = torch.sin
        self.relu = F.relu
        self.softmax = nn.Softmax(dim=-1)
        self.params = []
        for h in self.hidden_layers:
            self.params += list(h.parameters())
        self.params += list(self.output_layer.parameters())
            
    # Forward propogate input.
    def forward(self, in_1, in_2, final_activation=False):
        x = torch.cat([in_1, in_2], -1)
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        if not final_activation:
            return self.output_layer(x)
        else:
            return self.softmax(self.output_layer(x))

In [7]:
# For a given embedded state, outputs the desired next embedded state.
class STRATEGY_MAP(nn.Module):
    
    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        super().__init__()
        weights = generate_weights(input_size, output_size, layer_count)
        prev_weight = input_size
        self.t_device = t_device
        self.hidden_layers = []
        for w in weights:
            self.hidden_layers.append(nn.Linear(prev_weight, w).to(self.t_device))
            prev_weight = w
        self.output_layer = nn.Linear(prev_weight, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.sin = torch.sin
        self.relu = F.relu
        self.softmax = nn.Softmax(dim=-1)
        self.params = []
        for h in self.hidden_layers:
            self.params += list(h.parameters())
        self.params += list(self.output_layer.parameters())
            
    # Forward propogate input.
    def forward(self, x, train=False):
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        if train:
            return self.output_layer(x)
        else:
            return self.softmax(self.output_layer(x))

In [8]:
# Agent that plays a given game and attempts to achieve a maximum score.
# This implementation will only use one agent that contains both selector and strategy networks.
class AGENT:
    
    # Constructor.
    def __init__(self, embedding_size, strategy_count, state_size, action_size, stack_size, layer_counts, learning_rate, gamma, teach_percent, s_device, t_device):
        self.embedding_size = embedding_size
        self.strategy_count = strategy_count
        self.state_size = state_size
        self.action_size = action_size
        self.stack_size = stack_size
        self.layer_counts = layer_counts
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.teach_percent = teach_percent
        self.age = 0
        self.s_device = s_device
        self.t_device = t_device
        # Maps that form the model.
        self.embedding_map = EMBEDDING_MAP(state_size * stack_size, embedding_size, layer_counts[0], t_device)
        self.action_map = ACTION_MAP(embedding_size, action_size, layer_counts[1], t_device)
        self.greedy_rating_map = RATING_MAP(embedding_size + action_size, 2, layer_counts[2], t_device)
        self.strategies = [
            STRATEGY_MAP(embedding_size, action_size, layer_counts[3], t_device) for _ in range(strategy_count)
        ]
        # Optimizers for maps.
        self.action_embedding_optimizer = torch.optim.Adam(self.action_map.params + self.embedding_map.params, lr=learning_rate)
        self.greedy_rating_optimizer = torch.optim.Adam(self.greedy_rating_map.params, lr=learning_rate)
        self.strategy_optimizers = [
            torch.optim.Adam(s.params, lr=learning_rate) for s in self.strategies
        ]
        self.cross_loss = nn.CrossEntropyLoss()
        self.bce_loss = nn.BCEWithLogitsLoss()
        self.mse_loss = nn.MSELoss()
        self.alpha = 1/self.action_size
        self.exploitation_weight = 0.7
        self.exploration_weight = 0.4
        
    # Trains the action and embedding maps together, this is the only place the embedding map should be trained.
    # Groups should be of the form (state (i), state (i+1), action (i))
    def train_action_map(self, groups, batch_size=1024, epochs=50):
        rand.shuffle(groups)
        batches = []
        position = 0
        batch = ([],[],[])
        losses = []
        while position < len(groups):
            batch[0].append(groups[position][0])
            batch[1].append(groups[position][1])
            batch[2].append(groups[position][2])
            position += 1
            if len(batch[0]) >= batch_size:
                batches.append(batch)
                batch = ([],[],[])
        if len(batch) > 0:
            batches.append(batch)
        for e in range(epochs):
            for i in range(len(batches)):
                inputs_1 = torch.stack(batches[i][0])
                inputs_2 = torch.stack(batches[i][1])
                peer_inputs_1 = [t for t in batches[i][0]]
                rand.shuffle(peer_inputs_1)
                peer_inputs_1 = torch.stack(peer_inputs_1)
                peer_inputs_2 = [t for t in batches[i][1]]
                rand.shuffle(peer_inputs_2)
                peer_inputs_2 = torch.stack(peer_inputs_2)
                peer_outputs = [t for t in batches[i][2]]
                rand.shuffle(peer_outputs)
                peer_outputs = torch.Tensor(peer_outputs).long()
                outputs = torch.Tensor(batches[i][2]).long()
                out = self.action_map(self.embedding_map(inputs_1, True), self.embedding_map(inputs_2, True), True)
                peer_out = self.action_map(self.embedding_map(peer_inputs_1, True), self.embedding_map(peer_inputs_2, True), True)
                loss = self.cross_loss(out, outputs) - (self.alpha * self.cross_loss(peer_out, peer_outputs))
                print('\rACTION MAP | EPOCH {}/{} | BATCH {}/{} | CURRENT BATCH COUNT {} | LOSS {:0.4f}\t\t'.format(e+1, epochs, i+1, len(batches), len(batches[i][0]), loss.detach().cpu().numpy()), end='')
                loss.backward()
                self.action_embedding_optimizer.step()
                losses.append(loss.detach().cpu().numpy())
        return sum(losses)/len(losses)
    
    # Trains the greedy rating map.
    # Groups should be of the form (state (i), state (i+1), label (i))
    def train_greedy_rating_map(self, groups, batch_size=1024, epochs=50):
        rand.shuffle(groups)
        eye = torch.eye(self.action_size)
        batches = []
        position = 0
        batch = ([],[],[])
        losses = []
        while position < len(groups):
            batch[0].append(groups[position][0])
            batch[1].append(eye[groups[position][1]])
            batch[2].append(groups[position][2])
            position += 1
            if len(batch[0]) >= batch_size:
                batches.append(batch)
                batch = ([],[],[])
        if len(batch) > 0:
            batches.append(batch)
        for e in range(epochs):
            for i in range(len(batches)):
                inputs_1 = torch.stack(batches[i][0])
                inputs_2 = torch.stack(batches[i][1])
                peer_inputs_1 = [t for t in batches[i][0]]
                rand.shuffle(peer_inputs_1)
                peer_inputs_1 = torch.stack(peer_inputs_1)
                peer_inputs_2 = [t for t in batches[i][1]]
                rand.shuffle(peer_inputs_2)
                peer_inputs_2 = torch.stack(peer_inputs_2)
                peer_outputs = [t for t in batches[i][2]]
                rand.shuffle(peer_outputs)
                peer_outputs = torch.Tensor(peer_outputs).long()
                outputs = torch.Tensor(batches[i][2]).long()
                out = self.greedy_rating_map(self.embedding_map(inputs_1, True).detach(), inputs_2, False)
                peer_out = self.greedy_rating_map(self.embedding_map(peer_inputs_1, True).detach(), peer_inputs_2, False)
                loss = self.cross_loss(out, outputs) - (self.alpha * self.cross_loss(peer_out, peer_outputs))
                print('\rGREEDY RATING MAP | EPOCH {}/{} | BATCH {}/{} | CURRENT BATCH COUNT {} | LOSS {:0.4f}\t\t'.format(e+1, epochs, i+1, len(batches), len(batches[i][0]), loss.detach().cpu().numpy()), end='')
                loss.backward()
                self.greedy_rating_optimizer.step()
                losses.append(loss.detach().cpu().numpy())
        return sum(losses)/len(losses)
    
    # Trains the strategy at the passed index.
    # Groups should be of the form (state (i), state (i+1))
    def train_strategy_map(self, index, groups, batch_size=1024, epochs=50):
        rand.shuffle(groups)
        batches = []
        position = 0
        batch = ([],[])
        losses = []
        while position < len(groups):
            batch[0].append(groups[position][0])
            #batch[1].append(groups[position][1])
            batch[1].append(groups[position][2])
            position += 1
            if len(batch[0]) >= batch_size:
                batches.append(batch)
                batch = ([],[])
        if len(batch) > 0:
            batches.append(batch)
        for e in range(epochs):
            for i in range(len(batches)):
                inputs = torch.stack(batches[i][0])
                inputs = self.embedding_map(inputs).detach()
                peer_inputs = [t for t in batches[i][0]]
                rand.shuffle(peer_inputs)
                peer_inputs = torch.stack(peer_inputs)
                peer_inputs = self.embedding_map(peer_inputs).detach()
                peer_outputs = [t for t in batches[i][1]]
                rand.shuffle(peer_outputs)
                peer_outputs = torch.Tensor(peer_outputs).long()
                #peer_outputs = self.embedding_map(peer_outputs).detach()
                outputs = torch.Tensor(batches[i][1]).long()
                #outputs = self.embedding_map(outputs)
                out = self.strategies[index](inputs, True)
                peer_out = self.strategies[index](peer_inputs, True)
                loss = self.cross_loss(out, outputs) - (self.alpha * self.cross_loss(peer_out, peer_outputs))
                print('\rSTRATEGY {}/{} | EPOCH {}/{} | BATCH {}/{} | CURRENT BATCH COUNT {} | LOSS {:0.4f}\t\t'.format(index+1, len(self.strategies), e+1, epochs, i+1, len(batches), len(batches[i][0]), loss.detach().cpu().numpy()), end='')
                loss.backward()
                self.strategy_optimizers[index].step()
                losses.append(loss.detach().cpu().numpy())
        return sum(losses)/len(losses)
    
    # Returns labeled examples for a given set, based on a threshold.
    def label_groups(self, groups):
        groups.sort(key = lambda x: x[3], reverse=True)
        good_examples = groups[:int(len(groups) * self.teach_percent)]
        bad_examples = groups[int(len(groups) * self.teach_percent):]
        examples = []
        for group in good_examples:
            examples.append((group[0], group[2], 1))
        for group in bad_examples:
            examples.append((group[0], group[2], 0))
        rand.shuffle(examples)
        return examples
    
    # Plays a single game on a compatible enviroment with the provided strategy and returns the score and trajectories.
    def play_game_with_strategy(self, env, strategy, render=False, extra_info=''):
        done = False
        previous_tensor = None
        previous_action = None
        action = 0
        score = 0
        step = 0
        lives = 4
        groups = []
        env.reset()
        frames = []
        while not done:
            observation, reward, done, info = env.step(action)
            previous_action = action
            state = observation / 255
            frames.append(torch.Tensor.float(torch.from_numpy(state)))
            if len(frames) < self.stack_size:
                action = rand.randint(0, self.action_size - 1)    
            else:
                tensor = torch.cat(frames[-self.stack_size:], 0)
                embedding = self.embedding_map(tensor).detach()
                policy = self.strategies[strategy](embedding)
                if sum(policy) <= 0 or min(policy) < 0 or rand.uniform(0,1) < self.gamma:
                    action = rand.randint(0, self.action_size - 1)
                else:
                    distribution = torch.distributions.categorical.Categorical(policy)
                    action = (int(distribution.sample()))
                if previous_tensor is not None:
                    groups.append((previous_tensor, tensor, previous_action))
                previous_tensor = tensor
            if render:
                env.render()
            if info['ale.lives'] != lives or done:
                lives = info['ale.lives']
                previous_tensor = None
                previous_action = None
                action = 0
                frames = []
            step += 1
            score += reward
            print('\rSTEP {} | SCORE {} | {}\t\t'.format(step, score, extra_info), end = '')
        return groups, score
    
    # Runs the agent on the provided enviroment.
    def run(self, env, games, loop, render = False):
        running = True
        while running:
            if not loop:
                running = False
            # Game playing section.
            initial_groups = []
            total_score = 0
            high_score = None
            low_score = None
            if games is not None:
                for g in range(games):
                    group, score = self.play_game(env,render,f'GAME {g+1}/{games}')
                    initial_groups.append((group, score))
                    total_score += score
                    if high_score is None or high_score < score:
                        high_score = score
                    if low_score is None or low_score > score:
                        low_score = score
            else:
                for s in range(len(self.strategies)):
                    group, score = self.play_game_with_strategy(env,s,render,f'GAME {s+1}/{len(self.strategies)}')
                    initial_groups.append((group, score))
                    total_score += score
                    if high_score is None or high_score < score:
                        high_score = score
                    if low_score is None or low_score > score:
                        low_score = score
            avg_score = total_score / len(initial_groups)
            print('')
            manager.print(f'END PLAY | AVERAGE SCORE {avg_score} | LOW SCORE {low_score} | HIGH SCORE {high_score}')
            manager.save()
            # Training action and embedding maps section.
            action_groups = []
            for group in initial_groups:
                action_groups += group[0]
            a_e_loss = self.train_action_map(action_groups)
            print('')
            manager.print(f'END ACTION/EMBEDDING TRAINING | AVERAGE LOSS {a_e_loss}')
            manager.save()
            # Training greedy and exploratory rating maps section. 
            score_groups = [] # Groups should be of form (state (i), state (i+1), score)
            for group in initial_groups:
                for pairs in group[0]:
                    score_groups.append((pairs[0], pairs[1], pairs[2], group[1]))
            score_groups = self.label_groups(score_groups)
            g_r_loss = self.train_greedy_rating_map(score_groups)
            print('')
            manager.print(f'END GREEDY RATING TRAINING | AVERAGE LOSS {g_r_loss}')
            manager.save()
            # Training strategies section.
            strategies_training_groups = [] # Groups should be of form (state (i), state (i+1))
            eye = torch.eye(self.action_size)
            for group in initial_groups:
                for pairs in group[0]:
                    in_1 = self.embedding_map(pairs[0])
                    greedy_rating = self.greedy_rating_map(in_1, eye[pairs[2]], True).detach().cpu().numpy()
                    if np.argmax(greedy_rating) == 1:
                        strategies_training_groups.append((pairs[0], pairs[1], pairs[2]))
            initial_groups.sort(key = lambda x: x[1], reverse=True)
            teach_groups = initial_groups[:int(len(initial_groups) * self.teach_percent)]
            for group in teach_groups:
                for pairs in group[0]:
                    strategies_training_groups.append((pairs[0], pairs[1], pairs[2]))
            losses = []
            if len(strategies_training_groups) > 0:
                for i in range(len(self.strategies)):
                    training_groups = []
                    while len(training_groups) < len(strategies_training_groups) / 3:
                        index = rand.randint(0, len(strategies_training_groups) - 1)
                        training_groups.append(strategies_training_groups[index])
                    loss = self.train_strategy_map(i, training_groups)
                    losses.append(loss)
            avg_strat_loss = sum(losses)/len(losses) if len(losses) > 0 else 'NA'
            print('')
            manager.print(f'END STRATEGY TRAINING | AVERAGE LOSS {avg_strat_loss}')
            manager.save()
            self.age += 1
            

In [9]:
agent = AGENT(32, 40, 128, 14, 1, (6, 1, 1, 2), 10**-3, 0.9, 1/3, torch.device('cpu'), torch.device('cpu'))

In [10]:
import gym
env = gym.make('KungFuMaster-ram-v0')

In [None]:
while True:
    agent.run(env, None, False, True)

STEP 1301 | SCORE 1000.0 | GAME 40/40		
END PLAY | AVERAGE SCORE 540.0 | LOW SCORE 0.0 | HIGH SCORE 1400.0
ACTION MAP | EPOCH 50/50 | BATCH 52/52 | CURRENT BATCH COUNT 99 | LOSS 2.4489				
END ACTION/EMBEDDING TRAINING | AVERAGE LOSS 2.454339648760282
GREEDY RATING MAP | EPOCH 50/50 | BATCH 52/52 | CURRENT BATCH COUNT 99 | LOSS 1.0276				
END GREEDY RATING TRAINING | AVERAGE LOSS 0.9417984543167628
STRATEGY 40/40 | EPOCH 50/50 | BATCH 13/13 | CURRENT BATCH COUNT 552 | LOSS 2.9206			
END STRATEGY TRAINING | AVERAGE LOSS 2.4470399898290633
STEP 1015 | SCORE 900.0 | GAME 40/40			
END PLAY | AVERAGE SCORE 567.5 | LOW SCORE 0.0 | HIGH SCORE 1400.0
ACTION MAP | EPOCH 50/50 | BATCH 52/52 | CURRENT BATCH COUNT 90 | LOSS 2.5020				
END ACTION/EMBEDDING TRAINING | AVERAGE LOSS 2.4783575595342198
GREEDY RATING MAP | EPOCH 50/50 | BATCH 52/52 | CURRENT BATCH COUNT 90 | LOSS 1.0898				
END GREEDY RATING TRAINING | AVERAGE LOSS 1.207827760027005
STRATEGY 40/40 | EPOCH 50/50 | BATCH 11/11 | CURRENT B