In [1]:
# Import statements.
import numpy as np
import random as rand
import torch
import math
import matplotlib.pyplot as plt
from ExperimentManager import Experiment
from torch import nn
import torch.nn.functional as F
import torch.distributions as tdist
%matplotlib inline

In [2]:
# Start an experiment session; the returned manager is used later in this notebook for
# manager.print(...) logging and manager.save().
# NOTE(review): presumably 'experiments2/' is the output directory, 'experiment' a name
# prefix and `print` the logging callback -- confirm against ExperimentManager.
manager = Experiment.start_experiment('experiments2/', 'experiment', print)

Please enter a brief description of this experiment:
Went back to all relu, Hypers: (40, 1, agent_hyper, 1/8, 1/4, 5)


In [3]:
# Generates network weights.
def generate_weights(starting_size, ending_size, weights_needed):
    """Return `weights_needed` integer sizes evenly spaced between two sizes.

    The interval from `starting_size` to `ending_size` is cut into
    `weights_needed + 1` equal steps; the k-th returned value is
    `starting_size` minus k steps, truncated with `int`. Neither endpoint is
    included in the result.

    Args:
        starting_size: Size the interpolation starts from (excluded).
        ending_size: Size the interpolation heads towards (excluded).
        weights_needed: Number of intermediate sizes to produce.

    Returns:
        A list of `weights_needed` ints (empty when `weights_needed` is 0).
    """
    step = (starting_size - ending_size) / (weights_needed + 1)
    return [int(starting_size - (step * k)) for k in range(1, weights_needed + 1)]

In [4]:
# Policy recommender network.
class POLICY_NET(nn.Module):
    """Multi-head policy network.

    A stack of fully connected hidden layers (sizes come from
    `generate_weights`) feeds `output_count` independent linear heads, each
    producing `output_size` values.

    The layers are stored in `nn.ModuleList` containers so that PyTorch
    registers them as sub-modules; with plain Python lists,
    `parameters()`, `state_dict()` and `.to()` would not see any of them.
    The flat `params` list is still exposed for callers that build an
    optimizer from it.
    """

    # Constructor.
    def __init__(self, input_size, output_size, layer_count, output_count, t_device):
        """Build the network.

        Args:
            input_size: Length of the input vector.
            output_size: Number of values produced by each output head.
            layer_count: Number of hidden layers.
            output_count: Number of output heads.
            t_device: Torch device every layer is moved to.
        """
        super().__init__()
        self.t_device = t_device
        layer_sizes = generate_weights(input_size, output_size, layer_count)
        # Hidden layers are created before the heads so the order of random
        # weight initialisation is hidden-first, heads-second.
        self.hidden_layers = nn.ModuleList()
        prev_size = input_size
        for size in layer_sizes:
            self.hidden_layers.append(nn.Linear(prev_size, size).to(self.t_device))
            prev_size = size
        self.output_layers = nn.ModuleList(
            [nn.Linear(prev_size, output_size).to(self.t_device) for _ in range(output_count)]
        )
        self.sigmoid = nn.Sigmoid()
        self.relu = F.relu
        self.sin = torch.sin
        self.softmax = nn.Softmax(dim=0)
        # Flat parameter list: hidden layers first, then the heads.
        self.params = []
        for layer in self.hidden_layers:
            self.params += list(layer.parameters())
        for head in self.output_layers:
            self.params += list(head.parameters())

    # Forward propagate input.
    def forward(self, x, train=False):
        """Run `x` through the hidden stack and every output head.

        Args:
            x: Input tensor accepted by the first hidden layer.
            train: Accepted for interface compatibility; both modes compute
                exactly the same thing.

        Returns:
            A list with one tensor per output head, each passed through ReLU.
        """
        # Hidden layers use a sine activation.
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        return [self.relu(head(x)) for head in self.output_layers]

In [None]:
# Policy rating network.
class RATING_NET(nn.Module):
    """Policy rating network: a hidden stack followed by one linear output layer.

    Hidden layers are stored in an `nn.ModuleList` so PyTorch registers them
    as sub-modules (a plain Python list hides them from `parameters()`,
    `state_dict()` and `.to()`). The flat `params` list is still exposed.
    """

    # Constructor.
    def __init__(self, input_size, output_size, layer_count, t_device):
        """Build the network.

        Args:
            input_size: Length of the input vector.
            output_size: Number of values produced by the output layer.
            layer_count: Number of hidden layers.
            t_device: Torch device every layer is moved to.
        """
        super().__init__()
        self.t_device = t_device
        layer_sizes = generate_weights(input_size, output_size, layer_count)
        self.hidden_layers = nn.ModuleList()
        prev_size = input_size
        for size in layer_sizes:
            self.hidden_layers.append(nn.Linear(prev_size, size).to(self.t_device))
            prev_size = size
        self.output_layer = nn.Linear(prev_size, output_size).to(self.t_device)
        self.sigmoid = nn.Sigmoid()
        self.relu = F.relu
        self.sin = torch.sin
        self.softmax = nn.Softmax(dim=0)
        # Flat parameter list: hidden layers first, then the output layer.
        self.params = []
        for layer in self.hidden_layers:
            self.params += list(layer.parameters())
        self.params += list(self.output_layer.parameters())

    # Forward propagate input.
    def forward(self, x, train=False):
        """Run `x` through the hidden stack and the output layer.

        The previous implementation iterated over `self.output_layers`, an
        attribute this class never defines, so every call raised
        `AttributeError`. The single `self.output_layer` is used instead.

        Args:
            x: Input tensor accepted by the first hidden layer.
            train: Accepted for interface compatibility; it does not change
                the computation.

        Returns:
            A one-element list holding the ReLU-activated output tensor. The
            list wrapper keeps the list-shaped return value this method was
            written to produce.
        """
        # Hidden layers use a sine activation.
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        return [self.relu(self.output_layer(x))]

In [5]:
# Exploratory agent.
class AGENT:
    """Exploratory agent that plays an environment with a multi-head policy network.

    Each forward pass of the policy network yields `step_size` action
    distributions; the agent samples one action from each and executes them
    back to back before querying the network again.
    """

    # Constructor.
    def __init__(self, name, state_size, action_size, layer_count, step_size, learning_rate, gamma, stack_size, t_device, s_device):
        """Create the agent, its policy network and its optimizer.

        Args:
            name: Label used in progress output and to identify the agent.
            state_size: Length of a single observation vector.
            action_size: Number of discrete actions.
            layer_count: Number of hidden layers in the policy network.
            step_size: Number of actions planned per policy query (also the
                number of output heads of the policy network).
            learning_rate: Learning rate of the Adam optimizer.
            gamma: Exploitation threshold; while playing, a uniform draw in
                [0, 1] above this value triggers a random action.
            stack_size: Number of observations concatenated into one input.
            t_device: Torch device handed to the policy network.
            s_device: Stored on the instance; not used by this class.
        """
        self.name = name
        self.state_size = state_size
        self.action_size = action_size
        self.layer_count = layer_count
        self.step_size = step_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.stack_size = stack_size
        self.alpha = 0.1  # Stored on the instance; not used by this class.
        self.t_device = t_device
        self.s_device = s_device
        self.age = 1  # Incremented once per completed training call.
        self.policy_map = POLICY_NET(state_size * stack_size, action_size, layer_count, step_size, t_device)
        self.optimizer = torch.optim.Adam(self.policy_map.params, lr=learning_rate)
        self.loss_func = nn.CrossEntropyLoss()#nn.MSELoss()

    # Train policy network.
    def train_policy_network(self, inputs, outputs, extra_info='', batch_size=1024, epochs=50):
        """Fit the policy network to (state, action-sequence) examples.

        Args:
            inputs: Sequence of input tensors, one per example.
            outputs: `outputs[i][k]` is the integer action index that head
                `i` should predict for example `k`; the first `step_size`
                entries are used.
            extra_info: Text appended to the progress line.
            batch_size: Maximum number of examples per batch (values below 1
                are treated as 1).
            epochs: Number of passes over all batches.

        Returns:
            The mean of the per-batch losses over all epochs as a float, or
            NaN when there is nothing to train on (no inputs or no epochs),
            in which case the agent's age is left unchanged.
        """
        if len(inputs) == 0 or epochs <= 0:
            return float('nan')
        # Slice the examples into batches. Stepping over the data can never
        # create an empty trailing batch, which is what happened before when
        # len(inputs) was an exact multiple of batch_size.
        chunk = max(1, batch_size)
        batches = []
        for start in range(0, len(inputs), chunk):
            stop = start + chunk
            batch_inputs = torch.stack(list(inputs[start:stop]))
            batch_targets = [torch.Tensor(outputs[i][start:stop]).long() for i in range(self.step_size)]
            batches.append((batch_inputs, batch_targets))
        losses = []
        for e in range(epochs):
            for i, (batch_inputs, batch_targets) in enumerate(batches):
                # Clear gradients left over from the previous batch; without
                # this every step would use the sum of all earlier gradients.
                self.optimizer.zero_grad()
                out = self.policy_map(batch_inputs, train=True)
                # Total loss is the sum of the per-head losses.
                loss = None
                for j in range(len(batch_targets)):
                    head_loss = self.loss_func(out[j], batch_targets[j])
                    loss = head_loss if loss is None else loss + head_loss
                loss_value = loss.item()
                print('\r{} | EPOCH {}/{} | BATCH {}/{} | CURRENT BATCH COUNT {} | LOSS {:0.4f} | AGE {} {}\t\t'.format(self.name, e+1, epochs, i+1, len(batches), len(batch_inputs), loss_value, self.age, extra_info), end='')
                loss.backward()
                self.optimizer.step()
                losses.append(loss_value)
        self.age += 1
        return sum(losses)/len(losses)

    # Builds the stacked network input from the recorded frames.
    def _stack_frames(self, frames, repeat_latest):
        """Concatenate `stack_size` observations into one input tensor.

        When `repeat_latest` is true, or fewer than `stack_size` frames have
        been recorded, the most recent frame is repeated `stack_size` times;
        otherwise the last `stack_size` frames are concatenated in order.
        """
        if repeat_latest or len(frames) < self.stack_size:
            latest = torch.Tensor.float(torch.from_numpy(frames[-1]))
            return torch.cat([latest for _ in range(self.stack_size)], 0)
        recent = [torch.Tensor.float(torch.from_numpy(f)) for f in frames[-self.stack_size:]]
        return torch.cat(recent, 0)

    # Plays a game.
    def play_game(self, env, render=False, extra_info=''):
        """Play one episode and record the decisions that were carried out.

        The environment is driven through `env.reset()`, a 4-tuple
        `env.step(action)` and `info['ale.lives']`. After a reset or a lost
        life, action 0 is executed once to obtain an observation (its reward
        is not added to the score) and a fresh queue of `step_size` actions
        is planned. A (state, action queue) pair is recorded only when the
        whole queue was executed; a queue interrupted by a lost life is
        dropped.

        Args:
            env: Environment to play in.
            render: Whether to call `env.render()` after every step.
            extra_info: Text appended to the progress line.

        Returns:
            `(groups, score)`, where `groups` is a list of
            `(stacked_state_as_numpy_array, action_list)` pairs and `score`
            is the sum of the rewards of the queued actions.
        """
        done = False
        previous_state = None
        action = 0
        score = 0
        overall_step = 0
        step = 0
        first_step = True
        lives = 4
        action_queue = None
        groups = []
        frames = []
        env.reset()
        while not done:
            if first_step:
                observation, reward, done, info = env.step(action)
                frames.append(observation)
                tensor = self._stack_frames(frames, repeat_latest=True)
                previous_state = tensor.detach().cpu().numpy()
                # Inference only: no autograd graph is needed while playing.
                with torch.no_grad():
                    dists = self.policy_map(tensor)
                action_queue = []
                for d in dists:
                    # One exploration draw per head; a head with negative or
                    # all-zero output also falls back to a random action.
                    if rand.uniform(0,1) > self.gamma or d.min() < 0 or d.sum() == 0:
                        action_queue.append(rand.randint(0, self.action_size - 1))
                    else:
                        distribution = torch.distributions.categorical.Categorical(d)
                        action_queue.append(int(distribution.sample()))
                first_step = False
            else:
                action = action_queue[step]
                observation, reward, done, info = env.step(action)
                score += reward
                frames.append(observation)
                step += 1
                if step == self.step_size:
                    # The whole queue was executed: record it, then plan the next one.
                    groups.append((previous_state, action_queue))
                    step = 0
                    tensor = self._stack_frames(frames, repeat_latest=False)
                    previous_state = tensor.detach().cpu().numpy()
                    with torch.no_grad():
                        dists = self.policy_map(tensor)
                    action_queue = []
                    # A single exploration draw is shared by every head here.
                    try_rand = rand.uniform(0,1)
                    for d in dists:
                        if try_rand > self.gamma or d.sum() == 0:
                            action_queue.append(rand.randint(0, self.action_size - 1))
                        else:
                            if d.min() < 0:
                                # Shift so that the smallest weight becomes zero.
                                d = d + d.min().abs()
                            distribution = torch.distributions.categorical.Categorical(d)
                            action_queue.append(int(distribution.sample()))
            print('\r{} | STEP {} | SCORE {} | AGE {} {}\t\t'.format(self.name, overall_step, score, self.age, extra_info), end = '')
            if render:
                env.render()
            if info['ale.lives'] != lives or done:
                # Life lost (or episode over): forget the current plan and
                # start again from a fresh observation.
                lives = info['ale.lives']
                previous_state = None
                action = 0
                step = 0
                first_step = True
                action_queue = None
            overall_step += 1
        return groups, score
        

In [6]:
# Population of agents that learn from each other.
class POPULATION:
    """Population of agents that learn from each other.

    Every generation each agent plays, the agents are ranked by their best
    score, and the lowest-ranked agents are trained on trajectories recorded
    by the highest-ranked ones.
    """

    # Constructor.
    def __init__(self, population_size, number_of_attempts, agent_params, teach_percent, train_percent, age_cutoff):
        """Create the initial population.

        Args:
            population_size: Number of agents to create.
            number_of_attempts: Games each agent plays per generation.
            agent_params: Sequence whose first nine items are passed, in
                order, as the AGENT constructor arguments after `name`.
            teach_percent: Fraction of the ranked population (from the top)
                whose trajectories become training examples.
            train_percent: Fraction of the ranked population (from the
                bottom) that gets trained.
            age_cutoff: Agents older than this may be replaced by a fresh
                agent before being trained.
        """
        self.population_size = population_size
        self.number_of_attempts = number_of_attempts
        self.teach_percent = teach_percent
        self.train_percent = train_percent
        self.population = []
        self.agents_created = population_size
        self.age_cutoff = age_cutoff
        self.agent_params = agent_params
        for i in range(population_size):
            self.population.append(self._new_agent(f'AGENT_{i}'))
        self.generation = 0

    # Builds a fresh agent from the stored hyperparameters.
    def _new_agent(self, name):
        """Return a new AGENT called `name` built from `self.agent_params`."""
        return AGENT(name, *self.agent_params[:9])

    # Converts a list of trajectories into valid inputs and outputs for training.
    def convert_to_training_data(self, trajectories):
        """Split (state, actions) pairs into network inputs and per-step targets.

        Args:
            trajectories: List of `(state_array, action_list)` pairs; the
                length of the first pair's action list defines the number of
                steps.

        Returns:
            `(inputs, outputs)`: `inputs` is a list of float tensors, and
            `outputs[i]` lists the i-th action of every trajectory. An empty
            `trajectories` list yields `([], [])`.
        """
        if not trajectories:
            return [], []
        step_size = len(trajectories[0][1])
        inputs = [torch.Tensor.float(torch.from_numpy(state)) for state, _ in trajectories]
        outputs = [[actions[i] for _, actions in trajectories] for i in range(step_size)]
        return inputs, outputs

    # Replaces the given agent with a new agent that is returned.
    def replace_agent(self, agent):
        """Swap `agent` (matched by name) for a fresh agent in the population.

        Returns:
            The new agent, or `agent` itself when no member has that name.
        """
        for i, member in enumerate(self.population):
            if agent.name == member.name:
                new_agent = self._new_agent(f'AGENT_{self.agents_created}')
                self.population[i] = new_agent
                self.agents_created += 1
                return new_agent
        return agent

    # Splits a ranked population into teachers and learners.
    def _select_teachers_and_learners(self, ranked):
        """Return `(top slice, bottom slice)` of a best-first ranked list.

        A learner count that rounds down to zero yields an empty list; a bare
        `ranked[-0:]` would instead select the entire population.
        """
        teach_count = int(len(ranked) * self.teach_percent)
        train_count = int(len(ranked) * self.train_percent)
        teach_pop = ranked[:teach_count]
        train_pop = ranked[-train_count:] if train_count > 0 else []
        return teach_pop, train_pop

    # Runs and trains the agents.
    def run_population(self, env, render=False):
        """Run one generation: play, rank, then train the weakest agents.

        Args:
            env: Environment handed to every agent's `play_game`.
            render: Whether games are rendered while playing.
        """
        new_pop = []
        total_score = 0
        high_score = None
        low_score = None
        manager.print('BEGIN RUNNING POPULATION | GENERATION {}'.format(self.generation))
        rand.shuffle(self.population)
        for agent in self.population:
            candidate_runs = []
            for g in range(self.number_of_attempts):
                groups, score = agent.play_game(env, render, f'| MEMBER {len(new_pop) + 1}/{len(self.population)} | GAME {g+1}/{self.number_of_attempts}')
                total_score += score
                candidate_runs.append((groups, score))
                if high_score is None or high_score < score:
                    high_score = score
                if low_score is None or low_score > score:
                    low_score = score
            if not candidate_runs:
                # No games were played (number_of_attempts < 1); nothing to rank.
                continue
            # Keep only the agent's best game (the first one on ties).
            best_groups, best_score = max(candidate_runs, key=lambda run: run[1])
            new_pop.append((agent, best_groups, best_score))
        print('\n')
        games_played = len(new_pop) * self.number_of_attempts
        average_score = total_score / games_played if games_played else float('nan')
        manager.print('END RUNNING POPULATION | AVERAGE SCORE {} | LOW SCORE {} | HIGH SCORE {}'.format(average_score, low_score, high_score))
        new_pop.sort(key = lambda x: x[2], reverse=True)
        teach_pop, train_pop = self._select_teachers_and_learners(new_pop)
        # Pool the recorded (state, actions) pairs of all teachers.
        examples = []
        for exp in teach_pop:
            examples += exp[1]
        manager.print('BEGIN TRAINING POPULATION')
        count = 0
        losses = []
        for train in train_pop:
            agent = train[0]
            # Old learners have a 50% chance of being replaced by a fresh agent.
            if agent.age > self.age_cutoff and rand.uniform(0,1) > 0.5:
                agent = self.replace_agent(agent)
            # Sample (with replacement) half as many examples as are available.
            trajectories = [examples[rand.randint(0, len(examples) - 1)] for _ in range(len(examples) // 2)]
            if trajectories:
                inputs, outputs = self.convert_to_training_data(trajectories)
                loss = agent.train_policy_network(inputs, outputs, extra_info=f'| MEMBER {count+1}/{len(train_pop)}', batch_size=1024, epochs=50)
                losses.append(loss)
            count += 1
        print('\n')
        # NaN signals that no agent was trained this generation.
        average_loss = sum(losses)/len(losses) if losses else float('nan')
        manager.print('END TRAINING POPULATION | AVG LOSS {}'.format(average_loss))
        self.generation += 1
        
        

In [7]:
# Positional AGENT hyperparameters, listed in constructor order (after `name`).
agent_hyper = (
    128,                  # state_size
    14,                   # action_size
    10,                   # layer_count
    2,                    # step_size
    0.001,                # learning_rate
    0.95,                 # gamma
    5,                    # stack_size
    torch.device('cpu'),  # t_device
    torch.device('cpu'),  # s_device
)
population = POPULATION(
    population_size=40,
    number_of_attempts=1,
    agent_params=agent_hyper,
    teach_percent=1/8,
    train_percent=1/4,
    age_cutoff=5,
)

In [8]:
# Environment the agents play in. AGENT.play_game relies on the 4-tuple
# `env.step` return value and on `info['ale.lives']` being present.
import gym
env = gym.make('KungFuMaster-ram-v0')

In [None]:
# Train indefinitely: run one generation (with rendering enabled), then save through
# the experiment manager. The loop has no exit condition; interrupt the kernel to stop.
while True:
    population.run_population(env, True)
    manager.save()

BEGIN RUNNING POPULATION | GENERATION 0
AGENT_19 | STEP 1783 | SCORE 2100.0 | AGE 1 | MEMBER 40/40 | GAME 1/1		

END RUNNING POPULATION | AVERAGE SCORE 872.5 | LOW SCORE 0.0 | HIGH SCORE 5000.0
BEGIN TRAINING POPULATION
AGENT_27 | EPOCH 50/50 | BATCH 3/3 | CURRENT BATCH COUNT 403 | LOSS 5.2057 | AGE 1 | MEMBER 10/10			

END TRAINING POPULATION | AVG LOSS 5.1997364384333284
BEGIN RUNNING POPULATION | GENERATION 1
AGENT_4 | STEP 1411 | SCORE 1200.0 | AGE 2 | MEMBER 40/40 | GAME 1/1			

END RUNNING POPULATION | AVERAGE SCORE 1150.0 | LOW SCORE 0.0 | HIGH SCORE 3800.0
BEGIN TRAINING POPULATION
AGENT_2 | EPOCH 50/50 | BATCH 3/3 | CURRENT BATCH COUNT 738 | LOSS 5.1905 | AGE 1 | MEMBER 10/10			

END TRAINING POPULATION | AVG LOSS 5.127672968228658
BEGIN RUNNING POPULATION | GENERATION 2
AGENT_1 | STEP 1424 | SCORE 1200.0 | AGE 2 | MEMBER 40/40 | GAME 1/1			

END RUNNING POPULATION | AVERAGE SCORE 1352.5 | LOW SCORE 0.0 | HIGH SCORE 3600.0
BEGIN TRAINING POPULATION
AGENT_23 | EPOCH 50/50 | BAT

AGENT_1 | STEP 2362 | SCORE 4400.0 | AGE 5 | MEMBER 40/40 | GAME 1/1				

END RUNNING POPULATION | AVERAGE SCORE 3765.0 | LOW SCORE 800.0 | HIGH SCORE 9100.0
BEGIN TRAINING POPULATION
AGENT_83 | EPOCH 50/50 | BATCH 4/4 | CURRENT BATCH COUNT 563 | LOSS 3.9835 | AGE 4 | MEMBER 10/10			

END TRAINING POPULATION | AVG LOSS 4.420437060236931
BEGIN RUNNING POPULATION | GENERATION 44
AGENT_67 | STEP 2908 | SCORE 6300.0 | AGE 3 | MEMBER 40/40 | GAME 1/1		

END RUNNING POPULATION | AVERAGE SCORE 3705.0 | LOW SCORE 1200.0 | HIGH SCORE 6900.0
BEGIN TRAINING POPULATION
AGENT_83 | EPOCH 50/50 | BATCH 4/4 | CURRENT BATCH COUNT 297 | LOSS 4.1297 | AGE 5 | MEMBER 10/10			

END TRAINING POPULATION | AVG LOSS 4.186454089641571
BEGIN RUNNING POPULATION | GENERATION 45
AGENT_81 | STEP 2787 | SCORE 5700.0 | AGE 3 | MEMBER 40/40 | GAME 1/1		

END RUNNING POPULATION | AVERAGE SCORE 3910.0 | LOW SCORE 600.0 | HIGH SCORE 6900.0
BEGIN TRAINING POPULATION
AGENT_92 | EPOCH 50/50 | BATCH 4/4 | CURRENT BATCH COUNT 