In [1]:
# Import statements.
import numpy as np
import random as rand
import torch
import math
import matplotlib.pyplot as plt
from ExperimentManager import Experiment
from torch import nn
import torch.nn.functional as F
import torch.distributions as tdist
%matplotlib inline

In [2]:
# Start an experiment session through the project-local ExperimentManager module.
# The returned object is used further down for logging (manager.print) and for
# manager.save() after every generation.
# NOTE(review): start_experiment presumably prompts for a description and stores
# results under 'experimentsConvolutional/' — confirm against ExperimentManager.
manager = Experiment.start_experiment('experimentsConvolutional/', 'experiment', print)

Please enter a brief description of this experiment:
Cross loss test, Hypers: (20, 1, agent_hyper, 1/4, 1/4, 5)


In [3]:
# Generates network weights.
def generate_weights(starting_size, ending_size, weights_needed):
    """Return `weights_needed` layer widths evenly spaced between two sizes.

    The interval from `starting_size` to `ending_size` is cut into
    `weights_needed + 1` equal steps; the widths are the interior points,
    truncated to int, ordered from the one nearest `starting_size`.
    Neither endpoint is included in the result.
    """
    stride = (starting_size - ending_size) / (weights_needed + 1)
    return [
        int(starting_size - (stride * position))
        for position in range(1, weights_needed + 1)
    ]

In [4]:
# Maps input frames to hidden state.
class STATE_MAP(nn.Module):
    """Convolutional encoder that turns a stack of frames into a flat feature vector.

    The network has two stages, each made of an entry convolution followed by
    two further convolutions whose output is added back onto the entry
    activation, and then a final single-channel convolution. A 2x2 max-pool
    follows each stage and the final convolution.
    """

    def __init__(self, frame_count):
        """Create the layers; `frame_count` is the number of input channels."""
        super().__init__()
        same_size = dict(kernel_size=3, padding=1)

        # Stage one: entry convolution plus the two layers of its skip branch.
        self.conv1 = nn.Conv2d(frame_count, 16, **same_size)
        self.conv2 = nn.Conv2d(16, 16, **same_size)
        self.conv3 = nn.Conv2d(16, 16, **same_size)

        # Stage two: same layout, narrowed to four channels.
        self.conv4 = nn.Conv2d(16, 4, **same_size)
        self.conv5 = nn.Conv2d(4, 4, **same_size)
        self.conv6 = nn.Conv2d(4, 4, **same_size)

        # Collapse to one channel before flattening.
        self.conv7 = nn.Conv2d(4, 1, **same_size)

        # Shared 2x2, stride-2 max-pool.
        self.pool = nn.MaxPool2d(2, 2)

        # Kept as an attribute; forward() does not apply it.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Propagate a batch through the encoder and return one flat row per sample."""
        activate = F.relu
        stages = (
            (self.conv1, self.conv2, self.conv3),
            (self.conv4, self.conv5, self.conv6),
        )
        for entry, branch_first, branch_second in stages:
            x = activate(entry(x))
            branch = activate(branch_second(activate(branch_first(x))))
            # Skip connection, then downsample.
            x = self.pool(x + branch)

        x = self.pool(activate(self.conv7(x)))

        # Flatten everything except the batch dimension.
        return x.view(x.size(0), -1)

In [5]:
# Policy recommender network.
class POLICY_NET(nn.Module):
    """Policy network: a feature extractor, a stack of Linear layers, and one head per planned step.

    The hidden and output layers are held in ``nn.ModuleList`` containers so
    that they are registered sub-modules: they show up in ``parameters()`` and
    ``state_dict()`` and follow ``.to(device)``. (Plain Python lists are
    invisible to ``nn.Module``.) ``self.params`` is still provided for callers
    that hand it to an optimizer.
    """

    def __init__(self, conv_map, input_size, output_size, layer_count, output_count, t_device):
        """Build the network.

        Args:
            conv_map: module applied first in forward(); its output must have
                `input_size` features per sample. It is not moved to
                `t_device` here.
            input_size: feature width coming out of `conv_map`.
            output_size: width of every output head.
            layer_count: number of hidden Linear layers; their widths come
                from generate_weights(input_size, output_size, layer_count).
            output_count: number of output heads.
            t_device: device the Linear layers are placed on.
        """
        super().__init__()
        self.conv_map = conv_map
        self.t_device = t_device

        # Hidden stack: widths shrink from input_size towards output_size.
        hidden = []
        prev_weight = input_size
        for width in generate_weights(input_size, output_size, layer_count):
            hidden.append(nn.Linear(prev_weight, width))
            prev_weight = width
        self.hidden_layers = nn.ModuleList(hidden).to(self.t_device)

        # One independent head per output, all fed by the last hidden layer.
        self.output_layers = nn.ModuleList(
            [nn.Linear(prev_weight, output_size) for _ in range(output_count)]
        ).to(self.t_device)

        # Activations. Only sin (hidden layers) and sigmoid (inference heads)
        # are used by forward(); the others are kept as attributes.
        self.sigmoid = nn.Sigmoid()
        self.relu = F.relu
        self.sin = torch.sin
        self.softmax = nn.Softmax(dim=0)

        # Flat parameter list in the order hidden -> heads -> conv_map.
        self.params = []
        for h in self.hidden_layers:
            self.params += list(h.parameters())
        for o in self.output_layers:
            self.params += list(o.parameters())
        self.params += list(conv_map.parameters())

    def forward(self, x, train=False):
        """Run the network and return a list with one tensor per head.

        With ``train=False`` every head output is passed through a sigmoid;
        with ``train=True`` the raw head outputs (logits) are returned.
        """
        x = self.conv_map(x)
        for hidden in self.hidden_layers:
            x = self.sin(hidden(x))
        if train:
            return [head(x) for head in self.output_layers]
        return [self.sigmoid(head(x)) for head in self.output_layers]

In [6]:
# Preprocesses the game frame so the agent can use it.
def process_frame(frame):
    """Convert a raw game frame into the array the agent consumes.

    The frame is averaged over axis 2 and the result is multiplied by 1/255.
    """
    averaged = frame.mean(axis=2)
    return averaged * (1/255)

In [7]:
# Exploratory agent.
class AGENT:
    """Exploratory agent: plays an environment with a POLICY_NET and is trained
    by imitation on recorded (stacked frames, action queue) pairs."""

    def __init__(self, name, state_size, action_size, layer_count, step_size, learning_rate, gamma, stack_size, t_device, s_device):
        """Store the hyperparameters and build the network, optimizer and loss.

        Args:
            name: label shown in progress output.
            state_size: stored only; not read elsewhere in this class.
            action_size: number of discrete actions (width of every policy
                head; random actions are drawn from 0..action_size-1).
            layer_count: number of hidden Linear layers in the policy net.
            step_size: number of actions planned per forward pass (one policy
                head per planned action).
            learning_rate: learning rate for Adam.
            gamma: exploration threshold — a uniform draw in [0, 1] above
                gamma makes the agent act randomly instead of sampling the
                policy.
            stack_size: number of frames stacked as input channels.
            t_device: device used for the network and training tensors.
            s_device: stored only; not read elsewhere in this class.
        """
        self.name = name
        self.state_size = state_size
        self.action_size = action_size
        self.layer_count = layer_count
        self.step_size = step_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.stack_size = stack_size
        self.alpha = 0.1
        self.t_device = t_device
        self.s_device = s_device
        # Number of training rounds completed, starting at 1.
        self.age = 1
        # 520 is the flattened STATE_MAP output width expected by the first
        # Linear layer. NOTE(review): presumably 26*20 for a 210x160 frame
        # after three 2x2 pools — confirm if the frame size changes.
        self.policy_map = POLICY_NET(STATE_MAP(stack_size).to(self.t_device), 520, action_size, layer_count, step_size, t_device)
        self.optimizer = torch.optim.Adam(self.policy_map.params, lr=learning_rate)
        self.loss_func = nn.CrossEntropyLoss()

    def train_policy_network(self, inputs, outputs, extra_info='', batch_size=1024, epochs=50):
        """Fit the policy to recorded actions with cross-entropy.

        Args:
            inputs: sequence of tensors, concatenated along dim 0 per batch.
            outputs: one sequence of action indices per policy head;
                outputs[h][k] is the target of head h for inputs[k].
            extra_info: text appended to the progress line.
            batch_size: maximum number of samples per optimisation step.
            epochs: number of passes over the data.

        Returns:
            The mean of the per-step losses, or NaN when no optimisation step
            ran (no inputs or no epochs). `self.age` is incremented either way.
        """
        # Slice into batches. Only non-empty batches are produced, so a sample
        # count that is an exact multiple of batch_size no longer yields an
        # empty trailing batch (which made torch.cat fail).
        chunk = max(1, batch_size)
        batches = []
        for start in range(0, len(inputs), chunk):
            stop = start + chunk
            batches.append((
                inputs[start:stop],
                [outputs[head][start:stop] for head in range(self.step_size)],
            ))

        losses = []
        for e in range(epochs):
            for i, (batch_inputs, batch_targets) in enumerate(batches):
                self.optimizer.zero_grad()
                # Tensors are built per step rather than cached so that only
                # one batch lives on the training device at a time.
                stacked = torch.cat(batch_inputs, 0).to(self.t_device)
                targets = [torch.tensor(t, dtype=torch.long).to(self.t_device) for t in batch_targets]
                out = self.policy_map(stacked, train=True)
                # Total loss is the sum of the per-head cross-entropies.
                head_losses = [self.loss_func(o, t) for o, t in zip(out, targets)]
                loss = torch.stack(head_losses).sum()
                loss_value = loss.detach().cpu().numpy()
                print('\r{} | EPOCH {}/{} | BATCH {}/{} | CURRENT BATCH COUNT {} | LOSS {:0.4f} | AGE {} {}\t\t'.format(self.name, e+1, epochs, i+1, len(batches), len(batch_inputs), loss_value, self.age, extra_info), end='')
                loss.backward()
                self.optimizer.step()
                losses.append(loss_value)
        self.age += 1
        if not losses:
            return float('nan')
        return sum(losses)/len(losses)

    def _remember_frame(self, frames, state):
        """Append `state` to `frames`, keeping only the last `stack_size` entries."""
        frames.append(state)
        if self.stack_size > 0:
            # Only the newest stack_size frames are ever read, so older ones
            # are dropped instead of being held for the whole game.
            del frames[:-self.stack_size]

    def _stack_frames(self, frames, state, replicate):
        """Build the network input: a float tensor with a leading batch
        dimension of 1 and `stack_size` frame planes.

        With `replicate` true the current `state` fills every plane; otherwise
        the last `stack_size` entries of `frames` are used in order.
        """
        if replicate:
            plane = torch.from_numpy(state).float()
            planes = [plane for _ in range(self.stack_size)]
        else:
            planes = [torch.from_numpy(f).float() for f in frames[-self.stack_size:]]
        return torch.stack(planes).unsqueeze(0)

    def _plan_actions(self, tensor, shared_draw):
        """Query the policy once and return one action per policy head.

        Args:
            tensor: network input built by _stack_frames.
            shared_draw: when true a single uniform draw decides exploration
                for all heads; when false every head gets its own draw and a
                negative head output also forces a random action. This mirrors
                the two code paths the game loop has always had.
        """
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            dists = self.policy_map(tensor.to(self.t_device))
        shared = rand.uniform(0, 1) if shared_draw else None
        queue = []
        for d in dists:
            d = d[0]
            if shared_draw:
                explore = shared > self.gamma or bool(d.sum() == 0)
            else:
                explore = rand.uniform(0, 1) > self.gamma or bool(d.min() < 0) or bool(d.sum() == 0)
            if explore:
                queue.append(rand.randint(0, self.action_size - 1))
                continue
            if d.min() < 0:
                # Shift so the smallest weight is zero before sampling.
                d = d + d.min().abs()
            distribution = torch.distributions.categorical.Categorical(d)
            queue.append(int(distribution.sample()))
        return queue

    def play_game(self, env, render=False, extra_info=''):
        """Play one game and record what the agent saw and did.

        Args:
            env: environment whose step() returns
                (observation, reward, done, info) and which offers reset()
                and, when `render` is true, render().
            render: call env.render() after every step.
            extra_info: text appended to the progress line.

        Returns:
            (groups, score). `groups` is a list of
            (stacked-frames numpy array, action queue) pairs, one per fully
            executed action queue; `score` is the sum of rewards collected
            while executing queued actions.
        """
        done = False
        previous_state = None
        action = 0
        score = 0
        overall_step = 0
        step = 0
        first_step = True
        lives = 4
        action_queue = None
        groups = []
        frames = []
        env.reset()
        while not done:
            if first_step:
                # Start of the game or of a new life: take `action` (0) once
                # to obtain a frame, then plan from that single frame.
                # NOTE(review): the reward of this step is not added to
                # `score` — confirm that is intended.
                observation, reward, done, info = env.step(action)
                state = process_frame(observation)
                self._remember_frame(frames, state)
                tensor = self._stack_frames(frames, state, replicate=True)
                previous_state = tensor.detach().cpu().numpy()
                action_queue = self._plan_actions(tensor, shared_draw=False)
                first_step = False
            else:
                action = action_queue[step]
                observation, reward, done, info = env.step(action)
                score += reward
                state = process_frame(observation)
                self._remember_frame(frames, state)
                step += 1
                if step == self.step_size:
                    # Queue fully executed: record it and plan the next one.
                    groups.append((previous_state, action_queue))
                    step = 0
                    tensor = self._stack_frames(frames, state, replicate=len(frames) < self.stack_size)
                    previous_state = tensor.detach().cpu().numpy()
                    action_queue = self._plan_actions(tensor, shared_draw=True)
            print('\r{} | STEP {} | SCORE {} | AGE {} {}\t\t'.format(self.name, overall_step, score, self.age, extra_info), end = '')
            if render:
                env.render()
            # Environments that do not report 'ale.lives' are treated as
            # never losing a life.
            current_lives = info.get('ale.lives', lives)
            if current_lives != lives or done:
                # Life lost (or game over): drop the unfinished queue and
                # restart planning from a fresh first step.
                lives = current_lives
                previous_state = None
                action = 0
                step = 0
                first_step = True
                action_queue = None
            overall_step += 1
        return groups, score
        

In [8]:
# Population of agents that learn from each other.
class POPULATION:
    """Population of agents in which the lowest scorers are trained on games
    played by the highest scorers."""

    def __init__(self, population_size, number_of_attempts, agent_params, teach_percent, train_percent, age_cutoff):
        """Create the initial agents.

        Args:
            population_size: number of agents to create.
            number_of_attempts: games each agent plays per generation; the
                best-scoring game represents the agent.
            agent_params: positional arguments for AGENT after its name
                (the first nine entries are used).
            teach_percent: fraction of the population (top scorers) whose
                games become training examples.
            train_percent: fraction of the population (bottom scorers) that
                is trained each generation.
            age_cutoff: agents older than this may be replaced by a fresh
                agent before training.
        """
        self.population_size = population_size
        self.number_of_attempts = number_of_attempts
        self.teach_percent = teach_percent
        self.train_percent = train_percent
        self.population = []
        self.agents_created = population_size
        self.age_cutoff = age_cutoff
        self.agent_params = agent_params
        for i in range(population_size):
            self.population.append(self._spawn_agent(i))
        self.generation = 0

    def _spawn_agent(self, index):
        """Build a fresh agent named AGENT_<index> from the stored hyperparameters."""
        return AGENT(f'AGENT_{index}', *self.agent_params[:9])

    def convert_to_training_data(self, trajectories):
        """Split (state array, action queue) pairs into network inputs and per-head targets.

        Returns:
            (inputs, outputs): `inputs` is a list of float tensors, one per
            trajectory; `outputs[h]` lists the h-th action of every
            trajectory. An empty `trajectories` yields ([], []).
        """
        if not trajectories:
            return [], []
        step_size = len(trajectories[0][1])
        inputs = [torch.from_numpy(t[0]).float() for t in trajectories]
        outputs = [[t[1][i] for t in trajectories] for i in range(step_size)]
        return inputs, outputs

    def replace_agent(self, agent):
        """Swap `agent` for a newly created agent and return the newcomer.

        If no member of the population has the same name, nothing changes and
        `agent` itself is returned.
        """
        for i in range(len(self.population)):
            if agent.name == self.population[i].name:
                new_agent = self._spawn_agent(self.agents_created)
                self.population[i] = new_agent
                self.agents_created += 1
                return new_agent
        return agent

    def run_population(self, env, render=False):
        """Run one generation: every agent plays, then the weakest are trained.

        Progress and summaries are written through the notebook-level
        `manager` object and `print`.
        """
        new_pop = []
        total_score = 0
        high_score = None
        low_score = None
        manager.print('BEGIN RUNNING POPULATION | GENERATION {}'.format(self.generation))
        rand.shuffle(self.population)
        for agent in self.population:
            candidate_runs = []
            for g in range(self.number_of_attempts):
                groups, score = agent.play_game(env, render, f'| MEMBER {len(new_pop) + 1}/{len(self.population)} | GAME {g+1}/{self.number_of_attempts}')
                total_score += score
                candidate_runs.append((groups, score))
                if high_score is None or high_score < score:
                    high_score = score
                if low_score is None or low_score > score:
                    low_score = score
            # The agent is represented by its best game (first one on ties).
            best_groups, best_score = max(candidate_runs, key=lambda run: run[1])
            new_pop.append((agent, best_groups, best_score))
        print('\n')
        games_played = len(new_pop) * self.number_of_attempts
        average_score = total_score / games_played if games_played else float('nan')
        manager.print('END RUNNING POPULATION | AVERAGE SCORE {} | LOW SCORE {} | HIGH SCORE {}'.format(average_score, low_score, high_score))

        # Rank best-first, then take teachers from the top and learners from the bottom.
        new_pop.sort(key = lambda x: x[2], reverse=True)
        teach_count = int(len(new_pop) * self.teach_percent)
        train_count = int(len(new_pop) * self.train_percent)
        teach_pop = new_pop[:teach_count]
        # A count of zero must select nobody; new_pop[-0:] would be the whole list.
        train_pop = new_pop[-train_count:] if train_count > 0 else []

        examples = []
        for exp in teach_pop:
            examples += exp[1]
        # Each learner sees a random sample (with replacement) half the size
        # of the example pool.
        sample_size = int(len(examples) / 2)
        if sample_size == 0:
            # No usable examples: skip training rather than fail on an empty set.
            train_pop = []

        manager.print('BEGIN TRAINING POPULATION')
        losses = []
        for count, train in enumerate(train_pop):
            agent = train[0]
            # Old agents have a 50% chance of being replaced by a fresh one.
            if agent.age > self.age_cutoff and rand.uniform(0,1) > 0.5:
                agent = self.replace_agent(agent)
            trajectories = [examples[rand.randint(0, len(examples) - 1)] for _ in range(sample_size)]
            inputs, outputs = self.convert_to_training_data(trajectories)
            loss = agent.train_policy_network(inputs, outputs, extra_info=f'| MEMBER {count+1}/{len(train_pop)}', batch_size=128, epochs=50)
            losses.append(loss)
        print('\n')
        average_loss = sum(losses)/len(losses) if losses else float('nan')
        manager.print('END TRAINING POPULATION | AVG LOSS {}'.format(average_loss))
        self.generation += 1
        
        

In [9]:
# Hyperparameters handed to every AGENT, in the order of AGENT.__init__ after `name`.
# The training device falls back to the CPU when CUDA is unavailable, so the
# notebook also starts on machines without a GPU.
agent_hyper = (
    128,                                                         # state_size
    14,                                                          # action_size
    10,                                                          # layer_count
    1,                                                           # step_size
    0.001,                                                       # learning_rate
    0.95,                                                        # gamma
    5,                                                           # stack_size
    torch.device('cuda' if torch.cuda.is_available() else 'cpu'),  # t_device
    torch.device('cpu'),                                         # s_device
)
# POPULATION(population_size, number_of_attempts, agent_params, teach_percent, train_percent, age_cutoff)
population = POPULATION(20, 1, agent_hyper, 1/4, 1/4, 5)

In [10]:
# Create the environment that the population plays.
# NOTE(review): this import sits below the main import cell; AGENT.play_game
# reads info['ale.lives'], so an ALE/Atari environment is assumed here.
import gym
env = gym.make('KungFuMaster-v0')

In [None]:
# Run generations back to back: each pass plays every agent with rendering
# enabled, trains the selected agents, then calls manager.save().
# NOTE(review): there is no stop condition, so the loop only ends when the
# kernel is interrupted, and env.close() is never called — confirm intended.
while True:
    population.run_population(env, True)
    manager.save()

BEGIN RUNNING POPULATION | GENERATION 0
AGENT_18 | STEP 1800 | SCORE 700.0 | AGE 1 | MEMBER 20/20 | GAME 1/1		

END RUNNING POPULATION | AVERAGE SCORE 630.0 | LOW SCORE 200.0 | HIGH SCORE 1300.0
BEGIN TRAINING POPULATION
AGENT_3 | EPOCH 50/50 | BATCH 27/27 | CURRENT BATCH COUNT 120 | LOSS 2.6407 | AGE 1 | MEMBER 5/5			

END TRAINING POPULATION | AVG LOSS 2.6367879356808137
BEGIN RUNNING POPULATION | GENERATION 1
AGENT_11 | STEP 1337 | SCORE 900.0 | AGE 1 | MEMBER 20/20 | GAME 1/1		

END RUNNING POPULATION | AVERAGE SCORE 530.0 | LOW SCORE 0.0 | HIGH SCORE 1000.0
BEGIN TRAINING POPULATION
AGENT_10 | EPOCH 50/50 | BATCH 28/28 | CURRENT BATCH COUNT 126 | LOSS 2.6355 | AGE 1 | MEMBER 5/5		

END TRAINING POPULATION | AVG LOSS 2.6372040938309262
BEGIN RUNNING POPULATION | GENERATION 2
AGENT_13 | STEP 1323 | SCORE 700.0 | AGE 2 | MEMBER 20/20 | GAME 1/1			

END RUNNING POPULATION | AVERAGE SCORE 600.0 | LOW SCORE 100.0 | HIGH SCORE 1300.0
BEGIN TRAINING POPULATION
AGENT_10 | EPOCH 27/50 | BAT