In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
from gym_utils import AtariEnv
from gym_utils import AtariFrame

import numpy as np
import random

environment_name = "SpaceInvaders-v4"
typical_bad_game_frame_count = 1200
reward_frame_shift = -15

# environment_name = "Pong-v4"
# typical_bad_game_frame_count = 1100
# reward_frame_shift = -1

action_count = gym.make(environment_name).action_space.n

In [2]:
# define a pytorch model.  for now, accept a 210 x 160 greyscale image and output an array of actions


class AtariModel(nn.Module):

    def __init__(self, action_count, dropout=0.25):
        """
        Initialize the PyTorch AtariModel Module
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(AtariModel, self).__init__()
        
        # convolutional layer 1  (in_channels, out_channels, kernel_size, stride=1, padding=0)
        self.conv1 = nn.Conv2d(3, 32, 3, stride=2, padding=1)
        # convolutional layer 2
        self.conv2 = nn.Conv2d(32, 128, 3, stride=1, padding=1)
        # convolutional layer 3
        self.conv3 = nn.Conv2d(128, 512, 3, stride=1, padding=1)

        # max pooling layer
        self.maxpool = nn.MaxPool2d(2, 2)
        
        self.fc1 = nn.Linear(8320*8, 256)    #64 * 14 * 14 = 12544
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_count)
        
        self.dropout = nn.Dropout(0.20)

    def forward(self, img_array):
        """
        Forward propagation of the neural network
        :param img_array: The input img array to the neural network
        :return
        """
        ## Define forward behavior
        
        #print("forward received img_array of shape: {}".format(img_array.shape))
        
        #convolutional layers
        x = self.maxpool(F.relu(self.conv1(img_array)))
        x = self.maxpool(F.relu(self.conv2(x)))
        x = self.maxpool(F.relu(self.conv3(x)))  
        
        #flatten
        x = x.view(-1, 8320*8)  
        #print("x.view shape: {}".format(x.shape))  #torch.Size([1, 8320])
        
        #fc layers
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    


In [7]:
#play a game. feed each frame into the model and see what we get
def play_game(env_name, model, use_probability_based_action, max_frames=5000):
    model.eval()
    atari_env = AtariEnv(environment_name, reward_frame_shift)
    current_action = 0
    done = False
    frame_counter = 0
    
    while not done:
        atari_frame = atari_env.step(current_action)
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        img_array = img_array.reshape((1,3,160,210))
        img_tensor = torch.from_numpy(img_array).float().cuda()

        if frame_counter % 10 == 0:  #keep action for 5 steps
            output = model(img_tensor)
            action_array = output.detach().cpu().numpy()[0]
            
            if use_probability_based_action:
                choices = np.arange(0,6)
                action_array_softmax = softmax(action_array)

                if np.max(action_array_softmax) < 0.7:
                    probability_based_action = np.random.choice(choices, p=action_array_softmax)
                else:
                    probability_based_action = np.random.choice(choices)
                    action_array_softmax[probability_based_action] = 1.0

                #print("{} - {}".format(np.argmax(action_array), probability_based_action))

                #### use probability_based_action
                atari_frame.action_array = action_array_softmax
                current_action = probability_based_action
            else:
                atari_frame.action_array = action_array
                current_action = np.argmax(action_array)
                
            last_action_array = atari_frame.action_array
            last_action = current_action
        else:
            atari_frame.action_array = last_action_array
                
        #print(f"current_action: {current_action}, action_array: {action_array}")
        done = atari_frame.done_bool
        frame_counter += 1
        if frame_counter > max_frames:
            break

    atari_env.close()
    return atari_env

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def get_batch(atari_env, batch_size):
    rand_arr = np.arange(len(atari_env.frame_buffer))
    np.random.shuffle(rand_arr)
    
    frame_batch = np.zeros((batch_size, 3, 160, 210))
    target_batch = np.zeros(batch_size)
    reward_batch = np.zeros(batch_size)
    batch_tally = np.zeros(6)
    
    batch_counter = 0
    #for ii in range(batch_size):
    for ii in range(len(rand_arr)):
        i = rand_arr[ii]
        atari_frame = atari_env.frame_buffer[i]
        reward = atari_frame.discounted_reward
        if reward > 0:
            img_array = atari_frame.img_array
            img_array = img_array.reshape((3,160,210))
            
            action_array = atari_frame.action_array
            #train_action = np.argmax(action_array)  ###################3
            train_action = atari_frame.action_taken

            if train_action != np.argmax(batch_tally):
                reward_batch[batch_counter] = reward
                frame_batch[batch_counter] = img_array
                target_batch[batch_counter] = train_action
                batch_counter += 1
                batch_tally[train_action] += 1
        if batch_counter >= batch_size:
            break
    #print("batch_tally: {}".format(batch_tally))
    return frame_batch, target_batch, reward_batch
    

def train(atari_env, model, optimizer, criterion):
    model.train()
    action_count = atari_env.env.action_space.n
    discounted_rewards = atari_env.get_discounted_rewards()
    frame_buffer = atari_env.frame_buffer
    action_tally = np.zeros(action_count)
    train_tally = np.zeros(action_count)
    
    reward_mean_shift = 0
    if len(discounted_rewards) > typical_bad_game_frame_count:
        sorted_rewards = np.sort(discounted_rewards)
        desired_median = sorted_rewards[typical_bad_game_frame_count//2]
#        discounted_rewards_mean = np.mean(discounted_rewards)
#        reward_mean_shift = (discounted_rewards_mean - desired_median)  #/4.0
        reward_mean_shift = -1.0 * desired_median
    print("shifting rewards by: {:.3f}".format(reward_mean_shift))
    
    total_loss = 0
    epochs = 25
    #for ii, reward_ii in enumerate(discounted_rewards):
    for i in range(epochs):
    
        #print("{}: {}".format(i, reward))
        optimizer.zero_grad()

        frame_batch, target_batch, reward_batch = get_batch(atari_env, 50)
        
        reward_batch = reward_batch + reward_mean_shift #shift rewards for long games
        #iterate through rewards, update targets as necessary
        for i_batch in range(len(reward_batch)):
            if reward_batch[i_batch] < 0:
                target_batch[i_batch] = target_batch[i_batch]+1
                if target_batch[i_batch] >= action_count:
                    target_batch[i_batch] = 0
                #target_batch[i_batch] = random.randint(0,action_count-1) #update to use min action
        
        img_tensor = torch.from_numpy(frame_batch).float().cuda()
        output = model(img_tensor)
        output_actions = np.sum(output.cpu().detach().numpy(), axis=0)
        #print("output_actions: {}".format(output_actions))
        #print("output: {}".format(output))

        target = torch.from_numpy(target_batch)
        target = target.long().cuda()
        #print("target: {}".format(target))
        
        loss = criterion(output, target)
        #print("loss: {}".format(loss))
        total_loss += loss
        
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        
        # perform a single optimization step (parameter update)
        optimizer.step()
    
    print("avg loss: {:.3f}".format(total_loss / epochs))
#    print("model action_tally: {}".format(action_tally))
#    print("train_tally:        {}".format(train_tally))


In [4]:
#new model

atari_model = AtariModel(action_count)
atari_model.cuda()

### loss function
atari_criterion = nn.CrossEntropyLoss()




In [16]:
### optimizer
atari_optimizer = optim.Adam(atari_model.parameters(), lr=0.0001)

for i in range(5):
    #play a game
    use_probability_based_action = False #i % 2 != 0
    atari_env = play_game(environment_name, atari_model, use_probability_based_action)

    #discounted_rewards = atari_env.get_discounted_rewards()
    #print()
    #print(discounted_rewards)
    print("\n{}) frames played: {}, score: {}".format(i, len(atari_env.frame_buffer), atari_env.get_total_score()))
    print("actions taken: {}".format(atari_env.get_actions_taken()))

    #train the model
    #if use_probability_based_action:
    for y in range(10):
        train(atari_env, atari_model, atari_optimizer, atari_criterion)



0) frames played: 967, score: 285.0
actions taken: [  1. 160.   0.   0.   0. 806.]
shifting rewards by: 0.000
avg loss: 0.160
shifting rewards by: 0.000
avg loss: 0.083
shifting rewards by: 0.000
avg loss: 0.067
shifting rewards by: 0.000
avg loss: 0.048
shifting rewards by: 0.000
avg loss: 0.044
shifting rewards by: 0.000
avg loss: 0.023
shifting rewards by: 0.000
avg loss: 0.025
shifting rewards by: 0.000
avg loss: 0.022
shifting rewards by: 0.000
avg loss: 0.018
shifting rewards by: 0.000
avg loss: 0.020

1) frames played: 963, score: 285.0
actions taken: [  1. 130.   0.   0.   0. 832.]
shifting rewards by: 0.000
avg loss: 0.156
shifting rewards by: 0.000
avg loss: 0.061
shifting rewards by: 0.000
avg loss: 0.032
shifting rewards by: 0.000
avg loss: 0.030
shifting rewards by: 0.000
avg loss: 0.018
shifting rewards by: 0.000
avg loss: 0.013
shifting rewards by: 0.000
avg loss: 0.008
shifting rewards by: 0.000
avg loss: 0.007
shifting rewards by: 0.000
avg loss: 0.003
shifting reward

In [15]:
from gym_utils import AtariEnv
from gym_utils import AtariFrame

#display frame
frame_num=600
discounted_rewards = atari_env.get_discounted_rewards()
discounted_rewards_mean_shifted = atari_env.get_discounted_rewards()

print("discounted_rewards mean: {}".format(np.mean(discounted_rewards)))
if len(discounted_rewards_mean_shifted) > typical_bad_game_frame_count:
    sorted_rewards = np.sort(discounted_rewards_mean_shifted)
    desired_median = sorted_rewards[typical_bad_game_frame_count//2]
    discounted_rewards_mean = np.mean(discounted_rewards_mean_shifted)
    reward_shift = (discounted_rewards_mean - desired_median)/2.0
    print("Shifting rewards by {}".format(reward_shift))
    discounted_rewards_mean_shifted = discounted_rewards_mean_shifted + reward_shift
    print("new discounted_rewards mean: {}".format(np.mean(discounted_rewards_mean_shifted)))

discounted_rewards mean: 2.4975140097015822e-17


In [358]:
frame_num += 4
atari_frame = atari_env.frame_buffer[frame_num]

print("frame: {}, original reward: {:.3f}, shifted reward: {:.3f}".format(
    frame_num, discounted_rewards[frame_num], discounted_rewards_shifted[frame_num]))
atari_frame.show_frame()

IndexError: deque index out of range