In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
from gym_utils import AtariEnv
from gym_utils import AtariFrame

import numpy as np
import random

environment_name = "SpaceInvaders-v4"
typical_bad_game_frame_count = 700
reward_frame_shift = -15

# environment_name = "Pong-v4"
# typical_bad_game_frame_count = 1050
# reward_frame_shift = -1

In [2]:
# define a pytorch model.  for now, accept a 210 x 160 greyscale image and output an array of actions


class AtariModel(nn.Module):

    def __init__(self, action_count, dropout=0.25):
        """
        Initialize the PyTorch AtariModel Module
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(AtariModel, self).__init__()
        
        # convolutional layer 1  (in_channels, out_channels, kernel_size, stride=1, padding=0)
        self.conv1 = nn.Conv2d(3, 16, 3, stride=2, padding=1)
        # convolutional layer 2
        self.conv2 = nn.Conv2d(16, 32, 3, stride=1, padding=1)
        # convolutional layer 3
        self.conv3 = nn.Conv2d(32, 64, 3, stride=1, padding=1)

        # max pooling layer
        self.maxpool = nn.MaxPool2d(2, 2)
        
        self.fc1 = nn.Linear(8320, 512)    #64 * 14 * 14 = 12544
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, action_count)
        
        self.dropout = nn.Dropout(0.25)

    def forward(self, img_array):
        """
        Forward propagation of the neural network
        :param img_array: The input img array to the neural network
        :return
        """
        ## Define forward behavior
        
        #print("forward received img_array of shape: {}".format(img_array.shape))
        
        #convolutional layers
        x = self.maxpool(F.relu(self.conv1(img_array)))
        x = self.maxpool(F.relu(self.conv2(x)))
        x = self.maxpool(F.relu(self.conv3(x)))  
        
        #flatten
        x = x.view(-1, 8320)  
        #print("x.view shape: {}".format(x.shape))  #torch.Size([1, 8320])
        
        #fc layers
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    


In [20]:
#play a game. feed each frame into the model and see what we get
def play_game(env_name, model, max_frames=5000):
    model.eval()
    atari_env = AtariEnv(environment_name, reward_frame_shift)
    current_action = 0
    done = False
    frame_counter = 0
    
    while not done:
        atari_frame = atari_env.step(current_action)
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        img_array = img_array.reshape((1,3,160,210))
        img_tensor = torch.from_numpy(img_array).float().cuda()
        output = model(img_tensor)
        action_array = output.detach().cpu().numpy()[0]
        atari_frame.action_array = action_array
        current_action = np.argmax(action_array)
        #print("{} - {}".format(current_action, output.detach().cpu().numpy()[0]))
        done = atari_frame.done_bool
        frame_counter += 1
        if frame_counter > max_frames:
            break

    atari_env.close()
    return atari_env

def get_batch(atari_env, batch_size):
    rand_arr = np.arange(len(atari_env.frame_buffer))
    np.random.shuffle(rand_arr)
    
    frame_batch = np.zeros((batch_size, 3, 160, 210))
    target_batch = np.zeros(batch_size)
    reward_batch = np.zeros(batch_size)
    
    for ii in range(batch_size):
        i = rand_arr[ii]
        atari_frame = atari_env.frame_buffer[i]
        reward = atari_frame.discounted_reward
        reward_batch[ii] = reward
        
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        frame_batch[ii] = img_array
        
        action_array = atari_frame.action_array
        train_action = np.argmax(action_array)
        target_batch[ii] = train_action
        
    return frame_batch, target_batch, reward_batch
    

def train(atari_env, model, optimizer, criterion):
    model.train()
    action_count = atari_env.env.action_space.n
    discounted_rewards = atari_env.get_discounted_rewards()
    frame_buffer = atari_env.frame_buffer
    action_tally = np.zeros(action_count)
    train_tally = np.zeros(action_count)
    
    rand_arr = np.arange(len(discounted_rewards))
    np.random.shuffle(rand_arr)
    
    reward_mean_shift = 0
    if len(discounted_rewards) > typical_bad_game_frame_count:
        sorted_rewards = np.sort(discounted_rewards)
        desired_median = sorted_rewards[typical_bad_game_frame_count//2]
        discounted_rewards_mean = np.mean(discounted_rewards)
        reward_mean_shift = (discounted_rewards_mean - desired_median)/4.0
    print("shifting rewards by: {:.3f}".format(reward_mean_shift))
    
    total_loss = 0
    epochs = 25
    #for ii, reward_ii in enumerate(discounted_rewards):
    for i in range(epochs):
    
        #print("{}: {}".format(i, reward))
        optimizer.zero_grad()

        frame_batch, target_batch, reward_batch = get_batch(atari_env, 50)
        
        reward_batch = reward_batch + reward_mean_shift #shift rewards for long games
        #iterate through rewards, update targets as necessary
        for i_batch in range(len(reward_batch)):
            if reward_batch[i_batch] < 0:
                target_batch[i_batch] = random.randint(0,5) #update to use min action
        
        img_tensor = torch.from_numpy(frame_batch).float().cuda()
        output = model(img_tensor)
        output_actions = np.sum(output.cpu().detach().numpy(), axis=0)
        #print("output_actions: {}".format(output_actions))
        #print("output: {}".format(output))

        target = torch.from_numpy(target_batch)
        target = target.long().cuda()
        #print("target: {}".format(target))
        
        loss = criterion(output, target)
        #print("loss: {}".format(loss))
        total_loss += loss
        
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        
        # perform a single optimization step (parameter update)
        optimizer.step()
    
    print("avg loss: {:.3f}".format(total_loss / epochs))
#    print("model action_tally: {}".format(action_tally))
#    print("train_tally:        {}".format(train_tally))


In [5]:
#new model
action_count = gym.make(environment_name).action_space.n
atari_model = AtariModel(action_count)
atari_model.cuda()

### loss function
atari_criterion = nn.CrossEntropyLoss()

### optimizer
atari_optimizer = optim.Adam(atari_model.parameters(), lr=0.00001)


In [None]:
for i in range(25):
    #play a game
    atari_env = play_game(environment_name, atari_model)

    #discounted_rewards = atari_env.get_discounted_rewards()
    #print()
    #print(discounted_rewards)
    print("\n{}) frames played: {}, score: {}".format(i, len(atari_env.frame_buffer), atari_env.get_total_score()))

    #train the model
    train(atari_env, atari_model, atari_optimizer, atari_criterion)



0) frames played: 676, score: 245.0
shifting rewards by: 0.000
avg loss: 1.400

1) frames played: 770, score: 350.0
shifting rewards by: 0.064
avg loss: 1.427

2) frames played: 749, score: 235.0
shifting rewards by: 0.079
avg loss: 1.320

3) frames played: 767, score: 280.0
shifting rewards by: 0.098
avg loss: 1.294

4) frames played: 377, score: 85.0
shifting rewards by: 0.000
avg loss: 1.332

5) frames played: 1710, score: 770.0
shifting rewards by: 0.157
avg loss: 1.396

6) frames played: 690, score: 260.0
shifting rewards by: 0.000
avg loss: 1.391

7) frames played: 630, score: 220.0
shifting rewards by: 0.000
avg loss: 1.495

8) frames played: 1042, score: 480.0
shifting rewards by: 0.123
avg loss: 1.408

9) frames played: 381, score: 85.0
shifting rewards by: 0.000
avg loss: 1.250

10) frames played: 631, score: 185.0
shifting rewards by: 0.000
avg loss: 1.238

11) frames played: 916, score: 320.0
shifting rewards by: 0.117
avg loss: 1.317

12) frames played: 1099, score: 345.0

In [276]:
from gym_utils import AtariEnv
from gym_utils import AtariFrame

#display frame
frame_num=600
discounted_rewards = atari_env.get_discounted_rewards()
discounted_rewards_mean_shifted = atari_env.get_discounted_rewards()

print("discounted_rewards mean: {}".format(np.mean(discounted_rewards)))
if len(discounted_rewards_mean_shifted) > typical_bad_game_frame_count:
    sorted_rewards = np.sort(discounted_rewards_mean_shifted)
    desired_median = sorted_rewards[typical_bad_game_frame_count//2]
    discounted_rewards_mean = np.mean(discounted_rewards_mean_shifted)
    reward_shift = (discounted_rewards_mean - desired_median)/2.0
    print("Shifting rewards by {}".format(reward_shift))
    discounted_rewards_mean_shifted = discounted_rewards_mean_shifted + reward_shift
    print("new discounted_rewards mean: {}".format(np.mean(discounted_rewards_mean_shifted)))

discounted_rewards mean: 5.4898445367778163e-17
Shifting rewards by 0.48340174895687293
new discounted_rewards mean: 0.48340174895687305


In [358]:
frame_num += 4
atari_frame = atari_env.frame_buffer[frame_num]

print("frame: {}, original reward: {:.3f}, shifted reward: {:.3f}".format(
    frame_num, discounted_rewards[frame_num], discounted_rewards_shifted[frame_num]))
atari_frame.show_frame()

IndexError: deque index out of range