In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
from gym_utils import AtariEnv
from gym_utils import AtariFrame

import numpy as np

environment_name = "SpaceInvaders-v4"

In [2]:
# define a pytorch model.  for now, accept a 210 x 160 greyscale image and output an array of actions


class AtariModel(nn.Module):

    def __init__(self, dropout=0.25):
        """
        Initialize the PyTorch AtariModel Module
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(AtariModel, self).__init__()
        
        # convolutional layer 1  (in_channels, out_channels, kernel_size, stride=1, padding=0)
        self.conv1 = nn.Conv2d(3, 16, 3, stride=2, padding=1)
        # convolutional layer 2
        self.conv2 = nn.Conv2d(16, 32, 3, stride=1, padding=1)
        # convolutional layer 3
        self.conv3 = nn.Conv2d(32, 64, 3, stride=1, padding=1)

        # max pooling layer
        self.maxpool = nn.MaxPool2d(2, 2)
        
        self.fc1 = nn.Linear(8320, 512)    #64 * 14 * 14 = 12544
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 6)
        
        self.dropout = nn.Dropout(0.25)

    def forward(self, img_array):
        """
        Forward propagation of the neural network
        :param img_array: The input img array to the neural network
        :return
        """
        ## Define forward behavior
        
        #print("forward received img_array of shape: {}".format(img_array.shape))
        
        #convolutional layers
        x = self.maxpool(F.relu(self.conv1(img_array)))
        x = self.maxpool(F.relu(self.conv2(x)))
        x = self.maxpool(F.relu(self.conv3(x)))  
        
        #flatten
        x = x.view(-1, 8320)  
        #print("x.view shape: {}".format(x.shape))  #torch.Size([1, 8320])
        
        #fc layers
        x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    


In [3]:
#play a game. feed each frame into the model and see what we get
def play_game(env_name, model):
    model.eval()
    atari_env = AtariEnv(environment_name)
    current_action = 0
    done = False

    while not done:
        atari_frame = atari_env.step(current_action)
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        img_array = img_array.reshape((1,3,160,210))
        img_tensor = torch.from_numpy(img_array).float().cuda()
        output = model(img_tensor)
        action_array = output.detach().cpu().numpy()[0]
        atari_frame.action_array = action_array
        current_action = np.argmax(action_array)
        #print("{} - {}".format(current_action, output.detach().cpu().numpy()[0]))
        done = atari_frame.done_bool

    atari_env.close()
    return atari_env


def train(atari_env, model, optimizer, criterion):
    model.train()
    discounted_rewards = atari_env.get_discounted_rewards()
    frame_buffer = atari_env.frame_buffer
    action_tally = np.zeros(6)
    train_tally = np.zeros(6)
    
    total_loss = 0
    for i, reward in enumerate(discounted_rewards):
        #print("{}: {}".format(i, reward))
        optimizer.zero_grad()
        
        #get frame from the frame buffer and run it through the model
        atari_frame = atari_env.frame_buffer[i]
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        img_array = img_array.reshape((1,3,160,210))
        img_tensor = torch.from_numpy(img_array).float().cuda()
        output = model(img_tensor)
        #print("train output: {}".format(output))
        
        #if the reward was positive, keep the same.  if not, choose lowest option
        action_array_from_model_in_training = output.detach().cpu().numpy()[0]
        action_array = atari_frame.action_array
        train_action = np.argmax(action_array)
        action_tally[train_action] += 1
        
        if reward < 0:
            train_action = np.argmin(action_array)
            #train_action = np.argsort(action_array)[3] #fourth highest
            
        if np.argmax(train_tally) == train_action and np.sum(train_tally) != 0:
            #keep things even to not introduce bias that will get it stuck on one action
            continue

        train_tally[train_action] += 1
        
        target = torch.empty(1, dtype=torch.int64)
        target[0] = int(train_action)
        target = target.cuda()
        
        loss = criterion(output, target)
        total_loss += loss
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
    
    print("avg loss: {:.3f}".format(total_loss / len(train_tally)))
    print("model action_tally: {}".format(action_tally))
    print("train_tally:        {}".format(train_tally))
    
    
    

In [4]:
#new model
atari_model = AtariModel()
atari_model.cuda()

### loss function
atari_criterion = nn.CrossEntropyLoss()

### optimizer
atari_optimizer = optim.Adam(atari_model.parameters(), lr=0.0001)


In [5]:
for i in range(50):
    #play a game
    atari_env = play_game(environment_name, atari_model)

    #discounted_rewards = atari_env.get_discounted_rewards()
    #print()
    #print(discounted_rewards)
    print("\n{}) frames played: {}, score: {}".format(i, len(atari_env.frame_buffer), atari_env.get_total_score()))

    #train the model
    for ii in range(1):
        train(atari_env, atari_model, atari_optimizer, atari_criterion)



0) frames played: 665, score: 25.0
avg loss: 13.238
model action_tally: [  0.  10.  28. 616.   0.  11.]
train_tally:        [0. 4. 1. 5. 6. 5.]

1) frames played: 643, score: 35.0
avg loss: 11.388
model action_tally: [  0.   0.   0. 439.   0. 204.]
train_tally:        [11.  0.  0. 11.  0. 11.]

2) frames played: 920, score: 0.0
no points granted. Setting all discounted rewards to -1
avg loss: 15.624
model action_tally: [920.   0.   0.   0.   0.   0.]
train_tally:        [ 0. 13. 16.  0. 16.  0.]

3) frames played: 506, score: 105.0
avg loss: 4.467
model action_tally: [  0. 336. 160.   0.  10.   0.]
train_tally:        [4. 4. 4. 0. 3. 0.]

4) frames played: 1076, score: 30.0
avg loss: 29.347
model action_tally: [   0.    8. 1060.    0.    8.    0.]
train_tally:        [21.  6.  7. 37.  5. 37.]

5) frames played: 690, score: 105.0
avg loss: 4.796
model action_tally: [  0.   0.   0. 398.   0. 292.]
train_tally:        [0. 0. 5. 3. 0. 5.]

6) frames played: 686, score: 105.0
avg loss: 9.9