In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
from gym_utils import AtariEnv
from gym_utils import AtariFrame

import numpy as np
import random

environment_name = "SpaceInvaders-v4"
typical_bad_game_frame_count = 1200
reward_frame_shift = -15

# environment_name = "Pong-v4"
# typical_bad_game_frame_count = 1100
# reward_frame_shift = -1

action_count = gym.make(environment_name).action_space.n

In [13]:
# define a pytorch model.  for now, accept a 210 x 160 greyscale image and output an array of actions


class AtariModel(nn.Module):

    def __init__(self, action_count, dropout=0.25):
        """
        Initialize the PyTorch AtariModel Module
        :param dropout: dropout to add in between LSTM/GRU layers
        """
        super(AtariModel, self).__init__()
        self.action_count = action_count
        
        # convolutional layer 1  (in_channels, out_channels, kernel_size, stride=1, padding=0)
        self.conv1 = nn.Conv2d(3, 32, 3, stride=2, padding=1)
        # convolutional layer 2
        self.conv2 = nn.Conv2d(32, 128, 3, stride=1, padding=1)
        # convolutional layer 3
        self.conv3 = nn.Conv2d(128, 512, 3, stride=1, padding=1)

        # max pooling layer
        self.maxpool = nn.MaxPool2d(2, 2)
        
        #then into an RNN
        #self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.n_layers = 2
        self.hidden_dim = 512
        self.lstm = nn.LSTM(512*10*13, self.hidden_dim, self.n_layers, dropout=dropout, batch_first=True)  #10 frames???
        
        #self.fc1 = nn.Linear(8320*8, 512)  
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, action_count)
        
        self.dropout = nn.Dropout(0.25)

    def forward(self, img_array, hidden):
        """
        Forward propagation of the neural network
        :param img_array: The input img array to the neural network
        :return
        """
        ## Define forward behavior
        
        #print("forward received img_array of shape: {}".format(img_array.shape))
        
        batch_size = img_array.size(0)
        sequence_length = img_array.size(1)
        #print("batch_size: {}, sequence_length: {}".format(batch_size, sequence_length))
        
        #convolutional layers
        x = self.maxpool(F.relu(self.conv1(img_array)))
        x = self.maxpool(F.relu(self.conv2(x)))
        x = self.maxpool(F.relu(self.conv3(x)))  
        
        #print("x.shape after exiting last max pool: {}".format(x.shape)) #([1, 512, 10, 13])
        
        #flatten
        x = x.view(-1, 8320*8)  
        #print("x.view shape: {}".format(x.shape))  #torch.Size([1, 8320])
        
        out_contiguous = x.contiguous().view(-1, batch_size, 512*10*13)  #66560
        
        #into LTSM
        r_out, hidden = self.lstm(out_contiguous, hidden) 
        
        #fc layers
        #x = self.dropout(x)
        #x = F.relu(self.fc1(x))
        r_out = self.dropout(r_out)
        r_out = F.relu(self.fc2(r_out))
        out_fc = self.fc3(r_out)
        #print("out_fc.shape: {}".format(out_fc.shape))  #out_fc.shape: torch.Size([1, 1, 6])
        out_reshaped = out_fc.view(batch_size, -1, self.action_count)  # reshape to be batch_size first
        #print("out shape if full RNN output is: {}".format(out.shape))
        out = out_reshaped[:, -1] ##### get last batch of labels
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        '''
        Initialize the hidden state of an LSTM/GRU
        :param batch_size: The batch_size of the hidden state
        :return: hidden state of dims (n_layers, batch_size, hidden_dim)
        '''
        # Implement function
        
        # initialize hidden state with zero weights, and move to GPU if available
        
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
       
        return hidden


In [24]:
#play a game. feed each frame into the model and see what we get
def play_game(env_name, model, use_probability_based_action, max_frames=5000):
    model.eval()
    atari_env = AtariEnv(environment_name, reward_frame_shift)
    current_action = 0
    done = False
    frame_counter = 0
    
    hidden = model.init_hidden(5)
    
    while not done:
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        hidden = tuple([each.data for each in hidden])
    
        atari_frame = atari_env.step(current_action)
#         img_array = atari_frame.img_array
#         img_array = img_array.reshape((3,160,210))
#         img_array = img_array.reshape((1,3,160,210))
        img_array = get_play_batch(atari_env, 5)
        img_tensor = torch.from_numpy(img_array).float().cuda()
        output, hidden = model(img_tensor, hidden)
        #print("output.shape: {}".format(output.shape))
        action_array = output.detach().cpu().numpy()[0]
        
        if use_probability_based_action:
            choices = np.arange(0,6)
            action_array_softmax = softmax(action_array)
            probability_based_action = np.random.choice(choices, p=action_array_softmax)
            #print("{} - {}".format(np.argmax(action_array), probability_based_action))

            #### use probability_based_action
            atari_frame.action_array = action_array_softmax
            current_action = probability_based_action
        else:
            atari_frame.action_array = action_array
            current_action = np.argmax(action_array)

        done = atari_frame.done_bool
        frame_counter += 1
        if frame_counter > max_frames:
            break

    atari_env.close()
    return atari_env, hidden

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def get_play_batch(atari_env, batch_size):
    frame_batch = np.zeros((batch_size, 3, 160, 210))
    
    for ii in range(-batch_size, 0):
        
        this_frame_index = len(atari_env.frame_buffer) + ii
        if this_frame_index < 0:
            this_frame_index = 0
        
        atari_frame = atari_env.frame_buffer[this_frame_index]
        
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        frame_batch[ii + batch_size] = img_array
        
    return frame_batch

def get_train_batch(atari_env, batch_size):
    rand_arr = np.arange(len(atari_env.frame_buffer))
    np.random.shuffle(rand_arr)
    
    frame_batch = np.zeros((batch_size, 3, 160, 210))
    target_batch = np.zeros(batch_size)
    reward_batch = np.zeros(batch_size)
    
    for ii in range(batch_size):
        i = rand_arr[ii]
        atari_frame = atari_env.frame_buffer[i]
        reward = atari_frame.discounted_reward
        reward_batch[ii] = reward
        
        img_array = atari_frame.img_array
        img_array = img_array.reshape((3,160,210))
        frame_batch[ii] = img_array
        
        action_array = atari_frame.action_array
        train_action = np.argmax(action_array)
        target_batch[ii] = train_action
        
    return frame_batch, target_batch, reward_batch
    

def train(atari_env, model, optimizer, criterion):
    model.train()
    hidden = model.init_hidden(1)
    action_count = atari_env.env.action_space.n
    discounted_rewards = atari_env.get_discounted_rewards()
    frame_buffer = atari_env.frame_buffer
    action_tally = np.zeros(action_count)
    train_tally = np.zeros(action_count)
    
    reward_mean_shift = 0
    if len(discounted_rewards) > typical_bad_game_frame_count:
        sorted_rewards = np.sort(discounted_rewards)
        desired_median = sorted_rewards[typical_bad_game_frame_count//2]
#        discounted_rewards_mean = np.mean(discounted_rewards)
#        reward_mean_shift = (discounted_rewards_mean - desired_median)  #/4.0
        reward_mean_shift = -1.0 * desired_median
    print("shifting rewards by: {:.3f}".format(reward_mean_shift))
    
    total_loss = 0
    epochs = 25
    #for ii, reward_ii in enumerate(discounted_rewards):
    for i in range(epochs):
    
        #print("{}: {}".format(i, reward))
        optimizer.zero_grad()

        frame_batch, target_batch, reward_batch = get_batch(atari_env, 5)
        
        reward_batch = reward_batch + reward_mean_shift #shift rewards for long games
        #iterate through rewards, update targets as necessary
        for i_batch in range(len(reward_batch)):
            if reward_batch[i_batch] < 0:
                target_batch[i_batch] = target_batch[i_batch]+1
                if target_batch[i_batch] >= action_count:
                    target_batch[i_batch] = 0
                #target_batch[i_batch] = random.randint(0,action_count-1) #update to use min action
        
        img_tensor = torch.from_numpy(frame_batch).float().cuda()
        output, hidden = model(img_tensor, hidden)
        output_actions = np.sum(output.cpu().detach().numpy(), axis=0)
        #print("output_actions: {}".format(output_actions))
        #print("output: {}".format(output))

        target = torch.from_numpy(target_batch)
        target = target.long().cuda()
        #print("target: {}".format(target))
        
        loss = criterion(output, target)
        #print("loss: {}".format(loss))
        total_loss += loss
        
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        
        # perform a single optimization step (parameter update)
        optimizer.step()
    
    print("avg loss: {:.3f}".format(total_loss / epochs))
#    print("model action_tally: {}".format(action_tally))
#    print("train_tally:        {}".format(train_tally))


In [25]:
#new model

atari_model = AtariModel(action_count)
atari_model.cuda()

### loss function
atari_criterion = nn.CrossEntropyLoss()




In [26]:
### optimizer
atari_optimizer = optim.Adam(atari_model.parameters(), lr=0.0001)

for i in range(5):
    #play a game
    use_probability_based_action = i % 1 != 0
    atari_env, hidden = play_game(environment_name, atari_model, use_probability_based_action)

    #discounted_rewards = atari_env.get_discounted_rewards()
    #print()
    #print(discounted_rewards)
    print("\n{}) frames played: {}, score: {}".format(i, len(atari_env.frame_buffer), atari_env.get_total_score()))
    print("actions taken: {}".format(atari_env.get_actions_taken()))

    #train the model
    train(atari_env, atari_model, atari_optimizer, atari_criterion)


RuntimeError: Expected hidden[0] size (2, 1, 512), got (2, 5, 512)

In [38]:
from gym_utils import AtariEnv
from gym_utils import AtariFrame

#display frame
frame_num=600
discounted_rewards = atari_env.get_discounted_rewards()
discounted_rewards_mean_shifted = atari_env.get_discounted_rewards()

print("discounted_rewards mean: {}".format(np.mean(discounted_rewards)))
if len(discounted_rewards_mean_shifted) > typical_bad_game_frame_count:
    sorted_rewards = np.sort(discounted_rewards_mean_shifted)
    desired_median = sorted_rewards[typical_bad_game_frame_count//2]
    discounted_rewards_mean = np.mean(discounted_rewards_mean_shifted)
    reward_shift = (discounted_rewards_mean - desired_median)/2.0
    print("Shifting rewards by {}".format(reward_shift))
    discounted_rewards_mean_shifted = discounted_rewards_mean_shifted + reward_shift
    print("new discounted_rewards mean: {}".format(np.mean(discounted_rewards_mean_shifted)))

discounted_rewards mean: -1.569682037761635e-17


In [358]:
frame_num += 4
atari_frame = atari_env.frame_buffer[frame_num]

print("frame: {}, original reward: {:.3f}, shifted reward: {:.3f}".format(
    frame_num, discounted_rewards[frame_num], discounted_rewards_shifted[frame_num]))
atari_frame.show_frame()

IndexError: deque index out of range