<img width="260" src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcO6ATtznWETiIO6GoFD2SpmQfgILeMbxsTJe4P3sJ2xZgQReu">

In [None]:
# Install the third-party packages: Pillow, tqdm, retrowrapper and gym-retro.
!pip install pillow
!pip install tqdm retrowrapper gym-retro
# Copy your ROM files into data/roms/ before running the import step below.

import retro
# Run gym-retro's ROM importer over the data/roms/ directory.
!python -m retro.import data/roms/

# Start the Game



In [None]:
from IPython import display
import numpy as np
import PIL.Image
import io
import retro
import time
import base64
from ipykernel.comm import Comm


def showarray(a, fmt='png'):
    """Encode an image array as base64 text.

    Params
    ======
        a (array_like): pixel data; it is cast to uint8 before encoding
        fmt (str): image format name handed to PIL's ``save``

    Returns the base64-encoded image bytes as an ASCII string, ready to be
    appended to a ``data:image/...;base64,`` URI.
    """
    encoded = io.BytesIO()
    # Image.save() writes into the buffer and returns None, so nothing is kept.
    PIL.Image.fromarray(np.uint8(a)).save(encoded, fmt)
    return base64.b64encode(encoded.getvalue()).decode('ascii')


nrplayers = 1
verbosity = 0

# `selected_game` is only assigned in a later cell of this notebook. Fall back
# to the same default here so this cell also works on a fresh top-to-bottom run
# instead of raising NameError; a value chosen earlier is left untouched.
try:
    selected_game
except NameError:
    selected_game = 'SonicTheHedgehog-Genesis'

env = retro.make(game=selected_game, record=False, players=nrplayers)
env.reset()


# Encode the first frame so the <img> element has something to show right away.
imagedata = showarray(env.render(mode='rgb_array'))

# Browser side: register a comm target that swaps the <img> source for every
# frame received from the kernel.
jsc = "Jupyter.notebook.kernel.comm_manager.register_target('gymscreen', function(comm, msg) { comm.on_msg(function(msg) { document.getElementById('gymscr').src = 'data:image/png;base64,'+msg.content.data.img_data; }); });"

imagehandle = display.display(display.HTML("<div><img id='gymscr' style='width:700px;height:520px;background-color:#000;' src='data:image/png;base64,"+imagedata+"'></div><script>"+jsc+"</script>"), display_id='gymscreen_container')

# Kernel side of the same comm channel; frames are pushed through my_comm.send().
my_comm = Comm(target_name='gymscreen', data={'img_data': imagedata})



print(env.action_space)
print((env.render(mode='rgb_array')).shape)


# Random-agent demo: play episode after episode with uniformly sampled actions,
# streaming frames to the notebook, until the kernel is interrupted.
try:
    while True:
        ob = env.reset()
        t = 0
        totrew = [0] * nrplayers
        while True:
            ac = env.action_space.sample()
            ob, rew, done, info = env.step(ac)
            t += 1
            # Every third step pushes the most recently encoded frame to the
            # browser; the other steps re-encode the current frame.
            if t % 3 == 0:
                my_comm.send({'img_data': imagedata})
            else:
                imagedata = showarray(env.render(mode='rgb_array'))
            if t % 10 == 0 and verbosity > 1:
                infostr = ''
                if info:
                    infostr = ', info: ' + ', '.join(['%s=%i' % (k, v) for k, v in info.items()])
                print(('t=%i' % t) + infostr)
            # With a single player the reward is a scalar; wrap it so the
            # per-player accounting below works for both cases.
            if nrplayers == 1:
                rew = [rew]
            for i, r in enumerate(rew):
                totrew[i] += r
                if verbosity > 0:
                    if r > 0:
                        print('t=%i p=%i got reward: %g, current reward: %g' % (t, i, r, totrew[i]))
                    if r < 0:
                        print('t=%i p=%i got penalty: %g, current reward: %g' % (t, i, r, totrew[i]))
            if done:
                try:
                    if verbosity >= 0:
                        # `nrplayers` is the player count used in this notebook;
                        # there is no `args` object here.
                        if nrplayers > 1:
                            endmsg = "done! total reward: time=%i, reward=%r" % (t, totrew)
                        else:
                            endmsg = "done! total reward: time=%i, reward=%d" % (t, totrew[0])
                        # Show the summary that was built above.
                        print(endmsg)
                        input("press enter to continue")
                        # The environment stays open here: the outer loop
                        # resets it for the next episode. It is closed on
                        # EOF or on interrupt below.
                        print()
                    else:
                        input("")
                except EOFError:
                    env.close()
                    exit(0)
                break
except KeyboardInterrupt:
    env.close()
    exit(0)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Two-stage convolutional network that scores every action for a state."""

    def __init__(self, state_size, action_size, seed, fc1_units=560):
        """Create the layers and seed torch's random number generator.

        Params
        ======
            state_size (int): number of input channels of the state tensor
            action_size (int): number of action values produced
            seed (int): value passed to torch.manual_seed
            fc1_units (int): width of the hidden fully connected layer
        """
        super().__init__()
        self.seed = torch.manual_seed(seed)

        def conv_stage(conv, channels):
            # conv -> batch norm -> ReLU -> 4x4 max pool -> dropout
            return nn.Sequential(
                conv,
                nn.BatchNorm2d(channels),
                nn.ReLU(),
                nn.MaxPool2d(4),
                nn.Dropout(0.3),
            )

        first_conv = nn.Conv2d(state_size, 128, kernel_size=5, stride=2, padding=2)
        self.layer1 = conv_stage(first_conv, 128)
        second_conv = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=2)
        self.layer2 = conv_stage(second_conv, 256)

        # The flattened size is hard-coded: 256 channels over a 3x5 map.
        flat_features = 3 * 5 * 256
        self.fc1 = nn.Linear(flat_features, fc1_units)
        self.fc2 = nn.Linear(fc1_units, action_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, state):
        """Map a batch of states to a batch of action values."""
        features = self.layer2(self.layer1(state))
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(self.dropout(hidden))

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import namedtuple, deque

import torch
import torch.nn.functional as F
import torch.optim as optim

# Hyperparameters read by Agent (and passed on to its ReplayBuffer).
BUFFER_SIZE = int(400)  # replay buffer size (maximum number of stored experiences)
BATCH_SIZE = 40         # minibatch size drawn from the buffer per learning step
GAMMA = 0.999            # discount factor
TAU = 1e-3              # interpolation factor for the soft update of target parameters
LR = 0.005              # learning rate of both optimizers
UPDATE_EVERY = 20       # learn once every UPDATE_EVERY calls to Agent.step

# Use the first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Agent():
    """Interacts with and learns from the environment.

    The agent keeps two separate Q-learning streams, each with a local and a
    target network: a "positive" stream trained on the first batch returned by
    the replay buffer and a "negative" stream trained on the second one.
    Actions are chosen from the positive values minus the negative values,
    floored at zero.
    """

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state (input channels of the networks)
            action_size (int): dimension of each action (number of discrete actions)
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

        # Q-Networks: all four are built with the same seed.
        self.positiveqs_local = QNetwork(state_size, action_size, seed).to(device)
        self.positiveqs_target = QNetwork(state_size, action_size, seed).to(device)
        self.negativeqs_local = QNetwork(state_size, action_size, seed).to(device)
        self.negativeqs_target = QNetwork(state_size, action_size, seed).to(device)
        # The positive stream is optimised with SGD, the negative one with Adam.
        self.optimizer_pos = optim.SGD(self.positiveqs_local.parameters(), lr=LR)
        self.optimizer_neg = optim.Adam(self.negativeqs_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every UPDATE_EVERY calls.

        Params
        ======
            state: frame observed before acting
            action (int): index of the action taken
            reward (float): reward received for the transition
            next_state: frame observed after acting
            done (bool): whether the episode ended with this transition
        """
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    @staticmethod
    def _frame_to_channels(frame):
        """Stack a weighted grayscale plane on top of the three colour planes.

        `frame` is indexed as frame[:, :, c] for c in 0..2; the result has the
        four planes (grayscale, channel 0, channel 1, channel 2) on axis 0.
        """
        red, green, blue = frame[:, :, 0], frame[:, :, 1], frame[:, :, 2]
        gray = np.sum(
            [np.multiply(0.2989, red), np.multiply(0.5870, green), np.multiply(0.1140, blue)],
            axis=0,
        )
        return np.stack([gray, red, green, blue])

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection

        Returns the index of the chosen action.
        """
        # Network input: grayscale plus the three colour planes, as a batch of one.
        state = torch.from_numpy(self._frame_to_channels(state)).float().unsqueeze(0).to(device)
        self.positiveqs_local.eval()
        self.negativeqs_local.eval()
        with torch.no_grad():
            action_values_raw = self.positiveqs_local(state)
            action_values_adjust = self.negativeqs_local(state)
            # Floor at zero with a tensor op so this also works when the
            # networks live on a CUDA device.
            action_values = torch.clamp(action_values_raw - action_values_adjust, min=0)
        self.positiveqs_local.train()
        self.negativeqs_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            action_choice = action_values.cpu().numpy()
            # Break ties between equally valued actions at random.
            return np.random.choice(np.flatnonzero(action_choice == action_choice.max()))
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences: pair of (s, a, r, s', done) tensor tuples; the first
                feeds the positive stream, the second the negative stream
            gamma (float): discount factor
        """
        self._learn_stream(experiences[0], self.positiveqs_local, self.positiveqs_target,
                           self.optimizer_pos, gamma)
        self._learn_stream(experiences[1], self.negativeqs_local, self.negativeqs_target,
                           self.optimizer_neg, gamma)

        # ------------------- update target network ------------------- #
        self.soft_update(self.positiveqs_local, self.positiveqs_target,
                         self.negativeqs_local, self.negativeqs_target, TAU)

    def _learn_stream(self, batch, local_model, target_model, optimizer, gamma):
        """Run one double-DQN style update of `local_model`; skip an empty batch."""
        states, actions, rewards, next_states, dones = batch
        if len(next_states) == 0:
            return

        # The local model picks the next action ...
        best_actions = local_model(next_states).detach().argmax(1, keepdim=True)
        # ... and the target model supplies the value of that action.
        Q_targets_next = target_model(next_states).detach().gather(1, best_actions)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = local_model(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def soft_update(self, local_model_pos, target_model_pos, local_model_neg, target_model_neg, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model_pos (PyTorch model): positive-stream weights are copied from
            target_model_pos (PyTorch model): positive-stream weights are copied to
            local_model_neg (PyTorch model): negative-stream weights are copied from
            target_model_neg (PyTorch model): negative-stream weights are copied to
            tau (float): interpolation parameter
        """
        for local_model, target_model in ((local_model_pos, target_model_pos),
                                          (local_model_neg, target_model_neg)):
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so seed first and keep the value itself.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    @staticmethod
    def _frame_to_channels(frame):
        """Stack a weighted grayscale plane on top of the three colour planes.

        `frame` is indexed as frame[:, :, c] for c in 0..2; the result has the
        four planes (grayscale, channel 0, channel 1, channel 2) on axis 0.
        """
        red, green, blue = frame[:, :, 0], frame[:, :, 1], frame[:, :, 2]
        gray = np.sum(
            [np.multiply(0.2989, red), np.multiply(0.5870, green), np.multiply(0.1140, blue)],
            axis=0,
        )
        return np.stack([gray, red, green, blue])

    def sample(self):
        """Randomly sample a batch and split it by reward sign.

        Returns a list of two (states, actions, rewards, next_states, dones)
        tensor tuples: the first holds the sampled experiences with reward > 0,
        the second those with reward < -10 (their rewards are returned as
        absolute values). Experiences with a reward in [-10, 0] appear in
        neither, and either tuple may be empty.
        """
        experiences = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        states = torch.from_numpy(
            np.stack([self._frame_to_channels(e.state) for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        # Build the successor batch from next_state (not from state again).
        next_states = torch.from_numpy(
            np.stack([self._frame_to_channels(e.next_state) for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)

        # Row masks computed as tensors, so they live on the same device as the data.
        pos_idx = (rewards > 0).squeeze(1)
        neg_idx = (rewards < -10).squeeze(1)

        return [(states[pos_idx], actions[pos_idx], rewards[pos_idx], next_states[pos_idx], dones[pos_idx]),
                (states[neg_idx], actions[neg_idx], rewards[neg_idx].abs(), next_states[neg_idx], dones[neg_idx])]

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [None]:
# Four input channels (Agent.act stacks grayscale plus the three colour planes)
# and six discrete actions (one per entry of button_combos in the training cell).
agent = Agent(state_size=4, action_size=6, seed=0)

In [None]:
from IPython import display
import numpy as np
import PIL.Image
import io
import retro
import time
import base64
from ipykernel.comm import Comm


# Game identifier passed to retro.make() below.
selected_game = 'SonicTheHedgehog-Genesis' #@param ["SonicTheHedgehog-Genesis", "SonicTheHedgehog2-Genesis", "SonicAndKnuckles3-Genesis"] {allow-input: true}


def showarray(a, fmt='png'):
    """Return the base64 text of `a` rendered as an image.

    Params
    ======
        a (array_like): pixel data; it is cast to uint8 before encoding
        fmt (str): image format name handed to PIL's ``save``
    """
    with io.BytesIO() as sink:
        # Image.save() writes into the buffer and returns None.
        PIL.Image.fromarray(np.uint8(a)).save(sink, fmt)
        raw_bytes = sink.getvalue()
    return base64.b64encode(raw_bytes).decode('ascii')


nrplayers = 1
verbosity = 0

# Create the emulator environment for the selected game and start an episode.
env = retro.make(game=selected_game, record=False, players=nrplayers)
env.reset()


# Encode the first frame so the <img> element has something to show right away.
imagedata = showarray(env.render(mode='rgb_array'))
# Browser side: register a comm target that swaps the <img> source for every frame received.
jsc = "Jupyter.notebook.kernel.comm_manager.register_target('gymscreen', function(comm, msg) { comm.on_msg(function(msg) { document.getElementById('gymscr').src = 'data:image/png;base64,'+msg.content.data.img_data; }); });"
imagehandle = display.display(display.HTML("<div><img id='gymscr' style='width:700px;height:520px;background-color:#000;' src='data:image/png;base64,"+imagedata+"'></div><script>"+jsc+"</script>"), display_id='gymscreen_container')
# Kernel side of the same comm channel; frames are pushed through my_comm.send().
my_comm = Comm(target_name='gymscreen', data={'img_data': imagedata})


# Button order of each 12-entry mask in action_arrays:
#button_labels = ["B", "A", "MODE", "START", "UP", "DOWN", "LEFT", "RIGHT", "C", "Y", "X", "Z"]
# Human-readable description of the six agent actions. The sixth one
# ('8 x RIGHT') has no mask of its own: dqn() replays action 0 for more frames.
button_combos = [['RIGHT'], ['LEFT'], ['RIGHT', 'UP'], ['DOWN', 'B'], ['B'], ['8 x RIGHT']]
# Button mask for actions 0-4; row i presses the buttons named in button_combos[i].
action_arrays = [[0,0,0,0,0,0,0,1,0,0,0,0], [0,0,0,0,0,0,1,0,0,0,0,0], [0,0,0,0,1,0,0,1,0,0,0,0], 
                 [1,0,0,0,0,1,0,0,0,0,0,0], [1,0,0,0,0,0,0,0,0,0,0,0]]


def dqn(n_episodes=200, max_t=2500, eps_start=1.0, eps_end=0.1, eps_decay=0.05):
    """Deep Q-Learning.

    Trains the module-level `agent` on the module-level `env`, streaming frames
    to the notebook through `my_comm`, and saves both local networks after the
    last episode.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): exponential decay rate; epsilon is multiplied by
            exp(-eps_decay) after every episode

    Returns the list of per-episode scores (sum of environment rewards).
    """
    global imagedata
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        rings = 0
        screen_x_max = 0
        # Short histories used to replay recent transitions after a lucky streak.
        state_prev = deque(maxlen=5)
        action_prev = deque(maxlen=5)
        foresight = deque(maxlen=5)
        y_prev = 1000
        lives = 3
        for t in range(max_t):
            action = agent.act(state, eps)
            incentive = 0
            next_state_w = []
            reward = 0
            # Every action is held for 2 emulator frames; action 5 is
            # "hold RIGHT" (action 0) for 16 frames instead.
            # NOTE(review): the transition is then stored under action 0, so
            # index 5 never receives its own Q-update — confirm this is intended.
            j = 2
            if(action == 5):
                j = 16
                action = 0
            for i in range(j):
                next_st, rew, done, info = env.step(action_arrays[action])
                next_state_w.append(next_st)
                reward += rew
                if done:
                    # Do not keep stepping an episode that has already ended.
                    break

            rings_new = info['rings']
            screen_x_new = info['screen_x']
            lives_new = info['lives']
            y_new = info['y']

            # Reward shaping on top of the environment reward.
            if lives - lives_new == 1:
                incentive -= 500
            else:
                if rings_new != rings:
                    if rings_new == 0:
                        incentive -= 1000
                    incentive += 100*(rings_new - rings)
                # Bonus for scrolling further than ever before in this episode.
                if (screen_x_new - screen_x_max) > 0:
                    incentive += 5
                    screen_x_max = screen_x_new
                # Bonus when y grew by more than 100 since the previous step.
                if (y_new - y_prev) - 100 > 0:
                    incentive += 500

            lives = lives_new
            rings = rings_new
            y_prev = y_new

            agent.step(state, action, reward+incentive, next_state_w[-1], done)

            state_prev.append(state)
            action_prev.append(action)
            foresight.append(reward+incentive)

            # After a run of strictly positive rewards summing to more than 500,
            # re-store the most recent transitions with a decaying share of the
            # current reward.
            if (np.array(foresight) > 0).all():
                if np.sum(foresight) > 500:
                    successor = next_state_w[-1]
                    # The history may still hold fewer than four entries.
                    for i in range(min(4, len(state_prev))):
                        # Only the newest transition can be the terminal one.
                        agent.step(state_prev[-1-i], action_prev[-1-i],
                                   (reward+incentive)*np.power(0.8, i), successor, bool(done) and i == 0)
                        successor = state_prev[-1-i]

            # Push the previously encoded frame, then encode the current one.
            my_comm.send({'img_data': imagedata })
            imagedata = showarray(env.render(mode='rgb_array'))
            state = next_state_w[-1]
            score += reward
            if done:
                break
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps*np.exp(-eps_decay)) # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
        if i_episode == n_episodes:
            # Nothing is "solved" here: this simply marks the end of training.
            print('\nTraining finished after {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            torch.save(agent.positiveqs_local.state_dict(), 'Sonic1_GreenZone_PositiveCues.pth')
            torch.save(agent.negativeqs_local.state_dict(), 'Sonic1_GreenZone_NegativeCues.pth')
    return scores

# Train with the default settings; dqn() returns one score per episode.
scores = dqn()

# Plot the score of every episode against its index.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

In [None]:
# load the weights from file
# agent.positiveqs_local.load_state_dict(torch.load('Sonic1_GreenZone_PositiveCues.pth'))
# agent.negativeqs_local.load_state_dict(torch.load('Sonic1_GreenZone_NegativeCues.pth'))