# Here Explanation of the code

# Things to check

## Check the evaluate function

## Enhance the prioritized experience buffer


# HERE QUESTIONS FOR GEORG

## HOW TO DEAL WITH EPSILON GREEDY 

In [1]:
# Importing Section

import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym

from collections import deque

# Instatiating the environment

In [2]:
env = gym.make('Breakout-ram-v0')

In [3]:
env.observation_space

Box(128,)

In [4]:
env.action_space.n

4

In [5]:
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

In [6]:
device

device(type='cuda', index=0)

In [7]:
np.log(10)

2.302585092994046

# Hyper-Parameters

In [8]:
SCORES_MEAN = 100
INPUT_SIZE = 128
HIDDEN_DIMS = (256, 128, 64)
HIDDEN_CELL_STATE_DIM = 64 
OUTPUT_SIZE = env.action_space.n
SEQ_SIZE = 80

EPSILON_FINAL = 0.001
EPSILON_INIT = 1
GAMMA = 0.99

ETA = 0.9
ALPHA = 0.9
BETA_INIT = 0.4
CAPACITY = int(1e6)
BATCH_SIZE = 128

LR = 1e-12
UPDATE_EVERY = 40
NUM_UPDATES = 1
TAU = 0.90

# R2D2 Priority Buffer (Naive Version)

## This first version of the Naive Version we pratically do not consider end cases.
## To-do Check if there is a better bootstrap

In [9]:
class buffer(object):
    def __init__(self, capacity, batch_size, alpha):
        self.alpha = alpha
        self.capacity = capacity
        self.batch_size = batch_size
        self.memory = []
        self.stores = []
        self.pos = 0
        self.priorities = np.zeros((self.capacity), dtype = np.float32)
        
        return
    
    # TODO Add a better way to handling is_dones with a better bootstrap
    def add(self, seq, store ):
        ##print(' 2 BUFFER SEQ DIM ', len(seq))
        ##print(' 2 BUFFER store ', len(store))
        if len(self.memory)>0:
            maximum_priority = self.priorities.max()
        else:
            maximum_priority = 1.0
            
        if len(self.memory)<self.capacity:
            self.memory.append(seq)
            
            hidden_state = store[0].squeeze(0).squeeze(0)
            ##print(' 2 BUFFER hidden_state and cell state shape with double squeeze ', hidden_state.shape)
            cell_state = store[1].squeeze(0).squeeze(0)
            store = (hidden_state, cell_state)
            self.stores.append(store)
            self.priorities[self.pos] = maximum_priority
        else:
            self.memory[self.pos] = seq
            hidden_state = store[0].squeeze(0).squeeze(0)
            cell_state = store[1].squeeze(0).squeeze(0)
            store = (hidden_state, cell_state)
            self.stores[self.pos] = store
            self.priorities[self.pos] = maximum_priority
        
        self.pos = (self.pos+1)%self.capacity
        
        return
    
    def sample(self, beta):
        if len(self.memory) == self.capacity:
            prios = self.priorities
        else:
            prios = self.priorities[:self.pos]
        
        probs_temp = prios ** self.alpha
        probs = probs_temp/probs_temp.sum()
        
        indexes = np.random.choice(len(self.memory), self.batch_size, p = probs)
        
        # Weight construction
        total = len(self.memory)
        weights_temp = (total * probs[indexes]) ** (-beta)
        weights = weights_temp/weights_temp.max()
        weights = np.array(weights, dtype = np.float32)
        
        # Sequences selection
        
        batch_exp = []
        
        for idx in indexes:
            batch_exp += self.memory[idx]
        
        batch_h_c_s = [self.stores[idx] for idx in indexes]
        
        ##print(' 3 BUFFER BATCH EXP', batch_exp.shape())
        ##print(' 3 BUFFER HC STATE EXP', batch_h_c_s.shape())
        return batch_exp, batch_h_c_s, weights, indexes
    
    def memory_update (self, batch_indices, batch_priorities):
        for idx, prio in zip(batch_indices, batch_priorities):
            self.priorities[idx] = prio
    
    def __len__(self):
        return len(self.memory)
    

# R2D2 Net

## This Should work it has been checked once. Just another couple of checks needed.

In [10]:
class R2D2_Net(nn.Module):
    def __init__(self, input_size, output_size, hidden_sizes):
        super(R2D2_Net, self).__init__()
        """
        This class contains the net. Takes as inputs input_size, output_size and hidden_sizes.
        
        The forward function takes as inputs the sequence of the states, the batch_size and a seq_size
        along with hidden_states and cell_states useful to initialize the LSTM.
        
        Gives back as aoutput a series of Q_values of the form [batch_size, seq_size, number_of_actions]
        """
        
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_sizes = hidden_sizes
        
        self.Linear_1 = nn.Linear(self.input_size, self.hidden_sizes[0])
        self.Linear_2 = nn.Linear(self.hidden_sizes[0], self.hidden_sizes[1])
        self.Linear_3 = nn.Linear(self.hidden_sizes[1], self.hidden_sizes[2])
        
        self.batch_norm_1 = nn.BatchNorm1d(self.input_size)
        self.batch_norm_2 = nn.BatchNorm1d(self.hidden_sizes[0])
        self.batch_norm_3 = nn.BatchNorm1d(self.hidden_sizes[1])
        self.batch_norm_4 = nn.BatchNorm1d(self.hidden_sizes[2])
        
        self.LSTM = nn.LSTM(input_size = self.hidden_sizes[2], hidden_size = self.hidden_sizes[2], num_layers = 1, batch_first = True)
        
        self.adv = nn.Linear(self.hidden_sizes[2], self.output_size)
        self.value = nn.Linear(self.hidden_sizes[2], 1)
        
        self.relu = nn.ReLU()
        
    def forward(self, state, batch_size, seq_size, hidden_state, cell_state):
        
        state = state.view(batch_size*seq_size, -1)
        
        state = self.batch_norm_1(state)
        
        state = self.relu(self.batch_norm_2(self.Linear_1(state)))
        state = self.relu(self.batch_norm_3(self.Linear_2(state)))
        state = self.relu(self.batch_norm_4(self.Linear_3(state)))
        
        state = state.view(batch_size, seq_size, self.hidden_sizes[2])
        
        lstm_out = self.LSTM(state, (hidden_state, cell_state))
        
        hidden_state = lstm_out[1][0]
        cell_state = lstm_out[1][1]
 
        out = lstm_out[0]
        out = out.contiguous().view(batch_size * seq_size, -1)
        
        # TODO: CHECK THE DIMENSION OF THE NEXT LINE OF CODE
        adv = self.adv(out)
        value = self.value(out)
        mean = adv.mean(dim = 1).unsqueeze(1)
        
        value = value.contiguous().view(batch_size, seq_size, -1)
        adv = adv.contiguous().view(batch_size, seq_size, -1)
        mean = mean.contiguous().view(batch_size, seq_size, -1)

        # BE CAREFUL. THIS MIGHT BE WRONG IN THE UPDATING AGENT NET CONTEXT
        # CHECK ALSO AN UNSQUEEZE ON DIM 1 in adv.mean( dim = 1 ) which might be necessary
        # out_finale = value.expand(batch_size, self.output_size) + (adv - adv.mean(dim = 1).expand(batch_size, self.output_size))
        out_finale = value + (adv - mean)
        
        return out_finale, (hidden_state, cell_state)                                                                  

In [11]:
class agent(): 
    def __init__(self, input_size, output_size, hidden_sizes):
        
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_sizes = hidden_sizes
        
        self.net = R2D2_Net(self.input_size, self.output_size, self.hidden_sizes).to(device)
        self.net_tg = R2D2_Net(self.input_size, self.output_size, self.hidden_sizes).to(device)
        
        self.hard_update(self.net, self.net_tg)
        
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr = LR)
        
        return
    
    def act(self, state, hidden_state, cell_state, epsilon):
        
        state = torch.FloatTensor(state).view(1, 1, self.input_size).to(device)
                
        self.net.eval()
        with torch.no_grad():
            Q_values, (hidden_state, cell_state) = self.net(state, 1, 1, hidden_state, cell_state)
                    
        self.net.train()
        
        if random.random() > epsilon:
            # TODO (DURING THE RUN) CHECK IF THE DIMENSIONS OF ACTION ARE CORRECT
            action = torch.argmax(Q_values, dim = 2).squeeze(0).cpu().data.numpy()
            #print(action.dtype)
            ##print('1 ACT AGENT GREEDY', action.shape)
        else:
            action = [random.choice(range(self.output_size))]
            #print('1 ACT AGENT RANDOM', action.shape)

        return action, (hidden_state, cell_state)
    
        ### NOTICE THIS HAS TO BE RLLY SERIOUSL CHECKED. AS THE EVALUATE FUNCTION
    def update(self, batch_exp, batch_h_c_s, weights, beta, indices):
        hidden_states, cell_states = zip(*batch_h_c_s)
        states, actions, rewards = zip(*batch_exp)
        ##print(' 4 UPDATES hidden_states shape after zip ', hidden_states.shape())
        hidden_states = torch.cat(hidden_states)
        ##print(' 4 UPDATES hidden_states shape after zip and torch.cat ', hidden_states.size())
        cell_states = torch.cat(cell_states)
        hidden_states = hidden_states.view(1, BATCH_SIZE, HIDDEN_CELL_STATE_DIM).to(device)
        cell_states = cell_states.view(1, BATCH_SIZE, HIDDEN_CELL_STATE_DIM).to(device)
        
        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).to(device)

        weights = torch.FloatTensor(weights).to(device)
        states = states.view(BATCH_SIZE * SEQ_SIZE * self.input_size)
        ##print(' 4 UPDATES states shape after states.view(BATCH_SIZE * SEQ_SIZE * self.input_size)', states.size())


        rewards = np.asarray(rewards)
        rewards = rewards.reshape(BATCH_SIZE, SEQ_SIZE, -1)
        ##print(' 4 UPDATES rewards shape after rewards.view(BATCH_SIZE * SEQ_SIZE, -1 )', rewards.shape)
        
        # Net Update
        
        #  TO DO CHECK ALL THE POSSIBLE CASTING PROBLEMS
            
        Q_values, (hiddens_Q_values, cell_Q_values) = self.net(states, BATCH_SIZE, SEQ_SIZE, hidden_states, cell_states)
        # TO DO. DOUBLE CHECK THIS
        actions = actions.view(BATCH_SIZE, SEQ_SIZE, -1)
        ##print(' 4 UPDATES actions shape after actions.view(BATCH_SIZE * SEQ_SIZE )', actions.size())

        Q_values = Q_values.gather(dim = 2, index = actions)
        Q_values = Q_values[:, 40:-1, :]
        ##print(' 4 UPDATES Q_values shape ', Q_values.size())
        
        
        Q_next_values, (hiddens_Q_values, cell_Q_values) = self.net_tg(states, BATCH_SIZE, SEQ_SIZE, hidden_states, cell_states)

        ##print('4 UPDATES Q_next_values shape', Q_next_values[:,-1,:].size())

        Q_next_values = torch.max(Q_next_values[:,-1,:], dim = 1)[0]

        Q_targets = self.bootrstap_rewards(rewards, Q_next_values)

        td = (Q_targets.detach() - Q_values).pow(2).sum(dim = 1) * weights
        
        ## NEXT THREE LINES SHOULD BE USED FOR PARALLELISM
        delta_max = torch.max(td, dim = 1)[0]
        delta_average = torch.mean(td, dim = 1)
        prios = 0.9 * delta_max + (1-0.9) * delta_average + 1e-5
        
        loss = td.mean()
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        buffer.memory_update(indices  , prios.data.cpu().numpy())
        self.soft_update(self.net, self.net_tg, TAU)
        
        return loss
        
        # NOTICE THIS MIGHT NOT BE NECESSARY CHECK OUT 
        # MAYBE IS_DONES, HIDDEN_STATES and CELL_STATES????
    #def evaluate(self, sequence, hidden_state, cell_state ):
     #   states, actions, rewards = zip(*sequence)
        
        # NOTICE: HERE IT IS VERY LIKELY WE NEED AN UNSQUEEZE
        # FOR STATES, ACTIONS (DURING GATHER) AND REWARDS FOR CASTING PROBLEMS
      #  states = torch.FloatTensor(states).to(device)
       # actions = torch.LongTensor(actions).to(device)
       # rewards = torch.FloatTensor(rewards).to(device)
        # MAYBE THIS IS UN-NECESSARY WRITE MAIN AND CHECK 
       # hidden_state = hidden_state.view(1, 1, -1)
       # cell_state = cell_state.view(1, 1, -1)
        
        # Q_values
        
        # NOTICE: THIS SHOULD BE CORRECT BUT THERE MIGHT BE A CASTING PROBLEM.
        # CHECK THIS PART.
       # actions = actions.view(BATCH_SIZE, SEQ_SIZE, -1)
       # actions = actions[:,40:-1,:]
       # Q_values = self.net(states[:-1], 1, SEQ_SIZE - 1, hidden_state, cell_state)
       # Q_values = Q_values[40:]
       # Q_values = Q_values.gather(dim = 2, index = actions)
        
        # Q_next
        
        # NOTICE: THIS IS DONE WITH THE ADD OF DDQN TECHNIQUE
        # CHECK THIS PART
        #Q_next_values, _, _ = self.net(states, BATCH_SIZE, SEQ_SIZE, hidden_state, cell_state)
        #Q_next_values = Q_next_values[41:]
        #new_actions = torch.argmax(Q_next_values, dim = 2)
        
        #Q_next_values_tg, _, _ = self.net_tg(states, BATCH_SIZE, SEQ_SIZE, hidden_state, cell_state)
        #Q_next_values_tg = Q_next_values_tg[41:]
        #Q_next_values_tg = Q_next_values.gather(dim = 2, new_actions)
        # NOTICE: IT IS UNCLEAR IF GAMMA HAS TO BE DISCOUNTED OR NOT AND, IN THIS CASE, WHEN
        # IT HAS TO BE STARTED TO COMPUTED
        
        ## NOTICE: IT IS UNCLEAR HOW TO WRITE IS_DONES
        #Q_targets = rewards[40:79] + GAMMA * Q_next_values_tg() * (1.0 - is_dones)
        
        #td = (Q_targets - Q_values).pow(2)
        #delta_max = td.max()
        #delta_sum = td.mean()
        #prio = 
        
    def bootrstap_rewards(self, rewards, last_bootstrapping_value):
        lista_bootstrap = []
        last_bootstrapping_value = last_bootstrapping_value.detach().cpu().numpy()
        ##print('5 BOOTSTRAP REWARDS last Q(s,a) size', last_bootstrapping_value.shape)
        
        for i in range(BATCH_SIZE):
            temp = last_bootstrapping_value[i]
            #print(rewards[i].shape)
            rewards_reversed = np.flip(rewards[i])
            lista_temp_bootstrap = []
            #print(rewards_reversed)
            #print(rewards_reversed.shape)
            
            for j in range(1,40):
                temp = GAMMA * temp
                temp += rewards_reversed[j] 
                lista_temp_bootstrap.append(temp)
            
            lista_temp_bootstrap = np.flip(np.asarray(lista_temp_bootstrap))
            ##print(' 5 BOOTSTRAP REWARDS Single reward bootstrap flipped ', lista_temp_bootstrap.shape)
            lista_bootstrap.append(lista_temp_bootstrap)
    
        lista_bootstrap = torch.FloatTensor(lista_bootstrap).to(device)
        ##print('5 BOOTSTRAP REWARDS list of list shape ', lista_bootstrap.size())
        return lista_bootstrap
           
    def soft_update(self, network, target_network, tau):
        
        for param, target_param in zip(network.parameters(), target_network.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
    
    def hard_update(self, network, target_network):
        target_network.load_state_dict(network.state_dict())
    
    ## TODO ADD THE VIDEO
    def play(self):
        state = env.reset()
        score = 0
        hidden_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIMENSION)
        cell_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIMENSION)
        
        while True:
            action, hidden_state, cell_state = self.act(state, hidden_state, cell_state)
            next_state, reward, is_done, _ = env.step(action)
            score += reward
            state = next_state
            
            if is_done == True:
                return score
# l'ORDINE DEL TEMPO ROVELLI


In [12]:
def epsilon_decay(numbers, epsilon_init, epsilon_final):
    return epsilon_final + (epsilon_init-epsilon_final)/np.log(1+(np.log(1+numbers)))**3

In [13]:
def beta_decay(numbers, beta_init):
    return min(1.0, beta_init + numbers * (1.0 - beta_init))

In [14]:
buffer = buffer(CAPACITY, BATCH_SIZE, ALPHA)

In [15]:
agent = agent(INPUT_SIZE, OUTPUT_SIZE, HIDDEN_DIMS)

In [16]:
def main(num_episodes):    
    
    counter = 1 # Needed for epsilon decay, beta decay
    scores = []
    scores_deque = deque(maxlen = SCORES_MEAN) # Useful to check if we reached the goal.
    losses = []
    loss_tracking = deque(maxlen = 40)
    hidden_layers_deque = deque(maxlen = 80)
    sequence_deque = deque(maxlen = 80)
    best_value = -np.inf
    flag = 0
       
    for episode in range(1, num_episodes + 1):
        counter_for_episode = 1
        state = env.reset()
        score = 0
        # NOTICE this is needed since the first time one needs to reshape the hidden_state and cell_state under the proper dimensions
        hidden_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIM).to(device)
        cell_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIM).to(device)
        
        while True:
            hidden_layers_deque.append((hidden_state, cell_state))
            if flag == 1:
                counter += 1
            epsilon = epsilon_decay(counter, EPSILON_INIT, EPSILON_FINAL)
            action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)
            next_state, reward, is_done, _ = env.step(action)
            
            sequence_deque.append((state, action, reward))
            score += reward
            counter_for_episode += 1
            
            
            if is_done and counter_for_episode <80:
                scores.append(score)
                scores_deque.append(score)
                print('The score for episode {} is {}'.format(episode, score))
                print('counter for ep ', counter_for_episode)
                break
            elif is_done and counter_for_episode >=80:
                # One should check out if the deque creates problem later
                scores.append(score)
                scores_deque.append(score)
                buffer.add(sequence_deque, hidden_layers_deque[0])
                print('The score for episode {} is {}'.format(episode, score))
                print('counter for ep ', counter_for_episode)
                break
            elif(counter_for_episode % 40 == 0) and counter_for_episode >= 80:
                print('episode added element {}', counter_for_episode)
                buffer.add(sequence_deque, hidden_layers_deque[0])
                
            if len(buffer) > 10*BATCH_SIZE: 
                flag == 1
                if counter % UPDATE_EVERY == 0:
                    for update in range(NUM_UPDATES):
                        ##print('update')
                        beta = beta_decay(counter, BETA_INIT)
                        batch_exp, batch_h_c_s, weights, indices = buffer.sample(beta)
                        loss = agent.update(batch_exp, batch_h_c_s, weights, beta, indices)
                        loss_tracking.append(loss.cpu().detach().numpy())
                        losses.append((episode, loss))
            
            # TODO Improve the next if using a deque for losses as well and giving back a mean of the losses
            if (counter_for_episode % 40 == 0) and (counter > 2*BATCH_SIZE) :
                temp_losses = np.mean(np.asarray(loss_tracking))
                print('The loss is {}'.format(loss))
                    
            state = next_state
        
        if episode % 10 == 0:
            temp_score = scores[-10:]
            temp_score = np.asarray(temp_score)
            print('The average score of the last ten episodes is ', temp_score.mean())
        
        if episode % SCORES_MEAN == 0:
            mean = np.asarray(scores_deque).mean()
            print('The average of the last {} is {}'.format(SCORES_MEAN, mean))    
            
            # TODO To modify to see what is the best way of saving
            if best_value < mean:
                best_value = mean
                #torch.save({'Q': agent.net, 'Q_tg': agent.net_tg, 'buffer': buffer}, 'Saving/dict.pth')

In [17]:
main(2000000)

episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 1 is 0.0
counter for ep  173
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 2 is 0.0
counter for ep  183
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 3 is 0.0
counter for ep  175
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 4 is 0.0
counter for ep  178
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 5 is 1.0
counter for ep  208
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 6 is 2.0
counter for ep  302
episode added element {} 80
episode added element {} 120
episo

episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 44 is 0.0
counter for ep  166
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 45 is 2.0
counter for ep  276
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 46 is 0.0
counter for ep  186
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 47 is 0.0
counter for ep  191
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
The score for episode 48 is 3.0
counter for ep  339
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element

episode added element {} 320
The score for episode 87 is 3.0
counter for ep  358
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 88 is 1.0
counter for ep  215
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 89 is 0.0
counter for ep  161
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 90 is 2.0
counter for ep  255
The average score of the last ten episodes is  1.2
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 91 is 0.0
counter for ep  163
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 92 is 1.0
counter for ep  244
episode added element {} 80
episo

episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
The score for episode 130 is 3.0
counter for ep  366
The average score of the last ten episodes is  1.2
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 131 is 0.0
counter for ep  185
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 132 is 1.0
counter for ep  219
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 133 is 2.0
counter for ep  284
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added ele

episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 175 is 2.0
counter for ep  310
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
The score for episode 176 is 4.0
counter for ep  388
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 177 is 0.0
counter for ep  190
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 178 is 0.0
counter for ep  190
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 179 is 0.0
counter for ep  181
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added e

episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 216 is 2.0
counter for ep  258
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 217 is 0.0
counter for ep  178
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 218 is 0.0
counter for ep  185
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 219 is 0.0
counter for ep  167
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
episode added element {} 400
The score for episode 220 is 4.0
counter for ep  422
The average score of the last ten episodes is  1.2
episode added element 

episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
The score for episode 261 is 3.0
counter for ep  332
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 262 is 0.0
counter for ep  168
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 263 is 0.0
counter for ep  192
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 264 is 0.0
counter for ep  188
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 265 is 1.0
counter for ep  242
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added e

episode added element {} 240
The score for episode 301 is 2.0
counter for ep  273
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 302 is 0.0
counter for ep  174
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
episode added element {} 400
The score for episode 303 is 4.0
counter for ep  420
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 304 is 1.0
counter for ep  215
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 305 is 3.0
counter for ep  308
episode added element {} 80
episode added element {} 120
episode added e

episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 344 is 1.0
counter for ep  204
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 345 is 0.0
counter for ep  164
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 346 is 0.0
counter for ep  180
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 347 is 1.0
counter for ep  232
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 348 is 0.0
counter for ep  172
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
episode added element {} 400
episode added e

The score for episode 386 is 2.0
counter for ep  309
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 387 is 2.0
counter for ep  272
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 388 is 0.0
counter for ep  174
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 389 is 1.0
counter for ep  215
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 390 is 1.0
counter for ep  251
The average score of the last ten episodes is  1.4
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 391 is 2.0
counter for ep  26

episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 430 is 0.0
counter for ep  172
The average score of the last ten episodes is  1.2
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
The score for episode 431 is 3.0
counter for ep  323
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 432 is 0.0
counter for ep  170
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 433 is 0.0
counter for ep  180
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 434 is 1.0
counter for ep  204
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {

episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
The score for episode 473 is 3.0
counter for ep  356
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 474 is 1.0
counter for ep  219
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 475 is 0.0
counter for ep  201
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 476 is 1.0
counter for ep  233
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 477 is 1.0
counter for ep  211
episode added element {} 80
episode added element {} 120
episode added el

episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 515 is 2.0
counter for ep  274
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 516 is 1.0
counter for ep  251
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
episode added element {} 400
episode added element {} 440
The score for episode 517 is 5.0
counter for ep  476
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
The score for episode 518 is 2.0
counter for ep  322
episode added element {} 80
episode a

The score for episode 555 is 5.0
counter for ep  462
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 556 is 2.0
counter for ep  301
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 557 is 1.0
counter for ep  214
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 558 is 1.0
counter for ep  241
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 559 is 0.0
counter for ep  198
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 560 is 0.0
counter for ep  184
The average score of the last ten episodes is  1.5
episode added element {} 80

episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 599 is 3.0
counter for ep  319
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 600 is 1.0
counter for ep  213
The average score of the last ten episodes is  1.0
The average of the last 100 is 1.39
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 601 is 1.0
counter for ep  213
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 602 is 1.0
counter for ep  252
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 603 is 0.0
counter for ep  181
episode added e

episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 645 is 2.0
counter for ep  268
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 646 is 0.0
counter for ep  185
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
episode added element {} 320
episode added element {} 360
episode added element {} 400
episode added element {} 440
The score for episode 647 is 5.0
counter for ep  466
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 648 is 1.0
counter for ep  249
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 649 is 1.0
counter for ep  211
episode added 

The score for episode 687 is 1.0
counter for ep  243
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 688 is 0.0
counter for ep  179
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 689 is 1.0
counter for ep  234
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 690 is 2.0
counter for ep  284
The average score of the last ten episodes is  1.6
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 691 is 2.0
counter for ep  269
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 692 is 1.0
counter for ep  21

episode added element {} 360
The score for episode 729 is 3.0
counter for ep  377
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 730 is 0.0
counter for ep  188
The average score of the last ten episodes is  1.8
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 731 is 1.0
counter for ep  209
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 732 is 0.0
counter for ep  182
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 733 is 0.0
counter for ep  186
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 734 is 1.0
counter for ep  233
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200

episode added element {} 280
The score for episode 771 is 2.0
counter for ep  309
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 772 is 1.0
counter for ep  234
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 773 is 0.0
counter for ep  174
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 774 is 0.0
counter for ep  170
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
The score for episode 775 is 1.0
counter for ep  241
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 776 is 1.0
counter for ep  236
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element

episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 816 is 1.0
counter for ep  215
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 817 is 0.0
counter for ep  180
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
episode added element {} 240
episode added element {} 280
The score for episode 818 is 2.0
counter for ep  298
episode added element {} 80
episode added element {} 120
episode added element {} 160
episode added element {} 200
The score for episode 819 is 1.0
counter for ep  234
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 820 is 0.0
counter for ep  189
The average score of the last ten episodes is  0.7
episode added element {} 80
episode added element {} 120
episode added element {} 160
The score for episode 821 is 0.0
counter for ep  19

episode added element {} 80
episode added element {} 120
episode added element {} 160


KeyboardInterrupt: 

In [None]:
scores

# Experiment CELL

In [None]:
alpha = torch.FloatTensor(1)

In [None]:
alpha.cpu().numpy()

In [None]:
prova = [(1, 2, 3), (4,5,6), (7,8,9)]

In [None]:
a, b, c = zip(*prova)

In [None]:
a = torch.FloatTensor(a)

In [None]:
a

In [None]:
prova_2 = [(1, 2), (3, 4), (5,6)]

In [None]:
alpha_1 = torch.FloatTensor((1,2)).view(1, 1, 2).squeeze(0).squeeze(0)

In [None]:
alpha_1

In [None]:
beta_1 = torch.FloatTensor((3,4)).view(1, 1, 2).squeeze(0).squeeze(0)

In [None]:
alpha_2 = torch.FloatTensor((5,6)).view(1, 1, 2).squeeze(0).squeeze(0)

In [None]:
beta_2 = torch.FloatTensor((7,8)).view(1, 1, 2).squeeze(0).squeeze(0)

In [None]:
alpha_3 = torch.FloatTensor((9,10)).view(1, 1, 2).squeeze(0).squeeze(0)

In [None]:
beta_3 = torch.FloatTensor((11, 12)).view(1, 1, 2).squeeze(0).squeeze(0)

In [None]:
batch_emulation = [(alpha_1, beta_1), (alpha_2, beta_2), (alpha_3, beta_3)]

In [None]:
a, b = zip(*batch_emulation)

In [None]:
a

In [None]:
torch.cat(a).view(1, 3, -1)

In [None]:
random.random()

In [None]:
random.choice(range(10))

In [None]:
prova = np.arange(10).reshape(5,2)
prova = torch.FloatTensor(prova)

In [None]:
torch.mean(prova, dim = 1)

In [None]:
#a = torch.FloatTensor([[[1,2],[2,4]],[[3,4],[5,6]]])

In [None]:
#a = torch.FloatTensor([[1,2],[2,4],[3,4],[5,6]])

In [None]:
#a.shape

In [None]:
#a.view(2*2*2)

# Epsilon_Decay

## Maybe it is the case to find a less aggressive epsilon_decay strategy.

# BETA_DECAY

In [None]:
# TODO beta decay function

In [None]:
def main(num_episodes):    
    
    count = 1 # Needed for epsilon decay, beta decay
    scores = []
    scores_deque = deque(maxlen = SCORES_MEAN) # Useful to check if we reached the goal.
    critic_losses = []
       
    for episode in range(1, num_episodes + 1):
        state = env.reset()
        score = 0
        
        # NOTICE this is needed since the first time one needs to reshape the hidden_state and cell_state under the proper dimensions
        hidden_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIM).to(device)
        cell_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIM).to(device)

        store_1 = ((hidden_state, cell_state))
        
        first_seq_to_store = []
        second_seq_to_store = []
        
        counting_1_for_storing = 0
        counting_2_for_storing = 0
        
        # TODO Check if the way I handled is done is correct
        
        for counter in range(40):
            epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
            action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)
            next_state, reward, is_done, _ = env.step(action)
            
            counting_1_for_storing += 1
            count += 1
            
            first_seq_to_store.append((state, action, reward))
            
            score += reward
            if is_done == True:
                count -= countin_1_for_storing 
                scores.append(score)
                scores_deque.append(score)
                print('score ', score)
                break
            
        while True:
            
            # TODO CHECK IF THE HANDLING OF IS_DONES IS CORRECT
            if counting_1_for_storing == 40:
                store_2 = ((hidden_state, cell_state))
                
                epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
                action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)              
                next_state, reward, is_done, _ = env.step(action)            
                
                counting_1_for_storing += 1
                counting_2_for_storing += 1
                count += 1
                
                first_seq_to_store.append((state, action, reward))
                second_seq_to_store.append((state, action, reward))
                
                score += reward
                if is_done == True:
                    # TODO DOUBLE CHECK THIS ABOUT COUNT SINCE IT MIGHT BE WRONG
                    count -= 40
                    scores.append(score)
                    scores_deque.append(score)
                    print('score ', score)
                    break
            
            elif counting_2_for_storing == 79:
                epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
                action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)              
                next_state, reward, is_done, _ = env.step(action)            
                
                counting_1_for_storing += 1
                counting_2_for_storing = 0
                count += 1
                
                first_seq_to_store.append((state, action, reward))
                second_seq_to_store.append((state, action, reward))
                
                # TODO DOUBLE CHECK .add method in replay buffer
                buffer.add(second_seq_to_store, store_2)
                
                second_seq_to_store = []
                score += reward
                if is_done == True:
                    scores.append(score)
                    scores_deque.append(score)
                    print('score ', score)
                    break
                
            elif counting_1_for_storing == 79:
                epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
                action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)              
                
                next_state, reward, is_done, _ = env.step(action)            
                counting_1_for_storing = 0
                counting_2_for_storing += 1
                count += 1
                
                first_seq_to_store.append((state, action, reward))
                second_seq_to_store.append((state, action, reward))
                
                buffer.add(first_seq_to_store, store_1)
                score += reward
                first_seq_to_store = []
                if is_done == True:
                    scores.append(score)
                    scores_deque.append(score)
                    print('score ', score)
                    break
            
            elif counting_2_for_storing == 40:
                store_1 = ((hidden_state, cell_state))
                
                epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
                action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)              
                next_state, reward, is_done, _ = env.step(action)            
                
                counting_1_for_storing += 1
                counting_2_for_storing += 1
                count += 1
                score += reward
                first_seq_to_store.append((state, action, reward))
                second_seq_to_store.append((state, action, reward))
                
                if is_done == True:
                    count -= 40
                    scores.append(score)
                    scores_deque.append(score)
                    print('score ', score)
                    break
                    
            else:
                epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
                action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)
                next_state, rewad, is_done,_ = env.step(action)
                
                counting_1_for_storing += 1
                counting_2_for_storing += 1
                count += 1
                score += reward
                first_seq_to_store.append((state, action, reward))
                second_seq_to_store.append((state, action, reward))
                # TODO CHECK COUNT
                if is_done == True:
                    minus_val = (count-1)%80
                    count -= minus_val
                    scores.append(score)
                    scores_deque.append(score)
                    print('score ', score)
                    break
                
            if len(buffer) > BATCH_SIZE:    
                if count % UPDATE_EVERY == 0:
                    for update in range(NUM_UPDATES):
                        print('update')
                        beta = beta_decay(count, BETA_INIT)
                        # CORRECTING THE BATCHING IN ORDER TO GET ALSO IS_DONES
                        batch_exp, batch_h_c_s, weights, indices = buffer.sample(beta)
                        # TODO CHECK IF IT IS CORRECT THE WAY THE UPDATE IS DONE
                        agent.update(batch_exp, batch_h_c_s, weights, beta, indices)
                    
            state = next_state

In [None]:
main(2000)

In [None]:



### VALUES TO TRY THE UPDATE THE NET
net = R2D2_Net(1, 2, (1,1,2))

a = [1 , 2 , 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
b = [1, 2, 3, 4, 5, 6, 7, 8]
c = [4, 5, 6, 7, 8 , 9, 10, 11]

a = torch.FloatTensor(a).view(4, 3, 1)
b = torch.FloatTensor(b).view(1, 4, 2)
c = torch.FloatTensor(c).view(1, 4, 2)
out, (alpha, beta) = net(a, 4, 3, b ,c)
### END OF SECTION VALUES TO TRY TO UPDATE THE NERT

### VALUES TO TRY TO SEE WHAT NEEDS THE .act METHOD TO WORK
A = [1]
B = [2, 3]
C = [4, 5]

A = torch.FloatTensor(A).view(1, 1, 1)
B = torch.FloatTensor(B).view(1, 1, 2)
C = torch.FloatTensor(C).view(1, 1, 2)
OUT, (ALPHA, BETA) = net(A, 1, 1, B ,C)

print ('OUT', OUT)
print ('OUT SHAPE', OUT.shape)
print ('OUT', torch.argmax(OUT, dim = 2))
print(' ALPHA', ALPHA)
print('ALPHA', ALPHA.shape)
print('BETA', BETA)
print('BETA', BETA.shape)

print('########################')
print(out)
print(out.shape)
target_actions = torch.argmax(out, dim = 2)
print(target_actions)
print(target_actions.shape)
print('########################')

print(alpha.shape)
out_boot_strap = out.max(dim = 2)
out_boot_strap = out_boot_strap[0].unsqueeze(2)
print(out_boot_strap)
print(out_boot_strap.shape)
print(out_boot_strap[:, :-1, :])
# CHEKCING IF GATHER WORKS WELL OVER HERE
actions = (0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1)
actions = torch.LongTensor(actions)
actions = actions.view(4, 3, 1)
print(actions)
out_new = out.gather(dim =2, index = actions)
print(out_new)
print(out_new.shape)
print(a.shape)
A = a[:,:-1, :]
print(A)
print(A.shape)

In [None]:
from collections import deque

ciao = deque(maxlen = 3)

In [None]:
ciao.append(4)
ciao.append(5)
ciao.append(6)
ciao.append(7)

In [None]:
import numpy as np

In [None]:
ciao = np.asarray(ciao)

In [None]:
ciao.clear()

In [None]:
ciao

In [None]:
ciao.append(1)

In [None]:
ciao

In [None]:
lista_ciao = list()

In [None]:
lista_ciao = ciao

In [None]:
lista_ciao

In [None]:
import torch
import os

In [None]:
save_dir = 'Saving'
model_name = 'model_name'

In [None]:
save_prefix = os.path.join(save_dir, model_name)

In [None]:
os.mkdir('Saving')

In [None]:
torch.save(lista_ciao,'Saving/lista.pth')

In [None]:
np.inf

In [None]:
deque_1 = deque(maxlen = 3)
deque_2 = deque(maxlen = 3)
deque_3 = deque(maxlen = 3)
deque_1.append((2,3,4))
deque_1.append((5,6,7))
deque_1.append((7,8,9))
deque_2.append((12,13,14))
deque_2.append((14,15,16))
deque_2.append((17,18,19))
deque_3.append((22,23,24))
deque_3.append((24,25,26))
deque_3.append((27,28,29))

lista_1 = []
lista_2 = []
lista_3 = []
lista_1.append((2,3,4))
lista_1.append((5,6,7))
lista_1.append((7,8,9))
lista_2.append((12,13,14))
lista_2.append((14,15,16))
lista_2.append((17,18,19))
lista_3.append((22,23,24))
lista_3.append((24,25,26))
lista_3.append((27,28,29))


In [None]:

ola = []
ola += deque_1
print(ola)
ola += deque_2
print(ola)
ola += deque_3
print(ola)
states, actions, reward = zip(*ola)
states
actions

In [None]:
indexes = [1,5, 7]

In [None]:
batch_h_c_s = [ola[idx] for idx in indexes]
batch_h_c_s

In [None]:
rewards = np.arange(0, 80*3)
rewards = np.array(rewards)

In [None]:
rewards = rewards.reshape(3, 80)
print(rewards)

In [None]:
gam = 1
lista_bootstrap = []

temp = 1
for i in range(3):
    rewards_reversed = np.flip(rewards[i])
    print(rewards_reversed)
    lista_temp_bootstrap = []
    for j in range(1,40):
        temp = gam * temp
        temp += rewards_reversed[j] 
        lista_temp_bootstrap.append(temp)
    lista_bootstrap.append(lista_temp_bootstrap)
    
lista_bootstrap = np.asarray(lista_bootstrap)
print(lista_bootstrap)
print(lista_bootstrap.shape)

# New_Main

In [None]:
main(200)

In [None]:
dictionary = {'a':123}
dictionary['a']

In [None]:
# COSE DA FARE: Controllare quando stipare l'hidden_state e l'is_done
# COSE DA FARE: Tentare di capire come aggiungere l'ultimo is_done
# COSE DA FARE Ricontrollare l-update e usare uno bootstrap piu' lungo, cioe' rigirare le rewards tenendo in conto is_dones
# e tenendo in conto l'ultimo valore via targets.
# Chiedere a Georg se e' il modo in cui l' ha fatto anche lui
# In generale ridare una controllata al codice
# COSE DA FARE: Vedere se e' il caso di trasformare la DEQUE Nella lista e se funziona bene pure cosi
# COSE DA FARE: Aggiungere una funzione play
# COSE DA FARE: Aggiungere funzione salvataggio modello migliore

# COSE DA FARE CON PIU' CALMA Cercare una vera implementazione di Prioritized Exp Replay e aggiungergliela sopra
# COSE DA FARE CON PIU' CALMA Studiare Reinbow che e' quello che alla fine veramente 

In [None]:
def main(num_episodes):    
    
    count = 1 # Needed for epsilon decay, beta decay
    scores = []
    scores_deque = deque(maxlen = SCORES_MEAN) # Useful to check if we reached the goal.
    losses = []
       
    for episode in range(1, num_episodes + 1):
        counter_episode = 1 # Needed to slice sub-parts of episodes of lenghts 80
        state = env.reset()
        score = 0
        
        # NOTICE this is needed since the first time one needs to reshape the hidden_state and cell_state under the proper dimensions
        hidden_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIM).to(device)
        cell_state = torch.zeros(HIDDEN_CELL_STATE_DIM).view(1, 1, HIDDEN_CELL_STATE_DIM).to(device)
        store = ((hidden_state, cell_state))
        
        while True:
            epsilon = epsilon_decay(count, EPSILON_INIT, EPSILON_FINAL)
            action, (hidden_state, cell_state) = agent.act(state, hidden_state, cell_state, epsilon)
            next_state, reward, is_done, _ = env.step(action)
            
            counter_episode += 1
            
            if is_done and counter_episode<80:
                break
                
            if (counter_episode % 40 == 0) and (counter_episode % 80 != 0):
            
            