In [2]:
import torch
import torch.nn as nn

import numpy as np
import random
from collections import defaultdict

import wandb


# Number of transitions gathered before each A2C update. The training loop
# below plays `batch_size // 10` games of 10 steps per agent per episode.
batch_size = 50
# Suffix used for the W&B run name and for the checkpoint / session file names.
run_name = " hns som exp"

# Device used for the models and the loss. The experiments were run on the
# third GPU of the host; fall back to the first GPU, or to the CPU, so the
# notebook still runs on machines that do not have three CUDA devices.
if torch.cuda.is_available():
    cuda = torch.device('cuda:2' if torch.cuda.device_count() > 2 else 'cuda:0')
else:
    cuda = torch.device('cpu')


wandb.init(entity = "dexfrost89", project="coin game", name=str(batch_size)+run_name)

# One entry per game: the list of successive map snapshots (nested lists).
session_saver = []

class env:
    """Two-agent grid world in which the agents collect typed resources.

    The square map stores ``-1`` for agent 0, ``-2`` for agent 1, ``0`` for an
    empty cell and ``i + 1`` for a cell holding a resource of type ``i``.
    Each agent is assigned one resource type as its goal; the shared reward
    is computed from how many own-goal, other-goal and no-goal resources the
    two agents picked up.

    Parameters
    ----------
    size : int
        Side length of the square grid.
    step_limit : int
        Total number of ``step`` calls (both agents combined) allowed per game.
    debug : int or bool
        When truthy, diagnostic messages are printed.
    seed : int
        Seed passed to the global ``random`` module on construction and on
        every ``reset`` so that the layout is reproducible.
    """

    def __init__(self, size, step_limit, debug = 0, seed = 42):
        self.seed = seed
        # NOTE: this reseeds the process-wide `random` generator.
        random.seed(self.seed)
        self.number_of_resources = 3
        self.amount_of_resource = 4
        self.debug = debug
        self.size = size
        self.base_step_limit = step_limit
        self.step_limit = self.base_step_limit
        # Move codes: 0 = up, 1 = down, 2 = left, 3 = right, 4 = pass.
        self.valid_moves = [0, 1, 2, 3, 4]
        self.move_map = {0: [-1, 0],
                         1: [1, 0],
                         2: [0, -1],
                         3: [0, 1],
                         4: [0, 0]}
        self.initialize_map()
        self.initialize_resources()

    def initialize_map(self):
        """Create an empty map and put the agents in the two top corners.

        A coin flip decides which agent gets which corner.
        """
        self.map = np.zeros((self.size, self.size), dtype=int)
        last_col = self.size - 1
        if random.randint(0, 1):
            agent_1_coords, agent_2_coords = [0, last_col], [0, 0]
        else:
            agent_1_coords, agent_2_coords = [0, 0], [0, last_col]
        self.map[agent_1_coords[0], agent_1_coords[1]] = -1
        self.map[agent_2_coords[0], agent_2_coords[1]] = -2
        self.agents_coords = [agent_1_coords, agent_2_coords]

    def initialize_resources(self):
        """Scatter the resources, draw the agents' goals and reset counters."""
        self.resources = []
        # Flat cell indices, excluding the two corners occupied by the agents.
        index = [i for i in range(self.size**2) if (i != 0) and (i != self.size-1)]
        all_indexes = random.sample(index, self.number_of_resources*self.amount_of_resource)
        for i in range(self.number_of_resources):
            resource = random.sample(all_indexes, self.amount_of_resource)
            all_indexes = [r for r in all_indexes if r not in resource]
            # One binary occupancy plane per resource type.
            empty_map = np.zeros((self.size, self.size), dtype=int)
            for flat in resource:
                x, y = divmod(flat, self.size)
                self.map[x, y] = i+1
                empty_map[x, y] = 1
            self.resources.append(empty_map)
        # Two distinct resource types become the goals of agent 0 and agent 1.
        goal_index = [i for i in range(self.number_of_resources)]
        self.agents_goals = random.sample(goal_index, 2)
        self.agents_res_collected = [[], []]
        self.agents_self_goal_collected = [0, 0]
        self.agents_othr_goal_collected = [0, 0]
        self.agents_nthr_goal_collected = [0, 0]

    def step(self, agent_numb, move_code):
        """Apply one move of agent ``agent_numb`` (0 or 1).

        The call is ignored once the step budget is exhausted. Otherwise one
        step is consumed, and the move is carried out only if ``move_code`` is
        valid, the target cell is inside the grid and it is not occupied by
        the other agent. Entering a cell with a resource collects it.
        """
        if self.step_limit == 0:
            if self.debug:
                print('step limit reached, error')
            # The budget is spent: do not move the agent any further.
            return
        self.step_limit -= 1
        if self.debug:
            print(self.step_limit, 'steps left')

        if move_code not in self.valid_moves:
            if self.debug:
                print('icorrect move')
            return

        agent_old_coords = self.agents_coords[agent_numb]
        move = self.move_map[move_code]
        agent_new_coords = [l+r for l, r in zip(agent_old_coords, move)]

        correct_move = True
        if any((coord < 0) or (coord >= self.size) for coord in agent_new_coords):
            if self.debug:
                print('icorrect move')
            correct_move = False
        if agent_new_coords == self.agents_coords[1-agent_numb]:
            if self.debug:
                print('icorrect move')
            correct_move = False
        if not correct_move:
            return

        row, col = agent_new_coords
        self.map[agent_old_coords[0], agent_old_coords[1]] = 0
        self.map[row, col] = -1 - agent_numb
        self.agents_coords[agent_numb] = agent_new_coords
        for res in range(self.number_of_resources):
            if self.resources[res][row, col] != 0:
                self.agents_res_collected[agent_numb].append(res)
                self.resources[res][row, col] = 0
                if res == self.agents_goals[agent_numb]:
                    self.agents_self_goal_collected[agent_numb]+=1
                    if self.debug:
                        print('collected self goal resource:', res)
                elif res == self.agents_goals[1 - agent_numb]:
                    self.agents_othr_goal_collected[agent_numb]+=1
                    if self.debug:
                        print('collected other goal resource:', res)
                else:
                    self.agents_nthr_goal_collected[agent_numb]+=1
                    if self.debug:
                        print('collected neither goal resource:', res)

    def observation(self, agent_numb, with_map = 0):
        """Return the flat feature list seen by agent ``agent_numb``.

        Layout (each plane has ``size * size`` entries): one-hot position of
        the observing agent, one-hot position of the other agent, one
        occupancy plane per resource type and, if ``with_map`` is truthy, the
        raw map.
        """
        feature_vec = []
        self_other_coords = [self.agents_coords[agent_numb],
                             self.agents_coords[1 - agent_numb]]
        for agnt_coords in self_other_coords:
            agent_map = np.zeros((self.size, self.size), dtype=int)
            agent_map[agnt_coords[0], agnt_coords[1]] = 1
            feature_vec.extend(np.reshape(agent_map, self.size*self.size))

        for res in range(len(self.resources)):
            feature_vec.extend(np.reshape(self.resources[res], self.size*self.size))

        if with_map:
            feature_vec.extend(np.reshape(self.map, self.size*self.size))

        return feature_vec

    def reward(self):
        """Return the shared reward for the current collection counters.

        Resources of agent 0's goal type and of agent 1's goal type (no matter
        who collected them) each contribute the square of their count;
        resources that are nobody's goal are subtracted, also squared.
        """
        # Resources of agent 0's goal type, collected by agent 0 / by agent 1.
        n_self_c_self = self.agents_self_goal_collected[0]
        n_othr_c_self = self.agents_othr_goal_collected[1]

        # Resources of agent 1's goal type, collected by agent 0 / by agent 1.
        n_self_c_othr = self.agents_othr_goal_collected[0]
        n_othr_c_othr = self.agents_self_goal_collected[1]

        # Resources that are neither agent's goal.
        n_self_c_nthr = self.agents_nthr_goal_collected[0]
        n_othr_c_nthr = self.agents_nthr_goal_collected[1]

        reward = (n_self_c_self + n_othr_c_self)**2 + \
                 (n_self_c_othr + n_othr_c_othr)**2 - \
                 (n_self_c_nthr + n_othr_c_nthr)**2

        return reward

    def reset(self):
        """Reseed the generator and rebuild the map, resources and counters."""
        random.seed(self.seed)
        self.step_limit = self.base_step_limit
        self.initialize_map()
        self.initialize_resources()



Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Wandb version 0.9.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [3]:
import torch.nn.functional as F

In [4]:
# Environment seeds; the coin-game training loop picks seeds[game % 5] for each game.
seeds = [13, 28, 37, 99, 72]

In [4]:
class Ffunction(nn.Module):
    """Recurrent actor-critic: two ELU layers, an LSTM, policy and value heads.

    Parameters
    ----------
    input_size : int
        Length of the flat observation vector.
    n_actions : int, optional
        Number of discrete actions produced by the policy head (default 5,
        the size used by the coin game).
    """

    def __init__(self, input_size, n_actions=5):
        super(Ffunction, self).__init__()
        self.input_size = input_size
        self.n_actions = n_actions

        # All linear / LSTM weights use orthogonal init with zero biases.
        self.fullycon1 = nn.Linear(self.input_size, 64)
        nn.init.orthogonal_(self.fullycon1.weight)
        self.fullycon1.bias.data.fill_(0)

        self.fullycon2 = nn.Linear(64, 64)
        nn.init.orthogonal_(self.fullycon2.weight)
        self.fullycon2.bias.data.fill_(0)

        self.lstm = nn.LSTM(64, 64)
        nn.init.orthogonal_(self.lstm.weight_hh_l0)
        nn.init.orthogonal_(self.lstm.weight_ih_l0)
        self.lstm.bias_ih_l0.data.fill_(0)
        self.lstm.bias_hh_l0.data.fill_(0)

        self.policy = nn.Linear(64, n_actions)
        nn.init.orthogonal_(self.policy.weight)
        self.policy.bias.data.fill_(0)
        self.value = nn.Linear(64, 1)
        nn.init.orthogonal_(self.value.weight)
        self.value.bias.data.fill_(0)

        self.entropy_loss_coef = 0.01
        self.value_loss_coef = 0.5

    def forward(self, inputs):
        """Run one step of the network.

        ``inputs`` is ``(observation, (hx, cx))``. Returns the action
        probabilities, the state value and the new LSTM state ``(hx, cx)``.
        """
        inputs, (hx, cx) = inputs

        x = nn.functional.elu(self.fullycon1(inputs))
        x = nn.functional.elu(self.fullycon2(x))

        # The LSTM expects (seq_len, batch, features); use batch size 1.
        x = x.view(-1, 1, 64)

        x, (hx, cx) = self.lstm(x, (hx, cx))
        x = x.view(-1, 64)
        # Normalise over the action axis explicitly; relying on the implicit
        # softmax dimension is deprecated.
        return nn.functional.softmax(self.policy(x), dim=-1), self.value(x), (hx, cx)

    def get_a2c_loss(self, actions, probs, values, rewards):
        """Return the A2C loss and its ``(policy, entropy, value)`` components.

        ``probs`` holds one row of action probabilities per sample and
        ``actions`` the index of the action taken for each row.
        """
        advantages = rewards - values
        value_loss = (advantages.pow(2)).mean()

        # Clamp before the log so that an exact zero probability yields a
        # finite loss instead of NaN.
        log_probs = torch.log(probs.clamp_min(1e-8))
        entropy_loss = (-torch.sum(probs * log_probs, (1))).mean()

        logprobs = log_probs[torch.arange(len(probs)), actions]
        # The advantage is treated as a constant in the policy gradient.
        policy_loss = -(logprobs * advantages.detach()).mean()

        return policy_loss - self.entropy_loss_coef * entropy_loss + self.value_loss_coef * value_loss, \
    (policy_loss, entropy_loss, value_loss)

In [5]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention over 4-D (batch, head, length, dim) tensors."""

    def __init__(self, temperature, attn_dropout=0.1):
        super(ScaledDotProductAttention, self).__init__()
        # Queries are divided by this value before the dot product.
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        """Return ``(attention-weighted values, attention weights)``."""
        scaled_queries = q / self.temperature
        scores = scaled_queries @ k.transpose(2, 3)

        # Positions where the mask is 0 receive a large negative score so
        # that they end up with (almost) zero weight after the softmax.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        return weights @ v, weights

In [6]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention with layer norm on the query and a residual connection."""

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.n_head, self.d_k, self.d_v = n_head, d_k, d_v

        # Bias-free projections into the per-head query / key / value spaces
        # and back from the concatenated heads to the model width.
        self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
        self.fc = nn.Linear(n_head * d_v, d_model, bias=False)

        self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)

        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v`` (all ``batch x length x d_model``).

        Returns the updated query tensor and the attention weights.
        """
        heads = self.n_head
        batch = q.size(0)
        q_len, k_len, v_len = q.size(1), k.size(1), v.size(1)

        # Only the query is normalised; the un-normalised query is added back
        # at the end as the residual.
        skip = q
        normed_q = self.layer_norm(q)

        # Project, split the feature axis into heads (b x len x n x d) and
        # move the head axis in front of the length axis (b x n x len x d).
        q_heads = self.w_qs(normed_q).view(batch, q_len, heads, self.d_k).transpose(1, 2)
        k_heads = self.w_ks(k).view(batch, k_len, heads, self.d_k).transpose(1, 2)
        v_heads = self.w_vs(v).view(batch, v_len, heads, self.d_v).transpose(1, 2)

        # An extra axis lets the mask broadcast across heads.
        head_mask = None if mask is None else mask.unsqueeze(1)

        context, attn = self.attention(q_heads, k_heads, v_heads, mask=head_mask)

        # Back to b x lq x n x dv, then concatenate the heads: b x lq x (n*dv).
        merged = context.transpose(1, 2).contiguous().view(batch, q_len, -1)
        projected = self.dropout(self.fc(merged))
        projected += skip

        return projected, attn

In [7]:
class Embedding(nn.Module):
    """Linear embedding of a feature vector into a ``d_model``-wide space."""

    def __init__(self, input_size, d_model):
        super(Embedding, self).__init__()
        self.embedding = nn.Linear(input_size, d_model)

    def forward(self, input):
        """Apply the affine projection to ``input`` and return the result."""
        embedded = self.embedding(input)
        return embedded

In [8]:
class HnSagent(nn.Module):
    """Entity-attention actor-critic for the coin game.

    The flat input is split into five entity planes of ``input_size`` values
    (own position, other agent's position, three resource planes) followed by
    six extra values that are appended to the own-position plane. Every entity
    is embedded to 64 features, the entities attend to each other, are
    mean-pooled into one vector, and the result goes through an LSTM and a
    hidden layer into the policy (5 actions) and value heads.
    """

    def __init__(self, input_size):
        super(HnSagent, self).__init__()
        self.input_size = input_size
        # The observing agent's plane additionally carries 6 extra features.
        self.self_embedding = Embedding(input_size + 6, 64)
        self.agent_embedding = Embedding(input_size, 64)
        self.coin_embedding = Embedding(input_size, 64)
        self.embedding_norm = nn.LayerNorm(64)

        self.attention = MultiHeadAttention(4, 64, 32, 32)
        self.attention_norm = nn.LayerNorm(64)

        # Pools over the 5 entity tokens (self, other agent, 3 resources).
        self.average_pooling = nn.AvgPool1d(5)
        self.pooling_norm = nn.LayerNorm(64)

        self.lstm = nn.LSTM(64, 64)
        self.lstm_norm = nn.LayerNorm(64)

        self.hidden = nn.Linear(64, 64)
        self.hidden_norm = nn.LayerNorm(64)

        self.policy = nn.Linear(64, 5)

        self.value = nn.Linear(64, 1)
        self.entropy_loss_coef = 0.01
        self.value_loss_coef = 0.5

    def forward(self, inputs):
        """Run one step of the network.

        ``inputs`` is ``(features, (hx, cx))`` where ``features`` is a 1-D
        tensor of length ``5 * input_size + 6``. Returns the action
        probabilities, the state value and the new LSTM state.
        """
        inputs, (hx, cx) = inputs
        n = self.input_size

        # Slice the flat vector into entities; offsets follow input_size
        # instead of being hard-coded for input_size == 64.
        self_agent = torch.cat([inputs[:n], inputs[5 * n:]]).reshape(1, n + 6)
        agents = inputs[n:2 * n].reshape(1, n)
        coins = inputs[2 * n:5 * n].reshape(3, n)

        tokens = torch.cat([self.self_embedding(self_agent),
                            self.agent_embedding(agents),
                            self.coin_embedding(coins)])
        x = self.embedding_norm(tokens).reshape(1, -1, 64)
        x = self.attention_norm(self.attention(x, x, x)[0])

        # AvgPool1d pools along the last axis, so the entity axis has to be
        # moved there with a transpose. A plain reshape to (1, 64, -1) would
        # only reinterpret memory and average runs of adjacent features.
        pooled = self.average_pooling(x.transpose(1, 2)).transpose(1, 2)
        x = self.pooling_norm(pooled)

        x, (hx, cx) = self.lstm(x, (hx, cx))
        x = self.lstm_norm(x.view(-1, 64))

        x = self.hidden_norm(self.hidden(x))
        # NOTE(review): the ReLU keeps the value estimate non-negative while
        # env.reward() can be negative - confirm this is intended.
        return nn.functional.softmax(self.policy(x), dim=-1), nn.functional.relu(self.value(x)), (hx, cx)

    def get_a2c_loss(self, actions, probs, values, rewards):
        """Return the A2C loss and its ``(policy, entropy, value)`` components.

        ``probs`` holds one row of action probabilities per sample and
        ``actions`` the index of the action taken for each row.
        """
        advantages = rewards - values
        value_loss = (advantages.pow(2)).mean()

        # Clamp before the log so that an exact zero probability yields a
        # finite loss instead of NaN.
        log_probs = torch.log(probs.clamp_min(1e-8))
        entropy_loss = (-torch.sum(probs * log_probs, (1))).mean()

        logprobs = log_probs[torch.arange(len(probs)), actions]
        # The advantage is treated as a constant in the policy gradient.
        policy_loss = -(logprobs * advantages.detach()).mean()

        return policy_loss - self.entropy_loss_coef * entropy_loss + self.value_loss_coef * value_loss, \
    (policy_loss, entropy_loss, value_loss)

In [9]:
# Two independent learners, one per player; each observation plane of the
# 8x8 grid has 64 entries. Module.to() returns the module itself.
agent1 = HnSagent(64).to(device=cuda)
opt1 = torch.optim.Adam(agent1.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-8)

agent2 = HnSagent(64).to(device=cuda)
opt2 = torch.optim.Adam(agent2.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-8)

# Per-agent rollout buffers; each entry is [action, probs, value, reward].
buffer_ag_1, buffer_ag_2 = [], []
# Final reward of every game played since the last update.
reward_buf = []
# Per-game counters of collected resources, split by whose goal they were.
self_coins_collected1, self_coins_collected2 = [], []
other_coins_collected1, other_coins_collected2 = [], []
neither_coins_collected1, neither_coins_collected2 = [], []

def sample_from_buffer(buffer, batch_size=10):
    """Draw ``batch_size`` distinct transitions from ``buffer`` at random.

    Each transition is indexable as ``(input, action, reward)``. Returns the
    sampled inputs and rewards as float tensors and the actions as a long
    tensor, all in the same sampled order.
    """
    states, chosen, returns = [], [], []
    for transition in random.sample(buffer, batch_size):
        states.append(transition[0])
        chosen.append(transition[1])
        returns.append(transition[2])

    return torch.FloatTensor(states), torch.LongTensor(chosen), torch.FloatTensor(returns)

def sample_buffer(buffer):
    """Stack a rollout buffer into batched tensors for the A2C loss.

    Every entry of ``buffer`` is ``[action, probs, value, reward]``.

    Returns
    -------
    actions : LongTensor, shape (N,)
    probs : Tensor, shape (N, n_actions), still attached to the autograd graph
    values : Tensor, shape (N,), still attached to the autograd graph
    rewards : FloatTensor, shape (N,)
    """
    # Flatten every per-step tensor to 1-D so that all outputs share the
    # leading sample axis and cannot broadcast against each other by accident.
    actions = torch.cat([entry[0].reshape(-1) for entry in buffer])
    probs = torch.cat([entry[1] for entry in buffer])
    # Concatenate the critic outputs instead of rebuilding them with
    # torch.LongTensor: that truncated the values to integers and detached
    # them from the graph, so the value head never received a gradient.
    values = torch.cat([entry[2].reshape(-1) for entry in buffer])
    rewards = torch.cat([torch.as_tensor(entry[3], dtype=torch.float32).reshape(-1)
                         for entry in buffer])

    return actions, probs, values, rewards

# Main coin-game training loop. Every episode plays `batch_size // 10` games of
# 10 steps per agent, then performs one A2C update per agent on the collected
# transitions. While playing, each agent also refines an estimate of the other
# agent's goal ("SOM" update) by gradient descent on a 3-element goal vector.
for episode in range(1, 1000001):

    # Recurrent state used to start the first game of the episode.
    # NOTE(review): later games of the same episode start from the final LSTM
    # state of the previous game rather than from zeros - confirm intended.
    hx11, cx11 = torch.zeros(1, 1, 64), torch.zeros(1, 1, 64)

    hx21, cx21 = torch.zeros(1, 1, 64), torch.zeros(1, 1, 64)

    for game in range(batch_size // 10):
        hx1 = hx11.detach().to(device=cuda)
        cx1 = cx11.detach().to(device=cuda)

        hx2 = hx21.detach().to(device=cuda)
        cx2 = cx21.detach().to(device=cuda)

        # NOTE(review): session_saver keeps every map of every game and is
        # never trimmed, so it grows for the whole run.
        session_saver.append([])
        environment = env(8, 20, seed=seeds[game % 5])
        session_saver[-1].append(environment.map.tolist())

        # Agent 1: uniform initial belief about the other agent's goal
        # (trainable) and a fixed one-hot encoding of its own goal.
        x_other1 = torch.FloatTensor([1] * 3).to(device=cuda) / 3
        x_other1.requires_grad_(True)
        opt_other1 = torch.optim.SGD([x_other1], lr=0.1)
        x_self1 = [0] * 3
        x_self1[environment.agents_goals[0]] = 1
        x_self1 = torch.FloatTensor(x_self1).to(device=cuda)
        x_self1.requires_grad_(False)

        # Agent 2: same construction.
        x_other2 = torch.FloatTensor([1] * 3).to(device=cuda) / 3
        x_other2.requires_grad_(True)
        opt_other2 = torch.optim.SGD([x_other2], lr=0.1)
        x_self2 = [0] * 3
        x_self2[environment.agents_goals[1]] = 1
        x_self2 = torch.FloatTensor(x_self2).to(device=cuda)
        x_self2.requires_grad_(False)


        action1, action2 = 0, 0

        seq1 = []
        seq2 = []

        for step in range(10):
            #Make action
            state1 = environment.observation(0, with_map=0)
            input1 = torch.cat([torch.Tensor(state1).to(device=cuda), x_self1, x_other1]).to(device=cuda)
            probs1, values1, (hx11, cx11) = agent1.forward((input1, (hx1, cx1)))
            action1 = torch.distributions.Categorical(probs1).sample()
            seq1.append([torch.LongTensor([action1]), probs1, values1])


            state2 = environment.observation(1, with_map=0)
            input2 = torch.cat([torch.Tensor(state2).to(device=cuda), x_self2, x_other2]).to(device=cuda)
            probs2, values2, (hx21, cx21) = agent2.forward((input2, (hx2, cx2)))
            action2 = torch.distributions.Categorical(probs2).sample()
            seq2.append([torch.LongTensor([action2]), probs2, values2])

            #Update SOM
            # Agent 1 runs its own network on the other agent's observation,
            # with the goal slots swapped, and moves x_other1 so that the
            # action the other agent actually took becomes more likely.
            # The network already outputs probabilities, so the negative
            # log-likelihood is taken on their log; cross_entropy expects raw
            # logits and would apply a second softmax.
            # These backward passes also write into the agents' parameter
            # gradients; those are cleared by zero_grad() before the A2C step.
            opt_other1.zero_grad()
            input_other1 = torch.cat([torch.Tensor(state2).to(device=cuda), x_other1, x_self1]).to(device=cuda)
            som_probs1, _, _ = agent1.forward((input_other1, (hx2, cx2)))
            som_target1 = torch.LongTensor([action2.item()]).to(device=cuda)
            loss = torch.nn.functional.nll_loss(torch.log(som_probs1.view(1, 5).clamp_min(1e-8)), som_target1)
            loss.backward()
            opt_other1.step()

            opt_other2.zero_grad()
            input_other2 = torch.cat([torch.Tensor(state1).to(device=cuda), x_other2, x_self2]).to(device=cuda)
            som_probs2, _, _ = agent2.forward((input_other2, (hx1, cx1)))
            som_target2 = torch.LongTensor([action1.item()]).to(device=cuda)
            loss = torch.nn.functional.nll_loss(torch.log(som_probs2.view(1, 5).clamp_min(1e-8)), som_target2)
            loss.backward()
            opt_other2.step()


            #Add SA to sequences
            environment.step(0, action1.item())
            session_saver[-1].append(environment.map.tolist())

            environment.step(1, action2.item())
            session_saver[-1].append(environment.map.tolist())

            # Carry the recurrent state to the next step without its graph.
            hx1, cx1 = hx11.clone().detach(), cx11.clone().detach()
            hx2, cx2 = hx21.clone().detach(), cx21.clone().detach()

        #Add sequences to buffer
        reward = environment.reward()
        reward_buf.append(reward)
        self_coins_collected1.append(environment.agents_self_goal_collected[0])
        self_coins_collected2.append(environment.agents_self_goal_collected[1])
        other_coins_collected1.append(environment.agents_othr_goal_collected[0])
        other_coins_collected2.append(environment.agents_othr_goal_collected[1])
        neither_coins_collected1.append(environment.agents_nthr_goal_collected[0])
        neither_coins_collected2.append(environment.agents_nthr_goal_collected[1])

        # Walk the game backwards: the last step is credited with the full
        # final reward and every earlier step with 0.99 times the next one.
        for i in range(10):
            buffer_ag_1.append(seq1[-i - 1] + [torch.FloatTensor([[reward]])])
            buffer_ag_2.append(seq2[-i - 1] + [torch.FloatTensor([[reward]])])
            reward *= 0.99

    #A2C update
    if(len(buffer_ag_1) >= batch_size and len(buffer_ag_2) >= batch_size):

        print(np.mean(reward_buf))

        actions, probs, values, rewards = sample_buffer(buffer_ag_1)
        opt1.zero_grad()
        loss1, (policy_loss1, entropy_loss1, value_loss1) = agent1.get_a2c_loss(actions.to(device=cuda), probs.to(device=cuda), \
                                    values.to(device=cuda), rewards.to(device=cuda))
        print(loss1.item(), ' ', end='')
        loss1.backward()
        opt1.step()

        actions, probs, values, rewards = sample_buffer(buffer_ag_2)
        opt2.zero_grad()
        loss2, (policy_loss2, entropy_loss2, value_loss2) = agent2.get_a2c_loss(actions.to(device=cuda), probs.to(device=cuda), \
                                    values.to(device=cuda), rewards.to(device=cuda))
        print(loss2.item(), end='\n')
        loss2.backward()
        opt2.step()

        torch.save(agent1.state_dict(), "agent1 "+str(batch_size)+run_name)
        torch.save(agent2.state_dict(), "agent2 "+str(batch_size)+run_name)

        # Log plain Python numbers (.item()) rather than tensors.
        wandb.log({"loss1": loss1.item(), "policy_loss1": policy_loss1.item(), "entropy_loss1": entropy_loss1.item(), \
                   "value_loss1": value_loss1.item(), "loss2": loss2.item(), "policy_loss2": policy_loss2.item(), \
                   "entropy_loss2": entropy_loss2.item(), "value_loss2": value_loss2.item(), \
                   "reward": np.mean(reward_buf), "episode": episode, "games": episode * batch_size // 10, \
                   "self_coins1": np.mean(self_coins_collected1), "self_coins2": np.mean(self_coins_collected2), "other_coins1": np.mean(other_coins_collected1), \
                   "other_coins2": np.mean(other_coins_collected2), "neither_coins1": np.mean(neither_coins_collected1), "neither_coins2": np.mean(neither_coins_collected2)})

        # The data is on-policy: drop it after the update.
        buffer_ag_1 = []
        buffer_ag_2 = []
        reward_buf = []
        self_coins_collected1 = []
        self_coins_collected2 = []
        other_coins_collected1 = []
        other_coins_collected2 = []
        neither_coins_collected1 = []
        neither_coins_collected2 = []

import pickle

# Persist the recorded sessions; `with` closes the file even if dumping fails.
with open('save ' + str(batch_size)+run_name, 'wb') as ftw:
    pickle.dump(session_saver, file=ftw)



0.8
2.0938758850097656  2.1155762672424316


Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Wandb version 0.9.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


1.4
4.266401767730713  4.150779724121094
0.0
0.1536354273557663  0.23545604944229126
0.0
0.5105471611022949  0.33739542961120605
2.4
10.896157264709473  11.399492263793945
-0.8
1.4098533391952515  1.2395509481430054
1.2
3.8096203804016113  3.4792232513427734
1.4
3.9025158882141113  4.035610675811768
0.6
1.7245248556137085  1.7290774583816528
2.0
7.464557647705078  7.410544395446777
0.2
0.3281417191028595  0.36332327127456665
0.0
0.32055866718292236  0.4121289849281311
0.8
3.2368292808532715  2.8832123279571533
-0.4
0.34880149364471436  0.3190575838088989
-0.2
-0.1652325987815857  -0.13350245356559753
-0.2
-0.23054540157318115  -0.25158804655075073
1.0
8.575150489807129  8.851548194885254
0.8
2.5563738346099854  2.7422871589660645
1.4
9.394455909729004  9.274301528930664
0.2
0.5123145580291748  0.4966003894805908
0.8
2.7414469718933105  2.8431475162506104
-0.4
-0.0072577595710754395  -0.09229332208633423
0.4
0.7575037479400635  0.7629228830337524
1.8
6.670483589172363  6.499204635620117

KeyboardInterrupt: 

In [None]:
!pip install gym


In [1]:
import gym

In [10]:
# NOTE: this rebinds the name `env`, which earlier in the notebook refers to the
# coin-game environment class. Re-running the coin-game training cell after this
# cell fails until the cell defining that class has been executed again.
env = gym.make("CartPole-v0")

In [11]:
# Start a separate W&B run, in its own project, for the CartPole A2C experiment.
wandb.init(entity = "dexfrost89", project="a2c cartpole")

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Wandb version 0.8.31 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


W&B Run: https://app.wandb.ai/dexfrost89/a2c+cartpole/runs/n8gdww5s

In [15]:
def sample_buffer(buffer):
    """Stack a rollout buffer into batched tensors for the A2C loss.

    Every entry of ``buffer`` is ``[action, probs, value, reward]``.

    Returns
    -------
    actions : LongTensor, shape (N,)
    probs : Tensor, shape (N, n_actions), still attached to the autograd graph
    values : Tensor, shape (N,), still attached to the autograd graph
    rewards : FloatTensor, shape (N,)
    """
    # Flatten every per-step tensor to 1-D so that all outputs share the
    # leading sample axis and cannot broadcast against each other by accident.
    actions = torch.cat([entry[0].reshape(-1) for entry in buffer])
    probs = torch.cat([entry[1] for entry in buffer])
    # Concatenate the critic outputs instead of rebuilding them with
    # torch.LongTensor: that truncated the values to integers and detached
    # them from the graph, so the value head never received a gradient.
    values = torch.cat([entry[2].reshape(-1) for entry in buffer])
    rewards = torch.cat([torch.as_tensor(entry[3], dtype=torch.float32).reshape(-1)
                         for entry in buffer])

    return actions, probs, values, rewards
# A2C on CartPole: 4 observation features in, one probability per action out.
agent = Ffunction(4)
# Replace the policy head so its width matches the environment's action count;
# it is initialised the same way as the other heads (orthogonal weights, zero
# bias). This is done before the optimizer is built so the new head is trained.
agent.policy = nn.Linear(agent.policy.in_features, env.action_space.n)
nn.init.orthogonal_(agent.policy.weight)
agent.policy.bias.data.fill_(0)
opt = torch.optim.Adam(agent.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1 * 10 ** -8)

# Transitions [action, probs, value, reward] gathered since the last update.
buffer = []

# Undiscounted return of every game played since the last update.
reward_buffer = []

for episode in range(1, 100001):

    for game in range(5):
        # NOTE(review): uses the gym API in which reset() returns only the
        # observation and step() returns four values - confirm the installed
        # gym version matches.
        state = env.reset()
        done = False
        hx, cx = torch.zeros(1, 1, 64), torch.zeros(1, 1, 64)
        reward_sum = 0

        seq = []

        while(not done):
            # Cut the graph between consecutive steps.
            hx = hx.detach()
            cx = cx.detach()

            probs, values, (hx, cx) = agent.forward((torch.Tensor(state), (hx, cx)))
            # Sample from the policy; the result is a LongTensor of shape (1,).
            action = torch.distributions.Categorical(probs).sample()

            next_state, reward, done, _ = env.step(action.item())

            seq.append([action, probs, values])
            state = next_state.copy()
            reward_sum += reward
        reward_buffer.append(reward_sum)

        # Walk the game backwards: the last step is credited with the whole
        # episode return and every earlier step with 0.99 times the next one.
        for i in range(len(seq)):
            buffer.append(seq[-i - 1] + [torch.FloatTensor([[reward_sum]])])
            reward_sum *= 0.99

    actions, probs, values, rewards = sample_buffer(buffer)
    opt.zero_grad()
    # The agent lives on the CPU, so the loss is computed there as well.
    loss, (policy_loss, entropy_loss, value_loss) = agent.get_a2c_loss(actions, probs, values, rewards)
    print(np.mean(reward_buffer), loss.item())
    loss.backward()
    opt.step()

    # Log plain Python numbers (.item()) rather than tensors.
    wandb.log({"loss": loss.item(), "policy_loss": policy_loss.item(), "entropy_loss": entropy_loss.item(), \
                   "value_loss": value_loss.item(), \
                   "reward": np.mean(reward_buffer), "episode": episode})

    # The data is on-policy: drop it after the update.
    buffer = []
    reward_buffer = []
    
            
            
            



24.4
311.75726318359375  

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
wandb: Wandb version 0.8.31 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


27.8
432.4104919433594  21.2
218.507568359375  20.2
223.27023315429688  18.8
283.0455322265625  16.8
159.21231079101562  19.8
281.23919677734375  24.2
301.0773620605469  32.0
535.2586669921875  17.8
178.30532836914062  28.6
527.962890625  16.4
147.72422790527344  18.6
241.95828247070312  21.4
240.04640197753906  14.8
115.19706726074219  23.8
326.62677001953125  17.8
167.13705444335938  23.6
582.0220336914062  26.6
869.0665893554688  18.2
182.2181396484375  17.4
186.6610870361328  28.2
389.4013366699219  26.8
359.4521484375  20.4
326.80145263671875  17.8
164.56654357910156  19.8
208.48512268066406  22.4
346.8782043457031  33.6
533.7503051757812  19.6
206.2809295654297  19.0
205.5078582763672  29.8
589.2324829101562  16.4
139.8122100830078  26.0
294.52227783203125  18.0
163.785888671875  36.2
941.7974853515625  18.4
197.57537841796875  27.2
749.0093994140625  22.0
248.4583282470703  21.4
324.03411865234375  24.0
486.0234680175781  25.8
345.07537841796875  22.2
229.39251708984375  16.8
16

KeyboardInterrupt: 

In [None]:
# Display the game returns currently held in reward_buffer (the training loop
# above empties it at the end of every completed iteration).
reward_buffer

In [None]:
class agent(nn.Module):
    
    def __init__(self):
        self.fullycon = torch.nn.Linear(4, 4)
        
        self.policy = torch.nn.Linear