In [17]:
import math
import os
import seaborn as sns

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from math import ceil, floor
from collections import deque
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import Normal
from scipy.stats import rv_discrete


In [18]:
class Memory():
    
    def __init__(self):
        self.reset()
        self.all_rewards = []
    
    def reset(self):
        self.states = []
        self.actions = []
        self.log_probs = []
        self.mexs = []
        self.log_probs_mexs = []
        self.rewards = []
        
    def push(self, s, a, lp):
        self.states.append(s)        
        self.actions.append(a)        
        self.log_probs.append(lp)
        
    def push_comm(self, s, a, lp):
        self.states.append(s)        
        self.mexs.append(a)        
        self.log_probs_mexs.append(lp)
        
    def save_reward(self, r):
        self.rewards.append(r)        
        self.all_rewards.append(r)

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class CommNet(nn.Module):
    def __init__(self, n_coins, lr=0.01):
        super(CommNet, self).__init__()
        
        self.n_coins = n_coins
        self.lr = lr
        self.n_actions = 2 # 0 non do soldi, 1 do soldi
        
        h_size = int(n_coins)
        a_size = self.n_actions
        self.fc1 = nn.Linear(n_coins+1, h_size)
        self.fc2 = nn.Linear(h_size, a_size)
        
        self.opt = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x): #entrata: n coins che ho
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)

        n = out.size()
        mask = torch.zeros(n).to(device)

        return out

In [20]:
class AgentNet(nn.Module):
    def __init__(self, n_players, n_coins, episode_len, n_actions=2, lr = 0.01, _gamma = 0.99):
        super(AgentNet, self).__init__()
        
        self.n_players = n_players
        self.n_coins = n_coins
        self.n_actions = n_actions # 0 non do soldi, 1 do soldi
        self.episode_len = episode_len
        
        h_size = int(n_coins)
        a_size = self.n_actions
        self.fc1 = nn.Linear(n_players+1, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

        self.commnet = CommNet(self.n_coins)
        
        self._gamma = _gamma
        
        self.memory = Memory()
        self.opt = optim.Adam(self.parameters(), lr=lr)
        
        self.reset()
        
    def reset(self):
        self.memory.reset()

    def forward(self, x):
        out = self.fc1(x)
        out = F.relu(out)
        out = self.fc2(out)
        
        return out
    
    def act(self, coin, messages): 

        coin = torch.Tensor([coin])
        #coins = np.array(int(coins))
        
        state = torch.cat((coin, messages), 0).to(device).unsqueeze(dim=0)
        
        #state = torch.from_numpy(state).unsqueeze(dim=0).to(device)
        #print("state=", state)

        probs_total = self.forward(state).cpu()
  
        probs_total = F.softmax(probs_total, dim=1)
        
        m = Categorical(probs_total)

        action = m.sample()
        
        log_prob = m.log_prob(action)
        
        self.memorize(state, action, log_prob)

        return action, log_prob, probs_total

    def comm(self, coin):
        
        state = np.array(int(coin))

        state = torch.from_numpy(state).unsqueeze(dim=0).to(device)
        state = F.one_hot(state, num_classes=self.n_coins+1).float()
        
        probs_total = self.commnet.forward(state).cpu()
        probs_total = F.softmax(probs_total, dim=1)
        
        m = Categorical(probs_total)

        action = m.sample()

        log_prob = m.log_prob(action)
            
        self.memorize_comm(state, action, log_prob)

        return action, log_prob, probs_total
    
    def memorize(self, s, a, lp):
        self.memory.push(s, a, lp)
        
    def memorize_comm(self, s, a, lp):
        self.memory.push_comm(s, a, lp)
    
    def memorize_rewards(self, r):
        self.memory.save_reward(r)
    
    def compute_G(self):
        self.G = np.zeros((self.episode_len))
        cumulative_rewards = 0
        
        for t in reversed(range(0, self.episode_len)):
            cumulative_rewards = cumulative_rewards * self._gamma + self.memory.rewards[t]
            self.G[t] = cumulative_rewards
        return self.G

In [21]:
n_players = 3
n_coins = 15
messages = torch.Tensor((0,0,0))
my_coins = torch.Tensor([3])
ag = AgentNet(n_players = n_players, n_coins = n_coins, episode_len=1)
ag.act(my_coins, messages)

(tensor([1]),
 tensor([-0.8742], grad_fn=<SqueezeBackward1>),
 tensor([[0.5828, 0.4172]], grad_fn=<SoftmaxBackward>))

In [22]:
class Environment:

    def __init__(self, n_players, n_total_coins, threshold = 0., multiplier = 1.):
        self.n_players = n_players
        self.n_total_coins = n_total_coins
        self.state = np.zeros((self.n_players,1))
        self.threshold = threshold
        self.multiplier = multiplier
        self.rews_dict = {k: [] for k in range(self.n_players)}

    def reset(self):
        self.state = np.zeros(self.n_players)
        self.rews_dict = {k: [] for k in range(self.n_players)}
        
    def equal_division(self):
        self.coins = [int(self.n_total_coins/self.n_players) for i in range(self.n_players)]
        return self.coins
    
    def update_rews_dict(self, rew, t):
        for i in range(self.n_players):
            self.rews_dict[i].append(rew[i])
            
    def sample(self):
        left_coins = self.n_total_coins
        self.coins = []
        
        for i in range(self.n_players):
            if (i == self.n_players - 1):
                self.coins.append(left_coins)
            else:
                val = int(np.random.choice(left_coins-1, 1))
                self.coins.append(val)
                left_coins = left_coins - val 
                
        return self.coins

    def step(self, coins, actions):
        rewards = np.zeros(self.n_players)
        
        if (np.sum(actions)) < self.threshold:
            return rewards
        
        amount = np.sum([coins[i] for i in range(self.n_players) if actions[i] == 1])
            
        for i in range(self.n_players):
            if (actions[i] == 1.):
                rewards[i] = amount/self.n_players*self.multiplier
            else:
                rewards[i] = amount/self.n_players*self.multiplier + coins[i]
                
            # trick to avoid log(0) 
            if rewards[i] <= 0:
                rewards[i] = 1e-6

        return rewards

In [49]:
class System():
    
    def __init__(self, n_players, n_coins, episode_len, epsilon=0.1, n_actions=2, lr=0.01, _gamma=0.99):
        
        self.n_players = n_players
        self.n_coins = n_coins
        self.n_actions = n_actions # 0 non do soldi, 1 do soldi
        self.episode_len = episode_len
        self.lr = lr
        
        self.agents = {k: AgentNet(n_players, n_coins, episode_len=episode_len) for k in range(self.n_players)}

    def reset(self):
        for _, ag in self.agents.items(): ag.reset()
            
    def comm(self, coins):
        mexs = torch.empty(self.n_players)
        log_probs_mexs = np.zeros(self.n_players)
        
        for i, agent in self.agents.items():
            m, log_prob, _ = agent.comm(coins[i])
            mexs[i] = m.detach().numpy()[0]
            log_probs_mexs[i] = log_prob
            
        return mexs, log_probs_mexs
            
    def act(self, coins, mexs):
        actions = np.zeros(self.n_players)
        log_probs = np.zeros(self.n_players)
        
        for i, agent in self.agents.items():
            a, log_prob, _ = agent.act(coins[i], mexs)
            actions[i] = a.detach().numpy()[0]
            log_probs[i] = log_prob
            
        return actions, log_probs
    
    def memorize_rewards(self, rews):
        for idx, ag in self.agents.items(): ag.memorize_rewards(rews[idx])
            
    def reinforce(self):
        self.losses = []
        for agent_idx, agent in self.agents.items():
            #print("\nagent=", agent_idx)
            agent.compute_G()
            #print("ag.G=",agent.G)
            loss = 0
            for G, log_prob in zip(agent.G, agent.memory.log_probs):
                loss -= log_prob*G
                print(log_prob._version)
            print("loss=", loss, loss._version)
            
            self.losses.append(loss.detach().numpy()[0])
            
            agent.opt.zero_grad()
            loss.backward()
            agent.opt.step()
            
    def reinforce_comm(self):
        self.losses_comm = []
        for agent_idx, agent in self.agents.items():
            #print("\nagent=", agent_idx)
            #agent.compute_G()
            loss = 0
            for G, log_prob in zip(agent.G, agent.memory.log_probs_mexs):
                loss -= log_prob*G
            print("comm loss=", loss, loss._version)
            
            self.losses_comm.append(loss.detach().numpy()[0])
            agent.commnet.opt.zero_grad()
            print(agent.commnet.parameters()[0]._version)
            loss.backward()
            agent.commnet.opt.step()

In [50]:
def reinforce_players_communication(n_players=5, n_coins=10, n_episodes=1, episode_len=10, epsilon=0.1, \
                              threshold=0., multiplier=1., gamma=0.9):
    
    torch.autograd.set_detect_anomaly(True)
    
    df = pd.DataFrame(columns=['episode', 't', 'loss0', 'loss1', 'loss2'])
                      
    env = Environment(n_players, n_coins, threshold, multiplier)
    system = System(n_players, n_coins, episode_len, epsilon)
    
    history_rewards = {k: [] for k in range(n_players)}

    line = 0
    
    for e in range(n_episodes):
        #print("\nEpisode=", e)
        
        env.reset()
        system.reset()
        
        if ( e == int(n_episodes/2) ):
            for agent_idx, ag in system.agents.items():
                ag.epsilon = 0.1
        if (e == int(n_episodes - 50)):
            ag.epsilon = 0.0001
                
        for t in range(0, episode_len):
            
            coins = env.equal_division()
            
            print("\ncoins=", coins)

            mex, log_prob_mex = system.comm(coins)
            print("mexs=", mex)
            
            act, log_prob = system.act(coins, mex)
            print("acts=", act)
            
            rews = env.step(coins, act)
            print("rews=", rews)
            
            env.update_rews_dict(rews, t)
            
            system.memorize_rewards(rews)
            
        #print("act=", act)
        #print("rews=", rews)
        
        system.reinforce()
        system.reinforce_comm()
                  
        df.loc[line] = [e, t] + system.losses
        line += 1

        
        if (e%100 == 0):
            print("==> Episode=", e)
            print(system.losses)
            #print("epsilon=", system.agents[0].epsilon)
            #print("act=", act)
            #print("rews=", rews)
            
    return df, history_rewards, system.agents

In [51]:
n_episodes = 2
episode_len = 1

n_players = 3
n_coins = 10

threshold = 0.
multiplier = float(n_players)/2.+3.

history, agents, hr = reinforce_players_communication(n_players=n_players, n_coins=n_coins, n_episodes=n_episodes, episode_len=episode_len, threshold = threshold, multiplier=multiplier)


coins= [3, 3, 3]
mexs= tensor([0., 0., 1.])
acts= [1. 1. 0.]
rews= [ 9.  9. 12.]
0
loss= tensor([5.5930], grad_fn=<RsubBackward1>) 0
0
loss= tensor([7.4820], grad_fn=<RsubBackward1>) 0
0
loss= tensor([11.7344], grad_fn=<RsubBackward1>) 0
comm loss= tensor([5.6571], grad_fn=<RsubBackward1>) 0


AttributeError: 'Adam' object has no attribute '_version'

In [None]:
agents

In [None]:
figure, ax = plt.subplots(1, n_players, figsize=(16, 4))

for i in range(n_players):
    ax[i].plot(np.linspace(0, n_episodes*episode_len, n_episodes*episode_len), agents[i].memory.all_rewards, label='reward agent'+str(i))
    ax[i].legend()
    ax[i].grid()
plt.show()

In [None]:
figure, ax = plt.subplots(1, n_players, figsize=(16, 4))

for i in range(n_players):
    ax[i].plot(np.linspace(0, n_episodes, n_episodes), df['loss'+str(i)], label='loss agent'+str(i))
    ax[i].legend()
    ax[i].grid()
plt.show()

In [None]:
probbs = np.zeros((n_coins, 2))

for c in range(n_coins):
    a, log_prob, probs = agents[0].act(np.array(c))
    probbs[c] = probs.detach().numpy()
    
ax = sns.heatmap(probbs, annot=True, linewidth=.5)

In [None]:
probbs = np.zeros((n_coins, 2))

for c in range(n_coins):
    a, log_prob, probs = agents[0].act(np.array(c))
    probbs[c] = probs.detach().numpy()
    
ax = sns.heatmap(probbs, annot=True, linewidth=.5)

In [None]:
probbs = np.zeros((n_coins, 2))

for c in range(n_coins):
    a, log_prob, probs = agents[0].act(np.array(c))
    probbs[c] = probs.detach().numpy()
    
ax = sns.heatmap(probbs, annot=True, linewidth=.5)