In [26]:
import pyspiel
import numpy as np
import gym, gym.spaces
import math
import random
import numpy as np
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from BridgeNetwork import *

In [62]:
# Shared OpenSpiel game object; BridgeEnv.generate_random_game() creates the
# initial state of every episode from it.
# NOTE(review): use_double_dummy_result=true presumably scores the final contract
# by double-dummy analysis instead of playing out the cards - confirm against the
# OpenSpiel bridge documentation.
GAME = pyspiel.load_game('bridge(use_double_dummy_result=true)')

class BridgeEnv(gym.Env):
    """Gym-style environment built on a randomly dealt OpenSpiel bridge state.

    The agent hands ``step`` a score vector over the 38 actions that follow the
    52 card-deal action ids. The environment samples one legal action from that
    vector, applies it, and then lets a uniformly random opponent act. An
    episode ends as soon as the underlying state reports ``TERMINAL_PHASE``;
    every earlier step has reward 0.
    """

    # Action ids below this offset are used to deal the cards
    # (see generate_random_game); the agent's actions start right after them.
    ACTION_OFFSET = 52
    # Value of state.current_phase() that step() treats as "episode finished".
    TERMINAL_PHASE = 3

    def __init__(self):
        super(BridgeEnv, self).__init__()
        # Define action and observation space
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(38,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(571,), dtype=np.float32)
        self.reset()

    def reset(self):
        """Deal a fresh random hand and return its first observation."""
        self.state = self.generate_random_game()
        return self._observe()

    def step(self, action):
        """Sample and apply the agent's action, then a random opponent action.

        Args:
            action: score vector with one entry per action in ``action_space``.

        Returns:
            ``(obs, reward, done, info)`` where ``info["action"]`` is the index
            (without ``ACTION_OFFSET``) of the action that was actually sampled.
        """
        action = self.pick_action(action)

        self.state.apply_action(action + self.ACTION_OFFSET)

        if self.state.current_phase() == self.TERMINAL_PHASE:
            return self.calculate_terminal_reward(action)

        # random opposing team
        self.state.apply_action(random.choice(self.state.legal_actions()))

        if self.state.current_phase() == self.TERMINAL_PHASE:
            return self.calculate_terminal_reward(action, invert=True)

        return self.calculate_default_reward(action)

    def calculate_default_reward(self, action):
        """Build the step result for a non-terminal step (reward 0, not done)."""
        obs = self._observe()
        reward = 0
        done = False
        return obs, reward, done, {"action": action}

    def calculate_terminal_reward(self, action, invert=False):
        """Build the step result for the final step of an episode.

        The observation is all zeros and the reward is the state's score for
        the final contract, negated when ``invert`` is true (step() passes
        ``invert=True`` when the opponent's action ended the episode).

        NOTE(review): the sign depends on who made the last call, not on which
        side declares - confirm this matches the perspective of
        ``score_by_contract()``.
        """
        obs = np.zeros(self.observation_space.shape, dtype=np.float32)
        reward = self.state.score_by_contract()[self.state.contract_index()]
        if invert:
            reward = -reward
        done = True
        return obs, reward, done, {"action": action}

    def pick_action(self, action_vector):
        """Sample a legal action index from ``action_vector``.

        The vector is passed through a softmax, illegal actions are zeroed out
        and the remainder is renormalised before sampling.

        NOTE(review): the visible training cell already passes a probability
        vector (it takes ``torch.log`` of it), so this softmax flattens the
        policy a second time - confirm that is intended.

        Raises:
            ValueError: if none of the offered actions is legal.
        """
        n_actions = self.action_space.shape[0]
        # float64 keeps small probabilities from underflowing as early as float32 would.
        probabilities = self.softmax(np.asarray(action_vector, dtype=np.float64))
        first = self.ACTION_OFFSET
        legal_action_mask = np.array(self.state.legal_actions_mask())[first:first + n_actions]

        weighted = probabilities * legal_action_mask
        legal_mass = weighted.sum()
        if legal_mass == 0:
            # Every legal action received (numerically) zero probability.
            # Dividing would yield NaNs, so sample uniformly among legal actions.
            n_legal = legal_action_mask.sum()
            if n_legal == 0:
                raise ValueError("no legal action among the actions offered to the agent")
            weighted = legal_action_mask.astype(np.float64)
            legal_mass = n_legal
        masked_action_vector = weighted / legal_mass
        action = np.random.choice(n_actions, p=masked_action_vector)

        # Diagnostics for a mask / legal_actions() mismatch; not expected to
        # trigger because illegal actions carry probability 0.
        if action + self.ACTION_OFFSET not in self.state.legal_actions():
            print(action + self.ACTION_OFFSET, self.state.legal_actions())
            print(probabilities[:6])
            print(legal_action_mask[:6])
            print((probabilities * legal_action_mask)[:6])
            print(masked_action_vector[:6])

        return action

    def softmax(self, x):
        """Numerically stable softmax of a 1-D score vector."""
        # Subtracting the max keeps np.exp from overflowing.
        y = np.exp(x - np.max(x))
        f_x = y / np.sum(y)
        return f_x

    def generate_random_game(self):
        """Return a new state of ``GAME`` with all 52 cards dealt in random order."""
        state = GAME.new_initial_state()
        # deal all 52 cards randomly
        for i in np.random.choice(52, size=(52,), replace=False):
            state.apply_action(i)
        return state

    def _observe(self):
        """Current observation tensor, cast to the dtype of ``observation_space``."""
        return np.array(self.state.observation_tensor(), dtype=np.float32)
    

In [63]:
# Record type that ReplayMemory.push() builds from its positional arguments.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)


class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of ``Transition`` records."""

    def __init__(self, capacity):
        # A bounded deque drops its oldest entry once `capacity` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored items chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def get(self):
        """Return the underlying deque itself (not a copy)."""
        return self.memory

    def clear(self):
        """Remove every stored item."""
        self.memory.clear()

    def __len__(self):
        """Number of items currently stored."""
        return len(self.memory)

In [64]:
# --- Hyperparameters -------------------------------------------------------
# Only GAMMA and LEARNING_RATE are referenced by the visible training cell;
# the remaining constants are not used by any cell shown here.
BATCH_SIZE = 128
GAMMA = 0.9            # discount applied when accumulating episode returns
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 40
LEARNING_RATE = 1e-4   # Adam step size

device = torch.device("cpu")
env = BridgeEnv()

# Get number of actions from gym action space
n_actions = env.action_space.shape[0]
# Number of observation entries: the product of the dimensions, so the value
# stays correct if the observation space ever becomes multi-dimensional.
n_observations = int(np.prod(env.observation_space.shape))

actor_critic = BridgeActorCritic().to(device)

optimizer = optim.Adam(actor_critic.parameters(), lr = LEARNING_RATE)
memory = ReplayMemory(10000)


steps_done = 0

# Window of the most recent episode rewards used for the running average.
# maxlen makes the deque discard old entries by itself. Re-run this cell
# before training to start the average from an empty window.
trailing_avg_size = 100
trailing_avg_reward = deque(maxlen=trailing_avg_size)

In [71]:
num_episodes = 1000
# Episode rewards are bridge scores (hundreds to thousands of points); they are
# divided by this factor before being turned into learning targets.
REWARD_SCALE = 100

for i_episode in range(num_episodes):
    log_probs = []
    values = []
    rewards = []

    # Initialize the environment and state
    state = env.reset()
    state = torch.from_numpy(state).to(device).float().unsqueeze(0)
    for t in count():
        # Call the module (not .forward) so registered hooks are honoured.
        value, policy_dist = actor_critic(state)

        # The environment samples the action itself and only needs plain numbers.
        dist = policy_dist.detach().squeeze().cpu().numpy()

        new_state, reward, done, metadata = env.step(dist)
        new_state = torch.from_numpy(new_state).to(device).float().unsqueeze(0)

        action = metadata["action"]
        # NOTE(review): the environment samples from a re-softmaxed, legality-masked
        # version of `dist`, while this log-probability comes from the raw policy
        # output - confirm that mismatch is acceptable.
        log_prob = torch.log(policy_dist.squeeze(0)[action])

        rewards.append(reward)
        # Keep the critic output attached to the graph; detaching it here would
        # leave the critic loss without any gradient.
        values.append(value[0, 0])
        log_probs.append(log_prob)

        state = new_state

        if done:
            trailing_avg_reward.append(reward)
            if len(trailing_avg_reward) > trailing_avg_size:
                trailing_avg_reward.popleft()

            print(f"episode #{i_episode}, episode reward: {reward}, avg_reward: {round(np.mean(trailing_avg_reward),2)}, episode length: {t+1}")

            #print(env.state)
            break

    # Discounted return for every step of the episode, accumulated backwards.
    # The episode has ended, so nothing is bootstrapped beyond the last reward.
    returns = []
    running_return = 0.0
    for step_reward in reversed(rewards):
        running_return = step_reward / REWARD_SCALE + GAMMA * running_return
        returns.append(running_return)
    returns.reverse()

    values = torch.stack(values) # values calculated by Critic (with gradient)
    Qvals = torch.tensor(returns, dtype=torch.float32, device=device) # discounted, scaled episode returns
    log_probs = torch.stack(log_probs) # log probability of each move in the episode

    advantage = Qvals - values
    # The actor treats the advantage as a constant weight; only the critic term
    # is allowed to back-propagate through the value estimates.
    actor_loss = (-log_probs * advantage.detach()).mean()
    critic_loss = 0.5 * advantage.pow(2).mean()
    ac_loss = actor_loss + critic_loss

    optimizer.zero_grad()
    ac_loss.backward()
    optimizer.step()


print('Complete')

episode #0, episode reward: 5200, avg_reward: 907.0, episode length: 7
episode #1, episode reward: 1000, avg_reward: 871.0, episode length: 7
episode #2, episode reward: 400, avg_reward: 869.0, episode length: 7
episode #3, episode reward: -2900, avg_reward: 838.0, episode length: 5
episode #4, episode reward: 1600, avg_reward: 850.5, episode length: 7
episode #5, episode reward: 550, avg_reward: 852.5, episode length: 5
episode #6, episode reward: 550, avg_reward: 856.0, episode length: 4
episode #7, episode reward: 250, avg_reward: 854.5, episode length: 3
episode #8, episode reward: 2200, avg_reward: 866.5, episode length: 6
episode #9, episode reward: 4000, avg_reward: 890.5, episode length: 7
episode #10, episode reward: 500, avg_reward: 837.5, episode length: 4
episode #11, episode reward: 100, avg_reward: 832.0, episode length: 4
episode #12, episode reward: 4000, avg_reward: 869.5, episode length: 5
episode #13, episode reward: -300, avg_reward: 832.5, episode length: 8
episode

KeyboardInterrupt: 