<a href="https://colab.research.google.com/github/martacor2/corellian_spike/blob/main/RL_with_Corellian_Spike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Necessary imports

It is only necessary to run the second cell if you will be importing files from Google Drive and you are using Google Colab to run this notebook.

In [1]:
import random
import numpy as np
import pandas as pd

from numpy.lib.type_check import nan_to_num
import json
from statistics import mode

import os

import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Simulator Code

In [None]:
from traitlets.config.loader import KeyValueConfigLoader
class CorellianSpike:
  def __init__(self,p,n=2,human=False, epsilon = 0, win_credits=True, connect=False):
    self.n = n # Number of players
    self.human = human
    self.win_credits = win_credits
    self.connect = connect

    self.p = p # Policy
    
    self.homogeneous = type(self.p) == dict
    self.last_played = 0 # tracks who was the player who finished the betting phase for recording purposes 
    self.final_bet = 0 # tracks the min bet for recording purposes
    self.deck = [0,0]
    self.folded_players = []
    for i in range(-10, 11):
      self.deck += [i] * 3
    random.shuffle(self.deck)
    self.discard = []
    self.f = open('spike_data.txt','a')
    self.epsilon = epsilon
    self.states = set()

    # Betting actions: 0 = raise, 1 = match, 2 = fold
    # Buying a card: 3 = buy, 4 = don't buy
    # Next card action: 5 = discard, 6 = swap w/ spike, 7 = swap w/ hand

    self.end = False
    
    self.last_folded = []
    self.pool = 0
    self.last_gain = 0
    self.players = [{'hand':[], 'spike':[], 'extra_card':[], 'money':50, 'folded': False, 'win':False, 'last_state':0, 'last_action':0, 'last_min_bet':0, 'last_reward':0} for i in range(self.n)]

  def run_hand(self,hand): # izzie
    """
    This function runs a hand of Corellian Spike.
    No inputs or outputs, just updates tracked variables.
    """
    self.round = 0
    # Each player buys in with 2 credits
    for player in range(self.n):
      self.players[player]['money'] -= 2
      self.pool += 2
    self.deal_card()
    if self.human:
      print("Your starting hand is:",self.players[0]['hand'])

    if self.connect and hand != 0 and hand != 9:
      for player in range(self.n):
        if player not in self.last_folded:
          self.record(player, connect_time = True)
          self.players[player]['win'] = False

      
    self.bet(self.n-1)
    self.round += 1
    self.deal_spike()
    for p in range(self.n):
      if p not in self.folded_players:
        s = self.players[p]['last_state']
        sp = self.get_state(p,2)
        self.record(p,s,action=self.players[p]['last_action'],decision=2,r=self.players[p]['last_reward'],sp=sp,min_bet=self.final_bet)
    if self.human:
      print("Your spike card is:",self.players[0]['spike'])
      for player in range(1,self.n):
        print("Player",player,"has this spike card:",self.players[player]['spike'])
    for round in range(3):
      #print("NEW ROUND")
      if len(self.deck) < self.n:
        random.shuffle(self.discard)
        self.deck += self.discard
        self.discard = []
      self.buy_card()
      self.bet(self.n-1)
      self.round += 1
      if len(self.deck) < 3*self.n:
        random.shuffle(self.discard)
        self.deck += self.discard
        self.discard = []
      self.spike_dice()
      if self.human:
        print(" ")

    if self.human:
      print("Your final cards:",self.players[0]['hand'],self.players[0]['spike'])
      for player in range(1,self.n):
        print("Player",player,"final cards:",self.players[player]['hand'],self.players[player]['spike'])

    self.last_gain = self.pool
    self.evaluate_hands()
    for player in range(self.n):
      if player not in self.folded_players and not self.players[player]['win']:
        self.record(player)
    if self.human:
      if self.players[0]['win']:
        print("You won the hand!")
      else:
        print("You lost the hand. :(")
      print(" ")
      print("---")
      print(" ")

  def play_sabacc(self, k=10): #Izzie
    """
    This function has the players play for k rounds or until someone loses all their money.
    It also resets the 'folded' boolean after each hand.
    
    Returns: nothing
    """
    win_counts = np.zeros(self.n)
    for hand in range(k):
      if hand == k-1:
        self.end = True

      #print("HAND #"+str(hand))
      # Reset the deck
      self.deck = [0,0]
      for i in range(-10, 11):
        self.deck += [i] * 3
      random.shuffle(self.deck)
      self.discard = []
      lost = False

      # Play a hand
      if self.human:
        print("HAND",hand)
        print("You have",self.players[0]['money'],"credits.")
      self.run_hand(hand)
      # Reset folding and cards figure out if anyone lost all their money
      self.pool = 0
      for player in range(self.n):
        if self.players[player]['win']:
          win_counts[player] += 1
        self.players[player]['folded'] = False
        self.players[player]['hand'] = []
        self.players[player]['spike'] = []
        self.players[player]['extra_card'] = []
        self.last_folded = self.folded_players.copy()
        self.folded_players = []
        if self.players[player]['money'] <= 0:
          # self.players[player]['folded'] = True
          lost = True
      if lost:
        break
    self.end = False
    self.f.close()
    return win_counts

  def deal_card(self):  
    #range 2 to deal two cards
    for i in range(2):
      for j in range(self.n):
        if j in self.folded_players:
          pass
        else:
          card = self.deck[0]
          #assign card to a player
          self.players[j]['hand'].append(card)
          self.deck.pop(0)

  def bet(self, last_player, player=0, min_bet=0): # izzie
    """
    This is a recursive betting function.
    No outputs, only updates tracked variables.

    Inputs:
     - Which player's turn it is to bet
     - What the current minimum bet is (or they have to fold)
    """
    # Stop running this if everyone has folded
    if len(self.folded_players) == self.n:
      return 2
    # Skip the player if they have folded
    if not self.players[player]['folded']:
      # All in: player remains in game if no money. (Just skip them like they folded.)
      if self.players[player]['money'] <= 0:
        if player != last_player and self.n - len(self.folded_players) > 1:
          return self.bet(player=(player+1)%self.n,last_player=last_player,min_bet=min_bet)
        else:
          return 2
      else:
        s = self.get_state(player,1)
        if self.human and player==0:
          action = None
          print("The minimum bet is",min_bet,"credits. You have",self.players[0]['money'],"credits.")
          while not (action == 0 or action == 1 or action == 2):
            try:
              action = int(input("Type 0 to raise, 1 to match or stand, or 2 to fold: "))
            except:
              print("An error occurred, try again.")
        else:
          if self.homogeneous:
            if np.random.uniform() < self.epsilon and self.epsilon !=0:
              action = np.random.randint(0,2) # Look up this value
            else: #if epsilon is indeed 0, or the random value from a uniform distribution is greater than epsilon
              if s in self.p.keys():
                action = self.p[s] # Look up this value
              else:
                action = 1
          else:
            if np.random.uniform() < self.epsilon and self.epsilon !=0:
              action = np.random.randint(0,2) # Look up this value
            else: #if epsilon is indeed 0, or the random value from a uniform distribution is greater than epsilon
              if s in self.p[player].keys():
                action = self.p[player][s]
              else:
                action = 1
          if self.human:
            print("Player",player,"took this action:",action)
        if self.n - len(self.folded_players) == 1: # Force this thing to match if it's the only one playing
          action = 1
        self.players[player]['last_action'] = action
        self.players[player]['last_min_bet'] = min_bet
        self.players[player]['last_state'] = s
        # If raising
        if action == 0:
          self.pool += 2
          self.players[player]['money'] -= 2
          m = self.players[player]['money']
          p = self.pool
          # Recursively call function, last player to bet is the player directly before you
          dec = self.bet(player=(player+1)%self.n,last_player=(player-1)%self.n,min_bet=2)
          if dec!=2: # Only record if continuing betting
            sp = self.get_state(player,dec,passing=True,money=m,pool=p)
            self.record(player,s,action=action,decision=1,sp=sp,min_bet=2)
          else:
            self.players[player]['last_reward'] = self.get_reward(player,1,action,2)
            self.last_played = player
            self.final_bet = 2
          return 1
        # If matching
        elif action == 1:
          self.pool += min_bet
          self.players[player]['money'] -= min_bet
          m = self.players[player]['money']
          p = self.pool
          # Only call recursively if not the last person
          if player != last_player:
            dec = self.bet(player=(player+1)%self.n,last_player=last_player,min_bet=min_bet)
          else:
            dec = 2
          if dec!=2: # Only record if continuing betting
            sp = self.get_state(player,dec,passing=True,money=m,pool=p)
            self.record(player,s,action=action,decision=1,sp=sp,min_bet=min_bet)
          else:
            #self.players[player]['last_state'] = self.get_state(player,1)
            self.players[player]['last_reward'] = self.get_reward(player,1,action,min_bet)
            self.last_played = player
            self.final_bet = min_bet
          return dec
        # If folding
        else:
          self.players[player]['folded'] = True
          self.folded_players.append(player)
          # Stay in the same state, never will transition
          self.players[player]['last_reward'] = self.get_reward(player,1,action,min_bet)
          if player != last_player:
            dec = self.bet(player=(player+1)%self.n,last_player=last_player,min_bet=min_bet)
          else:
            dec = 2
          self.record(player,s,action=action,sp=0)
          return dec
    else:
      if player == last_player:
        return 2 # Don't keep playing if the last player has already folded
      else:
        return self.bet(player=(player+1)%self.n,last_player=last_player,min_bet=min_bet)

  def deal_spike(self): 
    for j in range(self.n):
      if j in self.folded_players:
        pass
      else:
        card = self.deck[0]
        #assign card to a player
        self.players[j]['spike'].append(card)
        self.deck.pop(0)

  def buy_card(self): # max
    for j in range(self.n):
      if j in self.folded_players:
        pass
      else:
        self.players[j]['hand'].sort()
        s = self.get_state(j,2)
        if self.human and j==0:
          action = None
          print("You have",self.players[0]['money'],"credits.")
          while not (action == 3 or action == 4):
            try:
              action = int(input("Type 3 to buy a card for 2 credits or 4 to do nothing: "))
            except:
              print("An error occurred, try again.")
        else:
          if self.homogeneous:
            if np.random.uniform() < self.epsilon and self.epsilon !=0:
              action = np.random.randint(3,4) # Look up this value
            else: #if epsilon is indeed 0, or the random value from a uniform distribution is greater than epsilon
              if s in self.p.keys():
                action = self.p[s] # Look up this value
              else:
                action = 4
          else:
            if np.random.uniform() < self.epsilon and self.epsilon !=0:
              action = np.random.randint(3,4) # Look up this value
            else: #if epsilon is indeed 0, or the random value from a uniform distribution is greater than epsilon
              if s in self.p[j].keys():
                action = self.p[j][s]
              else: action = 4
          if self.human:
            print("Player",j,"took this action:",action)
        if action == 3 and self.players[j]['money'] > 0: # buy card
          self.players[j]['last_action'] = 3
          self.players[j]['money'] -= 2 
          self.pool += 2
          card = self.deck[0]
          self.players[j]['extra_card'].append(card)
          self.deck.pop(0)
          sp = self.get_state(j,3)
          self.record(j,s,decision=2,sp=sp)
          if self.human and j==0:
            action = None
            print("Your hand is:",self.players[0]['hand'])
            print("Your spike card is:",self.players[0]['spike'])
            print("Your extra card is:",self.players[0]['extra_card'])
            while not (action == 5 or action == 6 or action == 7 or action == 8):
              try:
                action = int(input("Type 5 to discard, 6 to swap with the spike card, 7 to swap with the lower hand card, or 8 to swap with the higher hand card: "))
              except:
                print("An error occurred, try again.")
          else:
            if self.homogeneous:
              if np.random.uniform() < self.epsilon and self.epsilon !=0:
                action = np.random.randint(5,8) # Look up this value
              else: #if epsilon is indeed 0, or the random value from a uniform distribution is greater than epsilon
                if sp in self.p.keys():
                  action = self.p[sp] # Look up this value
                else:
                  action = 5
            else:
              if np.random.uniform() < self.epsilon and self.epsilon !=0:
                action = np.random.randint(5,8) # Look up this value
              else: #if epsilon is indeed 0, or the random value from a uniform distribution is greater than epsilon
                if sp in self.p[j].keys():
                  action = self.p[j][sp]
                else:
                  action = 5
            if self.human:
              print("Player",j,"took this action:",action)
          self.players[j]['last_action'] = action
          if action == 5: # discard bought card
            self.discard.append(card)
          elif action == 6: # swap spike
            self.players[j]['spike'].append(card)
            self.discard.append(self.players[j]['spike'].pop(0))
            if self.human and j!=0:
              print("Player",j,"has a new spike card:",self.players[j]['spike'])
          elif action == 7: # swap with lower hand card
            self.discard.append(self.players[j]['hand'].pop(0))
            self.players[j]['hand'].append(card)
          elif action == 8: # swap with higher hand card
            self.discard.append(self.players[j]['hand'].pop(1))
            self.players[j]['hand'].append(card)
          else:
            print('invalid action')
          self.players[j]['extra_card'] = []
          self.record(j,sp,decision=3,sp=self.get_state(j,1))
        else: # do not buy card
          self.players[j]['last_action'] = 4
          self.record(j,s,action=4,decision=2,sp=self.get_state(j,1))
        
  def spike_dice(self): # max
    roll_1 = random.randint(1, 6)
    roll_2 = random.randint(1, 6)

    if self.human:
      print("Spike dice roll results:",roll_1,roll_2)

    if roll_1 == 1 and roll_2 == 1:  # Replace Spike
      for j in range(self.n):
        if j in self.folded_players:
          pass
        else:
          self.discard.append(self.players[j]['spike'].pop(0))
      #internally checks for folded, deals spike to every player that has not folded
      self.deal_spike()
      if self.human:
        print("Your new spike card is:",self.players[0]['spike'])
        for player in range(self.n):
          print("Player",player,"has this spike card:",self.players[player]['spike'])

    
    if roll_1 == roll_2: # Replace Hand 
      for j in range(self.n):
        if j in self.folded_players:
          pass
        else:
          self.discard.append(self.players[j]['hand'].pop(0))  
          self.discard.append(self.players[j]['hand'].pop(0))
      #deal card applies to all players, internaly checks for folding
      self.deal_card()
      if self.human and 0 not in self.folded_players: #you have not folded
        print(f"Your new hand is:",self.players[0]['hand'])
        # for j in range(1,self.n):
        #   print(f"Check player {j}:",self.players[j]['hand'])

    if self.round == 4:
      pass
    else:
      for player in range(self.n):
        if player not in self.folded_players:
          s = self.players[player]['last_state']
          self.record(player,s,action=self.players[player]['last_action'],decision=2,r=self.players[player]['last_reward'],sp=self.get_state(player,2),min_bet=self.final_bet)


  def tie_breaker(self,winner_dict,p_win):
    """
    Deals with tie breaking rules
    """
    #just to get it out of the way, check if players literally have the same hand
    if (len(set([element for element in winner_dict['cards']]))==1):
        #everyone has the same hand
        n_win = np.random.choice([element for element in winner_dict['player']])
        k = winner_dict['player'][n_win] # who is it that won
        self.players[k]['win'] = True
        self.players[k]['money']+= self.pool
        self.record(k)
        #reset the pool
        self.pool = 0
        return k

    else:
        #there are multiple winners --> first tie breaker rule says whoever has the lowest positive sum wins
        #IF THE SUM IS POSITIVE (FOR ALL CARDS), TAKE THE LOWEST ONE TO WIN
        #IF HIGHEST SUM OF POSITIVE CARDS 
        #IF HIGHEST SINGLE VALUE CARD (POSITIVE)
        # GIVE A RANDOM PLAYER THE WIN

        p_points = [winner_dict['points'][i] for i in p_win]

        pos_score = [np.sum(element) for element in p_points if element>=0]                #get the positive/zero score of a player's hand
        positive_winner = [i for i, x in enumerate(pos_score) if x == min(pos_score)]                  #get the player with the lowest positive sum in their hand    fixed .min()

        # If all players have negative sums, keep the same list of players to tiebreak between
        if len(positive_winner) == 0:
          positive_winner = [i for i, x in enumerate(p_win)] 

        if len(positive_winner) == 1:
            #found only one person with the lowest positive sum, they win
            k = winner_dict['player'][p_win[positive_winner[0]]] #who is it
            self.players[k]['win'] = True
            self.players[k]['money']+=self.pool
            self.record(k)
            self.pool = 0
            return(k)
            
        else:
            #found multiple playes with the same lowest positive sum--> second tie breaker rule says whoever has highest sum of positive cards wins
            #get the players that are ties
            p_win = [p_win[i] for i in positive_winner]

            high_positive_sum = []
            for i in p_win:
               positive_cards = [card for card in winner_dict['cards'][i] if card>=0]
               high_positive_sum.append(np.sum(positive_cards))

            high_pos_sum_winner = [i for i, x in enumerate(high_positive_sum) if x == max(high_positive_sum)] # .max()

            # If none of the players have positive cards, keep the same list of players to tiebreak between
            if len(high_pos_sum_winner) == 0:
              high_pos_sum_winner = [i for i, x in enumerate(p_win)] 

            if len(high_pos_sum_winner) == 1:
              #found only one person with the lowest positive sum, they win
              k = winner_dict['player'][p_win[high_pos_sum_winner[0]]] #who is it
              self.players[k]['win'] = True
              self.players[k]['money']+= self.pool
              self.record(k)
              self.pool = 0
              return(k)

            else:
                #found multiple playes with the same  absolute highest single value card--> third tie breaker rule says whoever has the highest positive single value card wins
                p_win = [p_win[i] for i in high_pos_sum_winner]

                high_positive_card = []                                 #must check which one has the absolute highest single value card
                for i in p_win:
                    positive_cards = [card for card in winner_dict['cards'][i] if card>=0]
                    if len(positive_cards) > 0:
                      high_positive_card.append(np.max(positive_cards))
                    else:
                      high_positive_card.append(0)

                #find the player with the highest card in hand
                high_pos_card_winner = [i for i, x in enumerate(high_positive_card) if x == max(high_positive_card)] #.max()

                # Again, dealing with if no one has positive cards
                if len(high_pos_card_winner) == 0:
                  high_pos_card_winner = [i for i, x in enumerate(p_win)] 

                if len(high_pos_card_winner) == 1:
                    #found only one person with the lowest positive sum, they win
                    k = winner_dict['player'][p_win[high_pos_card_winner[0]]] #who is it
                    self.players[k]['win'] = True
                    self.players[k]['money']+= self.pool
                    self.record(k)
                    self.pool = 0
                    return k

                else:
                    #multiple people have the idiot's array, they all won! Randomly pick a winner
                    ind = random.randint(0,len(high_pos_card_winner)-1)
                    #n_win = np.random.choice(winner_dict['player'][p_win[high_pos_card_winner]])
                    n_win = winner_dict['player'][p_win[high_pos_card_winner[ind]]]
                    self.players[n_win]['win'] = True
                    self.players[n_win]['money']+= self.pool
                    self.record(n_win)
                    #reset the pool
                    self.pool = 0
                    return n_win
  
  def evaluate_hands(self, special = False): #marta
    """
    This function evaluates players' hands at the end of a game

    Inputs:
     - special (Boolean): consider special hands or not

    No outputs - updates tracked values
    """

    #---------------------TO-DO: need to add intermediate state for evaluating hands ----------------#

    winner_dict = {'player':[], 'cards':[], 'points': []}                                              #keep track of the player points to evaluate the winners
    for p in range(self.n):
      if p in self.folded_players:
        pass
      else:
        #player -> the player number (a list of ints/floats)
        winner_dict['player'].append(p)
        #cards-> the player's 3 hand combo (a list of lists)
        winner_dict['cards'].append(tuple(self.players[p]['hand']+self.players[p]['spike']))
        #points-> the player's points from the 3 hand combo (a list of floats)
        winner_dict['points'].append(np.sum(self.players[p]['hand']+self.players[p]['spike']))

    if self.human:
      print(winner_dict)

    if special:
      #check the special hands!
      indices = [i for i, x in enumerate(winner_dict['cards']) if x == [2,3,0]]

      if len(indices) == 1:
        #someone has the idiot's array, they won!
        k = winner_dict['player'][indices[0]] # who is it that won
        self.players[k]['win'] = True
        self.players[k]['money']+=self.pool
        self.record(k)
        self.pool = 0
        return k

      elif len(indices) > 1:
        #multiple people have the idiot's array, they all won! Randomly pick a winner
        n_win = np.random.choice(indices)
        k = winner_dict['player'][n_win] # who is it that won
        self.players[k]['win'] = True
        self.players[k]['money']+= self.pool
        self.record(k)
        #reset the pool
        self.pool = 0
        return k

      else:
        #no one has the special hand, however, let's check prime sabacc
        indices = [i for i, x in enumerate(winner_dict['cards']) if x == [10,-10,0]]
        
        if len(indices) == 1:
           #someone has prime sabacc, they won!
           k = winner_dict['player'][indices[0]] # who is it that won
           self.players[k]['win'] = True
           self.players[k]['money']+=self.pool
           self.record(k)
           self.pool = 0
           return k

        elif len(indices) > 1:
            #multiple people have the prime sabacc, they all won! Randomly pick a winner
            n_win = np.random.choice(indices)
            k = winner_dict['player'][n_win] # who is it that won
            self.players[k]['win'] = True
            self.players[k]['money']+= self.pool
            self.record(k)
            #reset the pool
            self.pool = 0
            return k
            
        else:
            #no one has special hands, run as usual
            #consider the absolute value of the points and find the player with the lowest sum
            p_win = [i for i, x in enumerate(winner_dict['points']) if np.abs(x) == min(np.absolute(winner_dict['points']))]   
            
        if len(p_win) == 1:    
            #there is only one winner and they have the lowest number of points, give them money and reset the pool
            k = winner_dict['player'][p_win[0]] # who is it that won
            self.players[k]['win'] = True
            self.players[k]['money']+=self.pool
            self.record(k)
            self.pool = 0
            return k
            
        elif len(p_win) > 1:
            return self.tie_breaker(winner_dict, p_win)
        else:
            print('Could not find a winner',len(self.folded_players))

    else: #we are not running with special hands
        #consider the absolute value of the points and find the player with the lowest sum
        p_win = [i for i, x in enumerate(winner_dict['points']) if np.abs(x) == min(np.absolute(winner_dict['points']))]                  
        if len(p_win) == 1:    
            #there is only one winner and they have the lowest number of points, give them money and reset the pool
            k = winner_dict['player'][p_win[0]] # who is it that won
            self.players[k]['win'] = True
            self.players[k]['money']+=self.pool
            self.record(k)
            self.pool = 0
            return k
            
        elif len(p_win) > 1:
            return self.tie_breaker(winner_dict, p_win)
        else:
            print('Could not find a winner',len(self.folded_players))

  def get_state(self,player,decision,passing=False,money=None,pool=None): # Izzie
    """
    Calculates the state for a given player at a given decision round.
    # Example: Betting round at the beginning, holds a Prime Sabacc
    # State = 10001000
    # Example 2: Buying a card, holding 5,5,5, has 55 credits, 45 in the pool
    # State = 29000605
    # Example 3: First round deciding what to do with fourth card (-5), holding 2,3,3
    # State = 38051000
    """
    # Round = 1, 2, 3
    # Decision = 1, 2, 3 per the project proposal
    # Sum of cards not abs value, abs value is capped at 9, add 10 (two digits)
    SUM = sum(self.players[player]['hand']) 
    if len(self.players[player]['spike']) > 0:
      SUM += self.players[player]['spike'][0]
    #option - skip 1, make it so that 10 is zero
    if SUM > 9:
      SUM = 9
    elif SUM < -9:
      SUM = -9
    SUM += 10
    # Extra = extra card + 10 (two digits)
    if len(self.players[player]['extra_card'])==0:
      extra = 0
    else:
      extra = self.players[player]['extra_card'][0] + 10
    # Money, divide by 10 and round (two digits)
    if not passing:
      money = np.round(self.players[player]['money']/10)
    else:
      money = np.round(money/10)
    if money > 9:
      money = 9
    if money < 0:
      money = 0
    # Pool, divide by 2 and round (two digits)
    if not passing:
      pool = np.round(self.pool/10)
    else:
      pool = np.round(pool/10)
    if pool > 9:
      pool = 9
    # State = Round * 10^9 + Decision * 10^8 + Sum * 10^6 + extra * 10^4 + Money * 10^2 + Pool
    state = self.round*10**7 + decision*10**6 + SUM*10**4 + extra*10**2 + money*10 + pool
    # Keep track of last state for calling record() during evaluate_hands
    #self.players[player]['last_state'] = int(state)
    return int(state)

  def record(self,player,state=None,action=None,r=None,decision=None,sp=None,min_bet=0,connect_time=False): # Izzie
    if self.round == 4 and (not self.connect or self.end):
      self.f.write(str(self.players[player]['last_state']))
      self.states.add(self.players[player]['last_state'])
      self.f.write(',')
      self.f.write(str(self.players[player]['last_action']))
      self.f.write(",")
      self.f.write(str(self.players[player]['last_reward'] + self.get_reward(player)))
      self.f.write(",")
      self.f.write('0')
      self.f.write("\n")
    
    elif connect_time:
      self.f.write(str(self.players[player]['last_state']))
      self.states.add(self.players[player]['last_state'])
      self.f.write(',')
      self.f.write(str(self.players[player]['last_action']))
      self.f.write(",")
      if action == 2:
        self.f.write("0")
      else:
        self.f.write(str(self.players[player]['last_reward'] + self.get_reward(player, connect_time=True)))
      self.f.write(",")
      self.f.write(str(self.get_state(player,1)))
      self.f.write("\n")

    elif not self.round==4 and (not action == 2 or self.end):
      self.f.write(str(state))
      self.states.add(state)
      self.f.write(",")
      if action == None:
        action = self.players[player]['last_action']
      self.f.write(str(action))
      self.f.write(",")
      if r == None:
        r = self.get_reward(player,decision,action,min_bet)
      self.f.write(str(r))
      self.f.write(",")
      self.f.write(str(sp))
      self.f.write("\n")
  
  def get_reward(self,player,decision=None,action=None,min_bet=0,connect_time=False): # Izzie
    if self.win_credits:
      # If the game is over...
      if self.round == 4 and self.players[player]['win']:
        # Reward on winning = 31*gain = (1+highest possible sum)*gain
        return self.pool
      elif connect_time and self.players[player]['win']:
        return self.last_gain
      else:
        # Reward in betting round = - (1+abs(sum of cards in hand)) * bet amount
        if decision == 1 and action == 0:
          return -min_bet
        elif decision == 1 and action == 1:
          return -min_bet
        # Reward for buying a card = - (1+abs(min(sums of possible combinations of 3 of 4 cards)))*3/4) * 2
        elif decision == 2 and action == 3:
          return -2
        # No reward for losing or card rounds
        else:
          return 0
    else:
      # If the game is over...
      if self.round == 4 and self.players[player]['win']:
        # Reward on winning = 31*gain = (1+highest possible sum)*gain
        return 31*self.pool
      elif connect_time and self.players[player]['win']:
        return self.last_gain
      else:
        # Reward in betting round = - (1+abs(sum of cards in hand)) * bet amount
        if decision == 1 and action == 0:
          SUM = sum(self.players[player]['hand']) 
          if len(self.players[player]['spike']) > 0:
            SUM += self.players[player]['spike'][0]
          SUM = abs(SUM)
          return -(1+abs(SUM))*min_bet
        elif decision == 1 and action == 1:
          SUM = sum(self.players[player]['hand']) 
          if len(self.players[player]['spike']) > 0:
            SUM += self.players[player]['spike'][0]
          SUM = abs(SUM)
          return -(1+abs(SUM))*min_bet
        # Reward for buying a card = - (1+abs(min(sums of possible combinations of 3 of 4 cards)))*3/4) * 2
        elif decision == 2 and action == 3:
          sums = []
          for ind in range(4):
            cards = (self.players[player]['hand'] + self.players[player]['spike'] + self.players[player]['extra_card']).copy()
            del(cards[ind])
            sums.append(abs(sum(cards)))
          return -(1+abs(min(sums))*3/4)*2
        # No reward for losing or card rounds
        else:
          return 0

# Policy generator code

This function generates a policy that can be inputted into the simulator. By default, this policy will always match bets and not buy a card. If r=True in the arguments, a completely random policy will be generated. If fold=2, then this random policy will fold 33% of the time. If this behavior is not desired, then setting fold=1 will generate a random policy that matches half the time and raises the other half of the time.

Running this cell block also prints the number of states that are generated for that number of players. More states means Q-learning will take longer.

In [None]:
# Let's create an initial policy!

def generate_policy(player_number,fold=2,r=False):
  """
  The state_lookup dictionary is generated such that the key is the state recorded, and it return the enumeration corresponding to that state
  """
  policy = {}
  state_lookup = {}
  #reserved the first count for the terminal state
  count = 1
  for round in range(0,4):
      for decision in range(3):
          for sum in range(1,20): #19): #keeping track of negative sums now
              for extra in range(21):
                  # With max 8 players, most money you could win is 400 credits, not accounting for buying things
                  for money in range(10):
                      # The amount of money in the pool is limited by what the player has
                      for pool in range(10):
                        if money + pool <= player_number*5:
                          if round == 0:
                            if extra == 0:
                              state = (round)*10**7 + (decision + 1)*10**6 + sum*10**4 + extra*10**2 + money*10 + pool
                              if decision+1 == 1:
                                if r:
                                  policy[state] = random.randint(0,fold)
                                else:
                                  policy[state] = 1
                                state_lookup[state] = count
                                count+=1
                          else:
                            state = (round)*10**7 + (decision + 1)*10**6 + sum*10**4 + extra*10**2 + money*10 + pool
                            if decision+1 == 1:
                                if r:
                                  policy[state] = random.randint(0,fold)
                                else:
                                  policy[state] = 1
                            if decision+1 == 2:
                                if r:
                                  policy[state] = random.randint(3,4)
                                else:
                                  policy[state] = 4
                            if decision+1 == 3:
                                if r:
                                  policy[state] = random.randint(5,8)
                                else:
                                  policy[state] = 5
                            state_lookup[state] = count
                            count+=1
  state_lookup[0] = 0
  # print(count)

  return policy, state_lookup

policy, state_lookup = generate_policy(2)

print(len(list(policy.keys())))

231040


# Q-Learning

The first cell block contains functions necessary for running Q-learning. The second cell runs Q-learning to obtain an optimal policy. In each iteration, data is generated by running 2000 simulations, then the action-value function is approximated. The number of iterations can be set with tot_iter, the number of players with n_player, and the number of hands with n_hands. This will create a text file with the "optimal" policy.

Before running the code, make sure to change the file locations in line 112 in cell 1 and lines 17, 27, 34, and 44 in cell 2.

The third cell outputs a graph that shows how many states were explored over iterations. If all went well, the curve should initially rapidly decrease, then level out.

The fourth cell creates a graph that shows how the number of simulations and the epsilon value (used for an epsilon-greedy exploration strategy) affect the number of states that are visited on average.

In [None]:
def txt2policy(policy_text_file):
  # reading the data from the file
  with open(policy_text_file) as file:
      data = file.read()   
  # reconstructing the data as a dictionary
  policy = json.loads(data)
  return {int(k):int(v) for k,v in policy.items()}

def utility_file(utility, outputfilename):
    """Generate .policy file from policy numpy.array

    Args:
        policy (numpy.array): array of actions to take at each state
        outputfilename (string): name of output policy file
    """
    with open(outputfilename, 'w') as utility_file:
      utility_file.write(json.dumps(utility))

def policy2txt(policy_dict,policy_name):
  with open(policy_name, 'w') as policy_file:
      policy_file.write(json.dumps(policy_dict))

def q_bellman_res(qk, q_k1):
    """Calculate the L_infinity norm of of the difference between the current Q function and
    the previous Q function

    Args:
        qk (numpy.array): previous Q funciton
        q_k1 (numpy.array): current Q function

    Returns:
        float: L_infinity norm
    """
    res = np.max(np.abs(q_k1 - qk))
    return res

def q_learning(inputfilename, state_lookup, policy, q_function = None, iteration_num = 50, res_tol = 1e-9):
    """Q-learning algorithm for provided dataset

    Args:
          inputfilename (string): path to data file
          state_lookup (dict): state mapping dictionary
          res_tol (float, optional): Q-function residual. Defaults to 1e-9.

    Returns:
          numpy.array: policy extracted from action value function
    """
    #states-> enumerated 0 to length-1
    s = len(list(state_lookup.keys()))
    #actions-> enumerated 0 to 8
    a = 9

    if q_function is not None:
      Q_tensor = q_function

    else:
      Q_tensor = np.empty((s,a,))
      Q_tensor[:] = -np.Inf

    alpha = 0.1; gamma = 1;
    data = pd.read_csv(inputfilename, header = None, names = ['s','a','r','sp'], 
                        dtype = {'s' : np.int64, 'a' : np.int64, 'r' : np.float64, 'sp' : np.int64})
    iter_count = 1

    # print('Begin Q-learning iterations')

    #go through transitions in simulation multiple times
    while iter_count < 20:
      # print(f'Iteration {iter_count}')
      for index, row in data.iterrows():
          #map state values to enumeration with lookup dictionary
          s_idx = state_lookup[int(row['s'])]
          sp_idx = state_lookup[int(row['sp'])]
          a_idx = int(row['a'])
          r = row['r']

          #Q-learning -> using the current state and action taken to transition to the next state, we updated the Q-funciton (action value function)
          if np.max(Q_tensor[sp_idx,:]) == -np.Inf:
            Q_max = 0
          else:
            Q_max = np.max(Q_tensor[sp_idx,:])

          if Q_tensor[s_idx,a_idx] == -np.Inf:
            Q_tensor[s_idx,a_idx] = 0

          Q_tensor[s_idx,a_idx] += alpha*(r + gamma*Q_max - Q_tensor[s_idx,a_idx])
      
      # #Marta - I have a bad feeling about this...
      # if iter_count>1:
      #     res = q_bellman_res(Q_prev, Q_tensor)
      #     Q_prev = np.copy(Q_tensor)
      #     if res < res_tol:
      #         break
      # else:
      #     Q_prev = np.copy(Q_tensor)
      # print(iter_count)
      iter_count+=1

    #update given policy from Q-funciton created
    u_function = policy.copy()

    for state in list(state_lookup.keys()):
      s_idx = state_lookup[int(state)]
      #if we were able to fill in the action-value function at that state
      if np.max(Q_tensor[s_idx,:]) != -np.Inf:
        policy[state] = int(np.nanargmax(Q_tensor[s_idx,:]))
        u_function[state] = np.nanmax(Q_tensor[s_idx,:])

      else:
        u_function[state] = -(10**9)

    utility_file(u_function,'/Users/marta/Desktop/spike_data_files/utility_'+str(iteration_num)+'.txt')

    unknown_count = 0
    for state in range(s):
      if list(Q_tensor[state,:]) == [-np.inf for i in range(a)]:
        unknown_count+=1

    return policy, Q_tensor, unknown_count

In [None]:
#set up learning routine
iter = 0
tot_iter = 50
#parameters for running the simulation
n_player = 2
n_hands = 10
epsilon = 0.9
alpha = 0.9556                     # this means that after 50 iterations, the epsilon parameter is 9.722398597662178 %
q_function = None
not_seen_counts = []
win_credits_reward = True

#generate a random policy and save it
policy, state_lookup = generate_policy(n_player,fold=2)
random_policy = policy.copy()
policy2txt(random_policy,'/Users/marta/Desktop/spike_data_files/random_policy.txt')

while iter<tot_iter:
  for sim in range(2000):
    #in evaluating the policy, make it so that different players run on different policies
    #evaluate based on  - most money gained in a game
    simulation = CorellianSpike(policy,n=n_player, epsilon=epsilon, win_credits = win_credits_reward)
    wins = simulation.play_sabacc(k=n_hands)

  print(f'Q-learning, iteration {iter}')
  inputfilename = '/Users/marta/Desktop/spike_data.txt'
  policy, q_function, tot_not_seen_states = q_learning(inputfilename, state_lookup, policy, q_function = q_function, iteration_num=iter)

  not_seen_counts.append(tot_not_seen_states)
  #record intermediate policies for further evaluation
  if (iter+1)%5 ==0:
    mid_policy = policy.copy()
    policy2txt(mid_policy,'/Users/marta/Desktop/spike_data_files/mid_policy_'+str(iter+1)+'.txt')

  #delete file to stop appending
  os.remove(inputfilename)

  #update the epsilon parameter
  epsilon = epsilon*alpha
  iter+=1

optimal_policy = policy.copy()
policy2txt(optimal_policy,'/Users/marta/Desktop/spike_data_files/optimal_policy.txt')

In [None]:
fig = plt.figure(figsize=(7,4))
plt.plot(range(len(not_seen_counts)) ,not_seen_counts)
plt.xlabel('Iterations')
plt.ylabel('Total unexplored states')
plt.grid()
plt.xlim(0)
plt.tight_layout()
fig.savefig('unseen_states.png')

# Policy Evaluation

This runs n_sim simulations for n_player players and n_hands hands and outputs how often each policy wins, how much money is gained/lost on average, and how many times each policy completely runs out of money. Which policies are being compared can be changed by importing different policies and changing which are included in the variable policy.

In [None]:
from pandas.io.parquet import read_parquet
n_player = 8 #number of players
n_sim = 2000 #number of sims
n_hands = 2
win_counts = np.zeros(n_player) # running count of hands won across all games
money = np.zeros(n_player)      # running count of money gained or lost across all games
# policy, state_lookup = generate_policy(n_player, fold = 2)
randpol, state_lookup = generate_policy(2)
#mid = txt2policy("/content/drive/MyDrive/Winter 2022/AA 228/first_spike_run/marta data/mid_policy_25.txt")
multi = txt2policy("/content/drive/MyDrive/Winter 2022/AA 228/first_spike_run/izzie data/data/optimal_policy.txt")
#policy2, state_lookup = generate_policy(n_player,fold = 2)
#mon_pol = txt2policy("/content/drive/MyDrive/Winter 2022/AA 228/first_spike_run/max data/win_credits_true/spike_data_files/optimal_policy.txt")
oth_pol = txt2policy("/content/drive/MyDrive/Winter 2022/AA 228/first_spike_run/marta data/optimal_policy.txt")
#ss_pol = txt2policy("/content/drive/MyDrive/Winter 2022/AA 228/first_spike_run/starship data/optimal_policy.txt")
#policy = [policy, policy2]
policy = [oth_pol,multi,randpol,randpol,randpol,randpol,randpol,randpol]
states = set()

bust = np.zeros(n_player)

total_wins = 0
for sim in range(n_sim):
  #in evaluating the policy, make it so that different players run on different policies
  #evaluate based on  - most money gained in a game
  order = np.arange(n_player)
  np.random.shuffle(order)
  #print(order)
  policy_shuffled = []
  for i in order:
    policy_shuffled.append(policy[i])

  simulation = CorellianSpike(policy_shuffled,n=n_player,connect=True)
  wins = simulation.play_sabacc(k=n_hands)
  #print(wins)
  temp_w = []
  for i in order:
    temp_w.append(wins[i])
  wins = np.array(temp_w)
  #print(wins)
  states = states.union(simulation.states)
  
  for p in range(n_player):
    if simulation.players[order[p]]['money'] == 0:
      bust[p]+=1

  win_counts += wins
  total_wins += np.sum(wins)
  #self.players[player]['money']
  temp_money = np.zeros(n_player)
  for player in range(n_player):
    temp_money[player] += (simulation.players[order[player]]['money']-50)
  money += temp_money
  #print(temp_money)
  #print(" ")

avg_money_gained = money/n_sim
win_rate = win_counts/total_wins
print(f'Average money gained: {avg_money_gained}')
print(f'Win Rate: {win_rate}')
print(f'Bust Count: {bust}')
print(len(states))

Average money gained: [-1.152  0.955 -1.466 -0.736 -0.619  3.14   1.087 -1.209]
Win Rate: [0.121   0.13775 0.1135  0.1195  0.12    0.142   0.1305  0.11575]
Bust Count: [0. 0. 0. 0. 0. 0. 0. 0.]
11003


# Play against the computer!

After generating a policy, change policy to the name of the policy you would like to play against. Change k=1 in order to change the number of hands you play.

In [None]:
human_sim = CorellianSpike(policy,human=True)
human_sim.play_sabacc(k=1)

# Action Distribution Plots!

Run the following cells to generate the same action distribution plots as given in the final report

In [None]:
directory = '/Users/marta/Desktop/spike_data_files_no_win_reward/policies'
all_policies = {key:{} for key in [filename[:-4] for filename in os.listdir(directory)]}
r_policy, state_lookup = generate_policy(2)

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    policy = txt2policy(f)
    # checking if it is a file
    all_policies[filename[:-4]] = policy


plot_order = ['random_policy', 'mid_policy_5', 'mid_policy_10', 'mid_policy_15', 'mid_policy_20', 'mid_policy_25', 'mid_policy_30','mid_policy_35', 'mid_policy_40', 'mid_policy_45', 'optimal_policy']

state_list = list(state_lookup.keys())

r0d1_states = [int(key) for key in state_list if int(key)<10000000 and int(key)>0]

r1d1_states = [int(key) for key in state_list if int(key)>=11000000 and int(key)<12000000]
r1d2_states = [int(key) for key in state_list if int(key)>=12000000 and int(key)<13000000]
r1d3_states = [int(key) for key in state_list if int(key)>=13000000 and int(key)<20000000]

r2d1_states = [int(key) for key in state_list if int(key)>=21000000 and int(key)<22000000]
r2d2_states = [int(key) for key in state_list if int(key)>=22000000 and int(key)<23000000]
r2d3_states = [int(key) for key in state_list if int(key)>=23000000 and int(key)<30000000]

r3d1_states = [int(key) for key in state_list if int(key)>=31000000 and int(key)<32000000]
r3d2_states = [int(key) for key in state_list if int(key)>=32000000 and int(key)<33000000]
r3d3_states = [int(key) for key in state_list if int(key)>=33000000]

r0d1_actions = []; r1d1_actions = []; r1d2_actions = []; r1d3_actions = []
r2d1_actions = []; r2d2_actions = []; r2d3_actions = []
r3d1_actions = []; r3d2_actions = []; r3d3_actions = []

for key in plot_order:
    policy =  all_policies[key]

    r0d1 = [policy[state] for state in r0d1_states]
    r0d1_actions.append([r0d1.count(0)/len(r0d1),r0d1.count(1)/len(r0d1),(len(r0d1)-r0d1.count(0)-r0d1.count(1))/len(r0d1)])

    r1d1 = [policy[state] for state in r1d1_states]
    r1d1_actions.append([r1d1.count(0)/len(r1d1),r1d1.count(1)/len(r1d1),(len(r1d1)-r1d1.count(0)-r1d1.count(1))/len(r1d1)])
    r1d2 = [policy[state] for state in r1d2_states]
    r1d2_actions.append([r1d2.count(3)/len(r1d2),r1d2.count(4)/len(r1d2)])
    r1d3 = [policy[state] for state in r1d3_states]
    r1d3_actions.append([r1d3.count(5)/len(r1d3),r1d3.count(6)/len(r1d3),r1d3.count(7)/len(r1d3), r1d3.count(8)/len(r1d3)])

    r2d1 = [policy[state] for state in r2d1_states]
    r2d1_actions.append([r2d1.count(0)/len(r2d1),r2d1.count(1)/len(r2d1),(len(r2d1)-r2d1.count(0)-r2d1.count(1))/len(r2d1)])
    r2d2 = [policy[state] for state in r2d2_states]
    r2d2_actions.append([r2d2.count(3)/len(r2d2),r2d2.count(4)/len(r2d2)])
    r2d3 = [policy[state] for state in r2d3_states]
    r2d3_actions.append([r2d3.count(5)/len(r2d3),r2d3.count(6)/len(r2d3),r2d3.count(7)/len(r2d3), r2d3.count(8)/len(r2d3)])

    r3d1 = [policy[state] for state in r3d1_states]
    r3d1_actions.append([r3d1.count(0)/len(r3d1),r3d1.count(1)/len(r3d1),(len(r3d1)-r3d1.count(0)-r3d1.count(1))/len(r3d1)])
    r3d2 = [policy[state] for state in r3d2_states]
    r3d2_actions.append([r3d2.count(3)/len(r3d2),r3d2.count(4)/len(r3d2)])
    r3d3 = [policy[state] for state in r3d3_states]
    r3d3_actions.append([r3d3.count(5)/len(r3d3),r3d3.count(6)/len(r3d3),r3d3.count(7)/len(r3d3), r3d3.count(8)/len(r3d3)])


In [None]:
x =  ['Initial', 'Iteration 5', 'Iteration 10', 'Iteration 15', 'Iteration 20', 'Iteration 25', 'Iteration 30','Iteration 35', 'Iteration 40', 'Iteration 45', 'Final']

size_fig = (13.5,2.5)

fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r0d1_actions]
y1 = [a[1] for a in r0d1_actions]
y2 = [a[2] for a in r0d1_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Raise')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Match')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Fold')
plt.xlabel('Policies')
plt.title('Round 0 Decision 1')
plt.ylabel(r'Action distribution')
plt.ylim(0,1.1)
plt.legend()
plt.tight_layout()
fig.savefig('r0d1.png')


fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r1d1_actions]
y1 = [a[1] for a in r1d1_actions]
y2 = [a[2] for a in r1d1_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Raise')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Match')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Fold')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.title('Round 1 Decision 1')
plt.ylim(0,1.1)
plt.legend()
plt.tight_layout()
fig.savefig('r1d1.png')


fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r2d1_actions]
y1 = [a[1] for a in r2d1_actions]
y2 = [a[2] for a in r2d1_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Raise')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Match')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Fold')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.title('Round 2 Decision 1')
plt.ylim(0,1.1)
plt.legend()
plt.tight_layout()
fig.savefig('r2d1.png')

fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r3d1_actions]
y1 = [a[1] for a in r3d1_actions]
y2 = [a[2] for a in r3d1_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Raise')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Match')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Fold')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.title('Round 3 Decision 1')
plt.ylim(0,1.1)
plt.legend()
plt.tight_layout()
fig.savefig('r3d1.png')

In [None]:
fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r1d2_actions]
y1 = [a[1] for a in r1d2_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Buy card')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'No card bought')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.ylim(0,1.1)
plt.title('Round 1 Decision 2')
plt.legend()
plt.tight_layout()
fig.savefig('r1d2.png')


fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r2d2_actions]
y1 = [a[1] for a in r2d2_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Buy card')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'No card bought')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.ylim(0,1.1)
plt.title('Round 2 Decision 2')
plt.legend()
plt.tight_layout()
fig.savefig('r2d2.png')

fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r3d2_actions]
y1 = [a[1] for a in r3d2_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Buy card')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'No card bought')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.title('Round 3 Decision 2')
plt.ylim(0,1.1)
plt.legend()
plt.tight_layout()
fig.savefig('r3d2.png')


In [None]:
fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r1d3_actions]
y1 = [a[1] for a in r1d3_actions]
y2 = [a[2] for a in r1d3_actions]
y3 = [a[3] for a in r1d3_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Discard')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Swap with spike')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Swap with lower hand card')
plt.bar(x, y3, bottom=np.array(y0)+np.array(y1)+np.array(y2), color='tab:purple',label = 'Swap with higher hand card')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.ylim(0,1.1)
plt.legend()
plt.title('Round 1 Decision 3')
plt.tight_layout()
fig.savefig('r1d3.png')


fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r2d3_actions]
y1 = [a[1] for a in r2d3_actions]
y2 = [a[2] for a in r2d3_actions]
y3 = [a[3] for a in r2d3_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Discard')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Swap with spike')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Swap with lower hand card')
plt.bar(x, y3, bottom=np.array(y0)+np.array(y1)+np.array(y2), color='tab:purple',label = 'Swap with higher hand card')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.ylim(0,1.1)
plt.legend()
plt.title('Round 2 Decision 3')
plt.tight_layout()
fig.savefig('r2d3.png')


fig = plt.figure(figsize=size_fig)
y0 = [a[0] for a in r3d3_actions]
y1 = [a[1] for a in r3d3_actions]
y2 = [a[2] for a in r3d3_actions]
y3 = [a[3] for a in r3d3_actions]
# plot bars in stack manner
plt.bar(x, y0, color='tab:blue', label = 'Discard')
plt.bar(x, y1, bottom=y0, color='tab:orange',label = 'Swap with spike')
plt.bar(x, y2, bottom=np.array(y0)+np.array(y1), color='tab:green',label = 'Swap with lower hand card')
plt.bar(x, y3, bottom=np.array(y0)+np.array(y1)+np.array(y2), color='tab:purple',label = 'Swap with higher hand card')
plt.xlabel('Policies')
plt.ylabel(r'Action distribution')
plt.ylim(0,1.1)
plt.title('Round 3 Decision 3')
plt.legend()
plt.tight_layout()
fig.savefig('r3d3.png')