# Monte Carlo Counterfactual regret minimization
Resources:
- Paper [An Introduction to Counterfactual Regret Minimization](http://modelai.gettysburg.edu/2013/cfr/cfr.pdf)


In [1]:
import os
import numpy as np
import random
import matplotlib.pyplot as plt

## Rock, Paper, Scissors
In Rock Paper Scissors and every two-player zero-sum game: when both players use regret-matching to update their strategies,
the pair of average strategies converges to a Nash equilibrium as the number of iterations tends to infinity. 
At each iteration, both players update their regrets as above and then both each player computes their own new strategy based on their own regret tables.
Modify the RPSTrainer program above so that both players use regret matching. 
Compute and print the resulting unique equilibrium strategy.

<!-- 
    The best strategy for rock, paper, scissors against a well paying oppenent is not to find a winning strategy but achive Nash equilibrium on the strategy.
    A nash equilibrium on rock, paper, scissors will be a strategy of 1/3 chance of picking which geuster as a action.
 -->
 

## Regret Algorithm

In [56]:
# RPS = Rock, Paper, Scissors
class RPSTrainer:
    ROCK = 0
    PAPER = 1
    SCISSORS = 2
    NUM_ACTIONS = 3

    def __init__(self, opponentStrategy):
        self.regretSum = np.zeros(self.NUM_ACTIONS, dtype=np.float64)
        self.strategySum = np.zeros(self.NUM_ACTIONS, dtype=np.float64)
        self.strategy = np.zeros(self.NUM_ACTIONS, dtype=np.float64)
        self.opponentStrategy = opponentStrategy
        self.opponentRealStrategy = np.zeros(self.NUM_ACTIONS, dtype=np.float64)

    # Get current mixed strategy through regret-matching
    def getStrategy(self):
        normalizingSum = 0
        for i in range(self.NUM_ACTIONS):
            self.strategy[i] = self.regretSum[i] if self.regretSum[i] > 0 else 0
            normalizingSum += self.strategy[i]

        for i in range(self.NUM_ACTIONS):
            self.strategy[i] = self.strategy[i] / normalizingSum if normalizingSum > 0 else 1.0 / self.NUM_ACTIONS
            self.strategySum[i] += self.strategy[i]
        
        return self.strategy

    # Get random action according to mixed-strategy distribution
    def getAction(self, strategy):
        rr = random.random()
        cumlativeProbability = 0
        action = 0

        while action < self.NUM_ACTIONS-1:
            cumlativeProbability += strategy[action]
            if rr < cumlativeProbability:
                break
            action += 1

        return action


    # Train
    def train(self, iterations):
        actionUtility = np.zeros(self.NUM_ACTIONS)
        for _ in range(iterations):
            # Regret Mixed-strategy actions
            strategy = self.getStrategy()
            myAction = self.getAction(strategy)
            opponentAction = self.getAction(self.opponentStrategy)
            
            # Compute Action utilities 
            actionUtility[opponentAction] = 0
            actionUtility[(opponentAction + 1) % self.NUM_ACTIONS] = 1
            actionUtility[(opponentAction - 1) % self.NUM_ACTIONS] = -1
            
            # Accumulate action regrets
            for i in range(self.NUM_ACTIONS):
                self.regretSum[i] += actionUtility[i] - actionUtility[myAction]

    @classmethod
    def winner(cls, p1, p2):
        if p1 == p2:
            return 0
        elif (p1==cls.ROCK and p2==cls.SCISSORS) or (p1==cls.SCISSORS and p2==cls.PAPER) or (p1==cls.PAPER and p2==cls.ROCK):
            return 1
        else:
            return -1
        
    def play(self, iterations):
        myStrategy = self.strategy
        winCount, drawCount, lossCount = 0, 0, 0

        for _ in range(iterations):
            myAction = self.getAction(myStrategy)
            oppentAction = self.getAction(self.opponentStrategy)
            
            winner = self.winner(myAction, oppentAction)
            if winner == 1:
                winCount += 1
            elif winner == -1:
                 lossCount += 1
            else:
                drawCount += 1

        return (winCount, drawCount, lossCount)

In [59]:
# Opponent probability distribution of choosing hand. This is also know as strategy here.
oppStrategy = np.array([0.1, 0.8, 0.1]) # 10 % Rock, 80% Paper, 10% Scissors
#oppStrategy = np.array([1/3 for _ in range(3)]) # Nash equilibrium of 33 % Rock, 33% Paper, 33% Scissors

# Create it
engine = RPSTrainer(oppStrategy)

# Train, it!
print(10*"=", "Training", "="*10)
iteration = 1000
engine.train(iteration)
print(f"{iteration} iterations trained")

# Test, it!
print(10*"=", "Playing", "="*10)
print('Playing using my trained strategy:')
w, d, l = engine.play(1000)

print('win : ',w)
print('draw: ',d)
print('loss: ',l)

1000 iterations trained
Playing using my trained strategy:
win :  795
draw:  113
loss:  92


In [39]:
#oppStrategy = np.array([1/3 for _ in range(3)]) # Nash equilibrium of Rock, Paper, Scissors


oppStrategy = np.array([0.0, 0.1, 0.0]) # Always Paper
engine = RPSTrainer(oppStrategy)

print('Practice Training:')
iterations = 10
actionUtility = np.zeros(engine.NUM_ACTIONS)

for i in range(iterations):
    print(f"Iteration {i+1}", end="\t")
    # Regret Mixed-strategy actions
    strategy = engine.getStrategy()
    myAction = engine.getAction(strategy)
    opponentAction = engine.getAction(engine.opponentStrategy)

    # Compute Action utilities 
    actionUtility[opponentAction] = 0
    actionUtility[(opponentAction + 1) % engine.NUM_ACTIONS] = 1
    actionUtility[(opponentAction - 1) % engine.NUM_ACTIONS] = -1
    
    # Accumulate action regrets
    for i in range(engine.NUM_ACTIONS):
        engine.regretSum[i] += actionUtility[i] - actionUtility[myAction]
    
    print("Bot strategy:", strategy, "Regret Sum:", engine.regretSum, "Action Utility:", actionUtility)

hand = {0: 'Rock', 1: 'Paper', 2: 'Scissors'}
print("BOT expect Rock. Answer ==", hand[engine.getAction(strategy)])

Practice Training:
Iteration 1	Bot strategy: [0.33333333 0.33333333 0.33333333] Regret Sum: [ 1. -1.  0.] Action Utility: [ 1. -1.  0.]
Iteration 2	Bot strategy: [1. 0. 0.] Regret Sum: [ 1. -3. -1.] Action Utility: [ 1. -1.  0.]
Iteration 3	Bot strategy: [1. 0. 0.] Regret Sum: [ 1. -2.  1.] Action Utility: [-1.  0.  1.]
Iteration 4	Bot strategy: [0.5 0.  0.5] Regret Sum: [ 1. -4.  0.] Action Utility: [ 1. -1.  0.]
Iteration 5	Bot strategy: [1. 0. 0.] Regret Sum: [ 1. -6. -1.] Action Utility: [ 1. -1.  0.]
Iteration 6	Bot strategy: [1. 0. 0.] Regret Sum: [ 1. -8. -2.] Action Utility: [ 1. -1.  0.]
Iteration 7	Bot strategy: [1. 0. 0.] Regret Sum: [  1. -10.  -3.] Action Utility: [ 1. -1.  0.]
Iteration 8	Bot strategy: [1. 0. 0.] Regret Sum: [  1. -12.  -4.] Action Utility: [ 1. -1.  0.]
Iteration 9	Bot strategy: [1. 0. 0.] Regret Sum: [  1. -14.  -5.] Action Utility: [ 1. -1.  0.]
Iteration 10	Bot strategy: [1. 0. 0.] Regret Sum: [  1. -16.  -6.] Action Utility: [ 1. -1.  0.]
BOT expect 