In [1]:
import numpy as np

In [72]:
def rps_util(i, j):
    """Return player 0's payoff in a three-action cyclic game.

    Actions are the integers 0, 1 and 2. Action 0 beats 1, 1 beats 2 and
    2 beats 0.

    Args:
        i: Action chosen by player 0.
        j: Action chosen by player 1.

    Returns:
        0 when both actions are equal, 1 when ``(i, j)`` is one of the
        winning pairs for player 0, and -1 for every other pair.
    """
    # Ordered (player 0 action, player 1 action) pairs that player 0 wins.
    winning_pairs = {(0, 1), (1, 2), (2, 0)}

    if i == j:
        return 0
    return 1 if (i, j) in winning_pairs else -1

In [70]:
def _regret_matching_strategy(cumulative_regret):
    """Convert one player's cumulative regrets into a mixed strategy."""
    positive_regret = np.maximum(cumulative_regret, 0.0)
    total = positive_regret.sum()
    if total > 0:
        # Play each action in proportion to its positive regret.
        return positive_regret / total
    # No action has positive regret: fall back to the uniform distribution.
    return np.full(cumulative_regret.shape, 1.0 / cumulative_regret.size)


def get_nash(util, num_decisions, num_iterations, rng=None):
    """Approximate a Nash equilibrium of a two-player zero-sum game.

    Runs regret matching in self-play with sampled actions. On every
    iteration each player derives a strategy from its cumulative regrets,
    one action per player is sampled, and each player's regret for every
    alternative action (against the opponent's sampled action) is
    accumulated. The returned value is the average of the strategies used
    over all iterations.

    Args:
        util: Callable ``util(i, j)`` giving player 0's payoff when player 0
            plays action ``i`` and player 1 plays action ``j``. Player 1's
            payoff is taken to be the negation.
        num_decisions: Number of actions available to each player; actions
            are the integers ``0 .. num_decisions - 1``.
        num_iterations: Number of self-play iterations to run.
        rng: Optional random source exposing a NumPy-style
            ``choice(n, p=...)`` method, for example
            ``np.random.default_rng(seed)``. When omitted, NumPy's global
            random state is used, so ``np.random.seed`` still controls the
            run.

    Returns:
        A list ``[strategy_0, strategy_1]`` of two arrays of length
        ``num_decisions``, each the average strategy of one player.

    Raises:
        ValueError: If ``num_decisions`` or ``num_iterations`` is below 1.
    """
    if num_decisions < 1:
        raise ValueError("num_decisions must be at least 1")
    if num_iterations < 1:
        # Averaging over zero iterations would silently produce NaN.
        raise ValueError("num_iterations must be at least 1")

    sampler = np.random if rng is None else rng

    total_strategy_profile = [np.zeros(num_decisions) for _ in range(2)]
    cumulative_regret = [np.zeros(num_decisions) for _ in range(2)]

    for _ in range(num_iterations):
        # Current strategy of each player from its cumulative regrets.
        player_distribution = [
            _regret_matching_strategy(regret) for regret in cumulative_regret
        ]

        # Sample one action per player from those strategies.
        action_0, action_1 = [
            int(sampler.choice(num_decisions, p=p)) for p in player_distribution
        ]

        # Payoff actually obtained by player 0; computed once per iteration.
        realized = util(action_0, action_1)

        # Player 0's payoff for each alternative against player 1's action.
        alternatives_0 = np.array(
            [util(i, action_1) for i in range(num_decisions)], dtype=float
        )
        # Player 0's payoff for each alternative player 1 could have chosen.
        alternatives_1 = np.array(
            [util(action_0, j) for j in range(num_decisions)], dtype=float
        )

        # Regret = payoff of the alternative minus the realized payoff.
        # Player 1's payoffs are negated, which flips the subtraction.
        cumulative_regret[0] += alternatives_0 - realized
        cumulative_regret[1] += realized - alternatives_1

        # Accumulate the strategies so they can be averaged at the end.
        for player in range(2):
            total_strategy_profile[player] += player_distribution[player]

    return [total / num_iterations for total in total_strategy_profile]

In [75]:
# Estimate both players' average strategies for the 3-action game defined by
# rps_util. get_nash samples actions randomly and no seed is set in the
# visible cells, so the exact numbers are expected to vary between runs.
get_nash(rps_util, num_decisions=3, num_iterations=10000)

[array([0.33830721, 0.32382131, 0.33787149]),
 array([0.329541  , 0.33440232, 0.33605668])]