In [6]:
""" Playgame routine.  Plays one game between StratA and StratB.  Outputs return to A
    Added output of cards and plays to allow ML"""

def playgame(GameDeck, StratA, StratB, verbose = False):

    # Deal 
    cardA = GameDeck.deal()
    cardB = GameDeck.deal()
    if verbose: print("Card A: ", cardA, " Card B: ", cardB)
    playA = ""
    playB = ""
    
    # Player A decides
    playA = StratA.play(cardA,"A")
    if verbose: print("Player A: ", playA)

    # if Player A pass, showdown for $2    
    if playA == "Pass":
        if cardA > cardB:
            payout = 1
        elif cardB > cardA:
            payout = -1
        else:
            payout = 0
    # if Player A raises, player B decides
    else:
        playB = StratB.play(cardB,"B")
        if verbose: print("Player B: ", playB)
        
        #if player B calls, showdown for $4
        if playB == "Call":
            if cardA > cardB:
                payout = 2
            elif cardB > cardA:
                payout = -2
            else:
                assert (cardA == cardB)
                payout = 0
        # if player B folds, A gets the ante
        else:
            payout = 1
    if verbose: 
        print("Payout: ",payout)
        print("")
    return {'winA':payout,'playA':playA,'playB':playB,'cardA':cardA,'cardB':cardB}
        

In [7]:
""" Deck class  Defines the deck.  For now only a discrete set 0 - n-1"""

class Deck:
    def __init__(self, decksize):
        self.decksize = decksize
        self.cards = range(self.decksize)      
        
    def deal(self):
        import random              # is it ok to have this here?
        card_delt = random.randint(0,self.decksize -1 )
        return card_delt
    

In [8]:
"""  Strategy Class.  Sets standards for all strategies 
    Create a subclass for each strategy"""

class Strategy:
    def __init__(self):
        pass
        # self.gamedeck = GameDeck
        # self.decksize = GameDeck.decksize
        
    def play(self,mycard,player):
        """ determine strategy for player, having been dealt card mycard.  
        If player = "A" return either 'Pass' or 'Raise' 
        If player = 'B' return either 'Fold' or 'Call' """ 
        pass
    



In [9]:
""" Vector based strategy.  

    Paramaterized by a vector giving probability of aggressive (Raise/Call) strategy for each card.
    
    Old strategies
    
    Random - [1/2,1/2,1/2,1/2,1/2,1/2]
    Simple - e.g [0,0,0,1,1,1] 
    bluff - e.g [p,p,p,1,1,1]
    optimal - A [2/3,0,0,0,1,1]  B [0, 1/3,1/3,1,1,1]"""

class vectorstrat(Strategy):
    
    def __init__(self,aggprobs):
        self.aggprobs = aggprobs
        
    def play(self,mycard,player):
        import random
        if random.random() > self.aggprobs[mycard]:
            if player == "A":
                return 'Pass'
            else:
                return 'Fold'
        else:
            if player == "A":
                return "Raise"
            else:
                return 'Call'

        

In [10]:
"""Challenge Routine.
Plays n games between two strategies and returns the net result.
Used for testing
"""

def challenge(num_games,strata,stratb,strataname = "",stratbname = "",verbose = False):

    decksize = 6
    d = Deck(decksize)

    a_net_wins = 0

    if verbose: print("Player A: ", strataname,"   Player B: ", stratbname )

    for i in range (num_games):
        a_net_wins += playgame(d,strata,stratb)['winA']
        if verbose and i % 1000 == 0: print(i," games played")

    if verbose: print(strataname, " won $", a_net_wins, "  $", a_net_wins / num_games, " per game.")
    
    return a_net_wins

In [29]:
""" A/B Learning  both learn together

    Will use logistic regression to set each of the vector parameters individually.
    
    Inline training -- play one game, instantly update.
    
   
    
    """
import math
def logit(x):
    p = 1 / ( 1 + math.exp(-x))
    return p

def logodds(p):
    if p == 0: return -10
    elif p == 1: return 10
    else:
        x = math.log(p / (1-p))
        return x


def train_AB_logit(num_games = 10 ** 6,num_updates = 20 , alpha = 0.005, decksize = 6,
                  start_A =[], start_B =[],verbose = False,learnvsaverage = False,pass_win_reward=-1):

    import random
    import copy
    
    # set paramaters and counters

    d = Deck(decksize)
    
    if start_A == [] :
        paramvectorA = [0 for i in range(decksize)]  # starts random parameter = 0 -> prob 50%
    else:
        paramvectorA = [logodds(p) for p in start_A]
        
    

    if start_B == []:   # default for B is also 50%
        paramvectorB = [0 for i in range(decksize)]
    else:
        paramvectorB = [logodds(p) for p in start_B]
        
    
    winningsA = 0
    winningsA_temp = 0
    training_updates =[]
    games_update = num_games / num_updates
    
    
    for i in range(num_games+1):
        
        # set strategy to current logit of parameters for both A and B
        strategyvectorA = [logit(x) for x in paramvectorA]
        sa = vectorstrat(strategyvectorA)
        
        strategyvectorB = [logit(x) for x in paramvectorB]
        sb = vectorstrat(strategyvectorB)

        #play a game
        result = playgame(d,sa,sb,verbose = False)
        
        # direction of update is positive for a plus action.  Maybe change action from string to +/- 1
        if result['playA'] == "Raise":
            directionA = 1
        else:
            if result['winA'] > 0 :
                directionA =  - pass_win_reward
            else:
                directionA = -1
                
        if result['playB'] == "Call":
            directionB = 1
        else:
            directionB = -1
        
        # update strategy.  Alpha is change rate parameter.  
        # Result * direction positive when aggression paid off or passive resulted in loss -> increase prob.
        # idea -- change result to result - average
        avgwins = 0
        if learnvsaverage:
            avgwins = winnings / i
            
        paramvectorA[result['cardA']] += alpha * (result['winA']-avgwins) *  directionA
        
        if result['playA'] == "Raise":   # B only has a decision when A raises
            paramvectorB[result['cardB']] += alpha * (-1) * (result['winA'] - avgwins) * directionB
            #if result['cardB'] == 0:
            #    print('cardA: ', result['cardA']," cardB: ",result['cardB'], " playB: ",
            #          result['playB']," A Wins: ",result['winA'],
            #          ' learning:',alpha * (-1) * (result['winA'] - avgwins) * directionB)
        
      
        # avoid over/underflow errors  Likely a better method, but this works within 1%
        paramvectorA[result['cardA']] = max(-10,paramvectorA[result['cardA']])
        paramvectorA[result['cardA']] = min(10,paramvectorA[result['cardA']])
        paramvectorB[result['cardB']] = max(-10,paramvectorB[result['cardB']])
        paramvectorB[result['cardB']] = min(10,paramvectorB[result['cardB']])
        
        
        # track performance
        winningsA += result['winA']
        winningsA_temp += result['winA']
 
        # periodically update
        if i % games_update == 0 :
            if verbose:
                if i > 0: 
                    tempwinrate = round(winningsA_temp / games_update,4) 
                else: 
                    tempwinrate = "       "
                print(i, tempwinrate,'A:',[round(logit(x)*100,2) for x in paramvectorA],
                      " B: ",[round(logit(x)*100,2) for x in paramvectorB])    
            training_updates += [{'games':i,'tempAwins':winningsA_temp,
                                  'logoddsA':copy.deepcopy(paramvectorA),'logoddsB':copy.deepcopy(paramvectorB)}]
            winningsA_temp = 0
    
    if verbose:
        print("Done")
        winrateA = winningsA / num_games
        print("A's Winrate = ", winrateA)
        print([round(x,4) for x in paramvectorA])
        strategyvector = [round(logit(x)*100,2)  for x in paramvectorA]
        print(strategyvectorA)
        print([round(x,4) for x in paramvectorB])
        strategyvector = [round(logit(x)*100,2)  for x in paramvectorB]
        print(strategyvectorA)
        
    return {'num_games':num_games, 'winningsA':winningsA,'final model A':paramvectorA,
            'final model B':paramvectorB,'training_updates':training_updates}
        
    



In [24]:
t = train_AB_logit(num_games = 10**6,num_updates = 20 , alpha = 0.005, decksize = 6,
                  start_A =[], start_B =[],verbose = True,learnvsaverage = False)

0         A: [50.0, 50.0, 50.0, 50.0, 50.25, 50.0]  B:  [49.75, 50.0, 50.0, 50.0, 50.0, 50.0]
50000 0.0636 A: [55.48, 50.12, 67.92, 99.48, 100.0, 100.0]  B:  [36.94, 40.37, 54.98, 88.65, 100.0, 100.0]
100000 0.0572 A: [52.0, 52.62, 67.7, 99.98, 100.0, 100.0]  B:  [37.75, 45.88, 53.99, 88.13, 100.0, 100.0]
150000 0.0541 A: [49.0, 59.27, 68.89, 99.99, 100.0, 100.0]  B:  [36.47, 47.13, 55.48, 85.32, 100.0, 100.0]
200000 0.0435 A: [49.0, 53.49, 67.15, 100.0, 100.0, 100.0]  B:  [37.17, 40.61, 53.37, 85.57, 100.0, 100.0]
250000 0.0652 A: [45.26, 60.47, 69.95, 99.99, 100.0, 100.0]  B:  [36.47, 42.43, 52.5, 84.55, 100.0, 100.0]
300000 0.0434 A: [41.1, 55.11, 64.91, 99.99, 100.0, 100.0]  B:  [36.59, 39.53, 52.25, 85.38, 100.0, 100.0]
350000 0.049 A: [51.0, 55.72, 71.5, 100.0, 100.0, 100.0]  B:  [32.52, 42.19, 53.62, 83.69, 100.0, 100.0]
400000 0.0625 A: [47.25, 53.0, 65.7, 100.0, 100.0, 100.0]  B:  [30.26, 43.66, 54.61, 87.32, 100.0, 100.0]
450000 0.0416 A: [46.26, 52.75, 63.3, 100.0, 100.0, 10

First try 
A bit closer to the analytic solution.
Both learn to raise/call with high card very quickly.
However, something is odd with B on low card. This should quickly converge to zero, as there should be no positive inforcement.  But the param is sometimes going back up.  Adding some diagnostic printing to investigate.

In [27]:
t = train_AB_logit(num_games = 100,num_updates = 20 , alpha = 0.005, decksize = 6,
                  start_A =[], start_B =[],verbose = True,learnvsaverage = False)

0         A: [50.0, 50.0, 50.12, 50.0, 50.0, 50.0]  B:  [50.0, 50.0, 50.0, 50.0, 50.0, 50.0]
5 1.0 A: [50.0, 50.12, 50.12, 50.0, 50.0, 50.5]  B:  [50.0, 49.75, 50.0, 50.12, 50.12, 50.0]
10 -0.2 A: [50.12, 50.12, 50.0, 50.0, 50.0, 50.62]  B:  [50.0, 49.75, 50.0, 50.12, 50.5, 50.0]
cardA:  2  cardB:  0  playB:  Call  A Wins:  2  learning: -0.01
15 0.6 A: [50.12, 50.12, 50.0, 50.0, 50.0, 51.0]  B:  [49.75, 49.75, 50.0, 50.12, 50.25, 50.25]
cardA:  0  cardB:  0  playB:  Fold  A Wins:  1  learning: 0.005
cardA:  4  cardB:  0  playB:  Call  A Wins:  2  learning: -0.01
20 0.0 A: [50.25, 50.12, 50.0, 49.75, 50.37, 51.0]  B:  [49.63, 49.75, 50.0, 50.12, 50.25, 50.5]
cardA:  1  cardB:  0  playB:  Call  A Wins:  2  learning: -0.01
25 0.8 A: [50.25, 50.62, 50.0, 49.88, 50.37, 51.12]  B:  [49.38, 49.88, 50.0, 50.12, 50.37, 50.5]
30 0.6 A: [50.25, 50.62, 50.12, 50.12, 50.62, 51.37]  B:  [49.38, 49.63, 49.75, 50.12, 50.37, 50.5]
35 1.0 A: [50.25, 50.62, 50.25, 50.25, 50.75, 51.62]  B:  [49.38, 49.63,

Issue now clear.  Similar issue to before.

When B folds, he loses, so gets discouraged from folding, even though this was the right move.
This is a limitation of the linear approach.
It could be that with enough games the greater feedback of the $2 loss on a raise will eventually get the right solution.  Try another run with higher alpha and more games to test.


In [30]:
t = train_AB_logit(num_games = 10**7,num_updates = 20 , alpha = 0.025, decksize = 6,
                  start_A =[], start_B =[],verbose = True,learnvsaverage = False)

0         A: [50.0, 50.0, 51.25, 50.0, 50.0, 50.0]  B:  [50.0, 48.75, 50.0, 50.0, 50.0, 50.0]
500000 0.0478 A: [45.64, 46.26, 70.06, 99.99, 100.0, 100.0]  B:  [37.75, 43.78, 46.88, 90.25, 100.0, 100.0]
1000000 0.0515 A: [50.0, 51.87, 59.27, 99.99, 100.0, 100.0]  B:  [41.34, 38.94, 54.98, 81.76, 100.0, 100.0]
1500000 0.0494 A: [36.59, 60.47, 55.6, 99.99, 100.0, 100.0]  B:  [31.54, 47.5, 48.75, 93.55, 100.0, 100.0]
2000000 0.0485 A: [45.02, 51.25, 71.61, 99.99, 100.0, 100.0]  B:  [31.0, 43.78, 56.83, 81.0, 100.0, 100.0]
2500000 0.0431 A: [53.12, 51.87, 65.7, 99.99, 100.0, 100.0]  B:  [38.94, 40.13, 56.22, 86.12, 99.99, 100.0]
3000000 0.0506 A: [44.4, 59.87, 72.11, 99.99, 100.0, 100.0]  B:  [38.34, 40.13, 52.5, 88.08, 100.0, 100.0]
3500000 0.0522 A: [50.0, 60.47, 64.57, 100.0, 100.0, 100.0]  B:  [40.73, 46.26, 54.98, 89.57, 100.0, 100.0]
4000000 0.0498 A: [53.12, 57.44, 83.2, 99.99, 100.0, 100.0]  B:  [37.75, 57.44, 51.87, 68.46, 100.0, 100.0]
4500000 0.049 A: [44.4, 57.44, 67.92, 99.99, 

B's strategy on low card still not converging towards zero. try again with very long slow learn.

In [31]:
t = train_AB_logit(num_games = 10**9,num_updates = 200 , alpha = 0.001, decksize = 6,
                  start_A =[], start_B =[],verbose = True,learnvsaverage = False)

0         A: [50.02, 50.0, 50.0, 50.0, 50.0, 50.0]  B:  [50.0, 50.02, 50.0, 50.0, 50.0, 50.0]
5000000 0.0513 A: [45.19, 52.32, 69.06, 100.0, 100.0, 100.0]  B:  [35.94, 41.53, 53.62, 87.89, 100.0, 100.0]
10000000 0.0499 A: [45.14, 51.77, 68.76, 100.0, 100.0, 100.0]  B:  [35.43, 41.31, 53.44, 88.48, 100.0, 100.0]
15000000 0.0489 A: [46.16, 55.06, 67.55, 100.0, 100.0, 100.0]  B:  [35.64, 41.78, 53.02, 87.38, 100.0, 100.0]
20000000 0.0499 A: [44.82, 51.35, 67.24, 100.0, 100.0, 100.0]  B:  [34.23, 42.6, 53.89, 89.24, 100.0, 100.0]
25000000 0.0487 A: [45.12, 53.42, 66.73, 100.0, 100.0, 100.0]  B:  [36.47, 40.71, 55.63, 88.37, 100.0, 100.0]
30000000 0.0499 A: [45.49, 55.9, 65.7, 100.0, 100.0, 100.0]  B:  [35.16, 42.75, 52.97, 87.79, 100.0, 100.0]
35000000 0.0495 A: [45.44, 52.95, 67.22, 100.0, 100.0, 100.0]  B:  [34.62, 42.36, 53.99, 89.58, 100.0, 100.0]
40000000 0.0488 A: [45.91, 55.87, 68.68, 100.0, 100.0, 100.0]  B:  [35.85, 44.28, 55.8, 87.61, 100.0, 100.0]
45000000 0.0499 A: [45.88, 51.0

KeyboardInterrupt: 

Still not converging.  Next round will need a better learning algo.