## Neural network - v 0.01

We have 3 rounds.
- 1. Bet or check
- 2. Bet or check
- 3. Bet or fold

We generate the game first and then feed the data to network,
so we know the game rewards.\\

Learning parameters:
- epsilon for exploration vs exploitation
- gama for future rewards
- learning rate for learning intensity

We implement Buffer and learning with batches

In [44]:
# import libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random 
import ultimate
from collections import deque

In [None]:
# define card set
suits = ['Hearts', 'Diamonds', 'Clubs', 'Spades']
ranks = ['2', '3', '4', '5', '6', '7', '8', '9', '10', 'J', 'Q', 'K', 'A']
rank_values = {rank: i for i, rank in enumerate(ranks, start=2)}

deck = [{'rank': rank, 'suit': suit} for suit in suits for rank in ranks]

combinations = ["High Card", "One Pair", "Two Pair", "Three of a Kind", "Four of a Kind", 
                "Full House", "Straight", "Flush", "Straight Flush", "Royal Flush"]
combinations_values = {combination: i for i, combination in enumerate(combinations, start=1)}
# set ordered winning combinations
winning_hands = ["High Card", "One Pair", "Two Pair", "Three of a Kind", "Straight", "Flush", 
                "Full House", "Four of a Kind", "Straight Flush", "Royal Flush"]
#enumerate the deck
enumerated_deck = dict(enumerate(deck, start=1))
num_deck = np.arange(1, 53)

# define split card dict, num_of_cards: (num of color, num of rank) ex. 13: (1, 13)
split_card_dict={}
for i in num_deck:
    split_card_dict[i] = ((i - 1) // 13 + 1, (i-1) % 13 + 1)

split_card_dict

In [46]:
# function for generating a game (all cards) - output a list of all cards in game cards
# cards are interpreted as: 2 for each player, 5 for the table, last 2 for the dealer
def generate_game(num_of_players = 1):
    game_size = 7 + 2*num_of_players

    whole_game = np.random.choice(np.arange(1, 53), size=game_size, replace=False)
    
    return whole_game

In [31]:
# make model
class DQN(nn.Module):
    def __init__(self, input_dim, hidden_1, hidden_2, output_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, output_dim)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [53]:
# next state and next cards - returns next state input, game is 7-14 with split info for cards
# each cards ha info about suit (1-4) and rank (1-13)
#hardcoded for one player
def state_to_tensor(round, whole_game):
    split_card_info = [element for i in whole_game for element in split_card_dict[i]]

    cards_len = len(split_card_info) - 4
    if round == 0:
        cards = np.array(split_card_info[:4])
    elif round == 1:
        cards = np.array(split_card_info[:10])
    elif round == 2:
        cards = np.array(split_card_info[:14])
    else:
        cards = np.array([]) # irrelevant just for filler

    cards = np.pad(cards, (0, cards_len - len(cards)), mode='constant')
    state = np.array([round])
    full_state = np.concatenate([state, cards])
    return torch.from_numpy(full_state).unsqueeze(0).float()

In [None]:
# reward function - target is where the reward was biggest
def reward_function(round, whole_game, action):
    # normalize reward values (royal flush and straight flusch 
    # will be more than 1 but its tolerable)
    
    def normalize_reward(r, min_r=-6, max_r=11):
        return (r - min_r) / (max_r - min_r)
    # blind and ante are set to be 1
    blind = 1
    ante = 1

    # if we fold in third round - we lose 
    if round == 2 and action == 1:
        return normalize_reward(- blind - ante) 
    # if we say check in first or second round - the reward is neutral 
    elif round != 2 and action == 1:
        return normalize_reward(0)
    
    # in all other cases we need to know who won and if the ante or blind are valid
    player_hand = [enumerated_deck[card] for card in whole_game[:7]]
    dealer_hand = [enumerated_deck[card] for card in whole_game[2:]]
    #print(player_hand)
    #print(dealer_hand)
    player_combination = ultimate.get_best_hand(player_hand)
    dealer_combination = ultimate.get_best_hand(dealer_hand)
    #print(player_combination, dealer_combination)
    # find the victor (Player, Dealer, Tie)
    victor = ""
    if winning_hands.index(player_combination) > winning_hands.index(dealer_combination):
        victor = "Player"
    elif winning_hands.index(player_combination) == winning_hands.index(dealer_combination):
        result = ultimate.decider(player_combination, player_hand, 
                                  dealer_combination, dealer_hand)
        if result == "player":
            victor = "Player"
        elif result == "dealer":	
            victor = "Dealer"	
        else:
            return normalize_reward(0) # the reward for game is 0 if tie
    else:
        victor = "Dealer"

    # we bet in third round
    if round == 2:
        if victor == "Player":
            # calculate ante and blind
            ante_valid = ultimate.has_ante(dealer_hand, dealer_combination) # boolean
            blind_pay = ultimate.net_blind_payout(blind, player_combination) # value if won
            return normalize_reward(ante + blind_pay + ante if ante_valid else 0)
        else:
            return normalize_reward(- blind - 2*ante)
    # we bet in second round
    elif round == 1:
        if victor == "Player":
            # calculate ante and blind
            ante_valid = ultimate.has_ante(dealer_hand, dealer_combination) # boolean
            blind_pay = ultimate.net_blind_payout(blind, player_combination) # value if won
            return normalize_reward(2 * ante + blind_pay + ante if ante_valid else 0)
        else:
            return normalize_reward(- blind - 3*ante)
    # we bet in first round
    elif round == 0:
        if victor == "Player":
            # calculate ante and blind
            ante_valid = ultimate.has_ante(dealer_hand, dealer_combination) # boolean
            blind_pay = ultimate.net_blind_payout(blind, player_combination) # value if won
            return normalize_reward(4 * ante + blind_pay + ante if ante_valid else 0)
        else:
            return normalize_reward(- blind - 5*ante)

In [48]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # Stores experiences up to `capacity`

    # store games in buffer
    def add(self, action, reward, round_input_tensor, next_round_input_tensor, end):
        self.buffer.append((action, reward, round_input_tensor, next_round_input_tensor, end))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  # Random mini-batch
        actions, rewards, round_input_tensors, next_round_input_tensors, end = zip(*batch)
        
        # Convert to NumPy arrays for easier tensor conversion
        return (np.array(actions), np.array(rewards), np.array(round_input_tensors),
                np.array(next_round_input_tensors), np.array(end))

    def size(self):
        return len(self.buffer)  # Current number of experiences stored

In [None]:
# function for training the model
def train_model(model, target_model, optimizer, loss_fn, num_of_games, buffer,
                EPSILON = .2, GAMMA = .9, starting_round = 0,
                batch_size = 32, target_update = 500, train_freq = 4):
    
    def update_model_weights():
        if buffer.size() < batch_size:  
            return
        
        # Sample a mini-batch
        actions, rewards, round_input_tensors, next_round_input_tensors, end = buffer.sample(batch_size)
 
        # Convert to PyTorch tensors
        actions = torch.tensor(actions, dtype=torch.int64)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        round_input_tensors = torch.tensor(round_input_tensors, dtype=torch.float32)
        next_round_input_tensors = torch.tensor(next_round_input_tensors, dtype=torch.float32)
        end = torch.tensor(end, dtype=torch.float32)
        print("Model output shape:", model(round_input_tensors).shape)
        print("Actions shape before unsqueeze:", actions.shape)
        print("Actions shape after unsqueeze:", actions.unsqueeze(1).shape)
        # Compute Q-values for current states (only the taken actions)
        q_values = model(round_input_tensors).squeeze(1).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute target Q-values using Bellman equation
        target_q_values = rewards
        with torch.no_grad():

            next_q_values = target_model(next_round_input_tensors).max(1)[0]  # Max Q-value for next state

            print("Before squeeze:", target_model(next_round_input_tensors).shape)  
            print("After squeeze:", target_model(next_round_input_tensors).squeeze(1).shape) 
            print(target_q_values.shape, next_q_values.shape, end.shape)
            print(next_q_values)
            target_q_values += (0.99 * next_q_values * (1 - end))  # Q-learning update, if round ended no future is included

        # Compute loss (Mean Squared Error loss)
        loss = loss_fn(q_values, target_q_values)

        # Backpropagation & gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Set model to train mode
    model.train()

    # Copy weights from dqn to target_model
    target_model.load_state_dict(model.state_dict())
    target_model.eval()  # No gradient updates for target

    for episode in range(num_of_games):

        if episode % target_update == 0:
            target_model.load_state_dict(model.state_dict())  # Sync weights for target model
        
        if episode % train_freq == 0:  # Train every 4 steps
            update_model_weights()

        round = starting_round  # Start at Round 1
        done = False
        
        # generate a training game
        whole_game = generate_game()

        while not done:
            # get input data for this round
            round_input_tensor = state_to_tensor(round, whole_game)
            end = 0

            # Epsilon-greedy action selection, we will explore with probability EPSILON
            if np.random.rand() < EPSILON:
                action = np.random.choice([0, 1])
            else:
                with torch.no_grad():
                    q_values = model(round_input_tensor)
                    action = q_values.argmax().item()

            # Calculate reward based on state and action, if action is check, reward is 0
            reward = reward_function(round, whole_game, action)  
            
            if action == 1:
                round += 1 # move to the next state if the action is check/fold
            
            if action == 0 or round == 3:
                end = 1
                
            # Determine the next state, if state == 3 it will be irrelevant
            next_round_input_tensor = state_to_tensor(round, whole_game)
            
            # add game to buffer
            buffer.add(action, reward, round_input_tensor, next_round_input_tensor, end)
           
            # Transition to next state or end the episode if terminal
            if  round == 3 or action == 0:
                done = True

        #if episode % 100 == 0:
        #    print(f"Episode {episode}, Loss: {loss.item():.4f}")

    print("Training complete!")

In [64]:
# make the model
# layer dimensions
input_dim = 15  # 1 for game state [0, 1, 2] and 7 for cards (1 card: (1-4, 1-13)), 0 means not known
hidden_1 = 252 # hidden layer 1 size
hidden_2 = 128 # hidden layer 2 size
output_dim = 2  # Two actions: Bet (0) and Check/Fold (1)

# Initialize DQN
dqn = DQN(input_dim, hidden_1, hidden_2, output_dim)
# Initialize DQN for target rewards, it will lag behind 
targetDQN = DQN(input_dim, hidden_1, hidden_2, output_dim)
 
# try and get it on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [65]:
#initialise buffer
buffer = ReplayBuffer(5000)

In [96]:
# train the model
# Hyperparameters
num_of_games = 1000
EPSILON = 0.2         # Exploration probability
ALPHA = 0.0001          # Learning rate
GAMMA = 0.9          # Discount factor
optimizer = optim.Adam(dqn.parameters(), lr=ALPHA)
loss_fn = nn.MSELoss()
trainings = 1
for i in range(trainings):
    print(f"training{i}")
    train_model(dqn, targetDQN, optimizer, loss_fn, num_of_games, buffer, EPSILON, GAMMA, 2)
    train_model(dqn, targetDQN, optimizer, loss_fn, num_of_games, buffer, EPSILON, GAMMA, 1)
    train_model(dqn, targetDQN, optimizer, loss_fn, num_of_games, buffer, EPSILON, GAMMA, 0)

training0
Model output shape: torch.Size([32, 1, 2])
Actions shape before unsqueeze: torch.Size([32])
Actions shape after unsqueeze: torch.Size([32, 1])
torch.Size([32, 1, 2])
torch.Size([32]) torch.Size([32, 2]) torch.Size([32])
tensor([[-0.6350, -0.6722],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [-1.1807, -0.6817],
        [ 0.0299, -0.0309],
        [-0.5371, -0.9403],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [ 0.0299, -0.0309],
        [-0.4315, -0.7774],
        [ 0.02

RuntimeError: The size of tensor a (2) must match the size of tensor b (32) at non-singleton dimension 1

In [82]:
# testing model
dqn.eval()  # Set model to evaluation mode
games = 1
budget = 0
betted = 0
betted4x = 0
betted2x = 0
betted1x = 0
folded = 0
for i in range(games):
    round = 0

    whole_game = generate_game()
    while True:
        state_tensor = state_to_tensor(round, whole_game)
         
        with torch.no_grad():
            q_values = dqn(state_tensor)
            action = q_values.argmax().item()
        print(q_values)
        # if action is no (check/fold) we move to the next round
        if action == 1 and round != 2:
            round += 1
        else:
            budget += reward_function(round, whole_game, action)
            if action == 1 and round == 2:
                folded += 1
                betted += 2 # currently hardcoded
            elif round == 2:
                betted1x += 1
                betted += 3
            elif round == 1:
                betted2x += 1
                betted += 4
            elif round == 0:
                betted4x += 1
                betted += 6
            break
    
    #print(f"Game: {i+1}, Budget: {budget}")

print(f"Total Budget: {budget}")
print(f"Total Betted: {betted}")
print(f"Folded: {folded}-times")
print(f"Betted 4x: {betted4x}-times")
print(f"Betted 2x: {betted2x}-times")
print(f"Betted 1x: {betted1x}-times")

tensor([[-0.2067, -0.2101]])
Total Budget: 0.0
Total Betted: 6
Folded: 0-times
Betted 4x: 1-times
Betted 2x: 0-times
Betted 1x: 0-times
