## NN with embedded encoding

Size 8 embedding with padding

layers: [57] -> [64, 32] -> 2

Batch size 32

Just CPU, as it's faster for smaller models: no data transfer is needed

Implement Prioritized Experience Replay

Save models

In [1]:
# import libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random 
import ultimate_for_NN as ultimate
from collections import deque
import NN_functions as NN
import time
from importlib import reload
reload(NN)

<module 'NN_functions' from 'c:\\Users\\neomi\\OneDrive\\Desktop\\PDF\\Poker Ultimate\\Ultimate_Texas_Holdem\\NN-Neo_igranje\\NN_functions.py'>

In [2]:
# make model
class DQNEmbedding(nn.Module):
    """Q-network that embeds card indices and appends a game-round feature.

    Every card index is looked up in a learned embedding table; index 0 is the
    padding token and always maps to the zero vector. The per-card embeddings
    are flattened, the game-round value is appended as one extra feature, and
    the result goes through two ReLU hidden layers to give one Q-value per
    action.

    Args:
        num_cards: Number of distinct real cards. The table has
            ``num_cards + 1`` rows because row 0 is reserved for padding.
        embedding_dim: Length of each card's embedding vector.
        hidden_1: Width of the first hidden layer.
        hidden_2: Width of the second hidden layer.
        output_dim: Number of Q-values (actions) produced.
        num_card_slots: Number of card positions in one state. Defaults to 7,
            the value that used to be hard-coded in the first linear layer.
    """

    def __init__(self, num_cards, embedding_dim, hidden_1, hidden_2, output_dim,
                 num_card_slots=7):
        super().__init__()

        # Embedding layer: num_cards real cards + 1 padding token (index 0)
        self.card_embedding = nn.Embedding(num_cards + 1, embedding_dim, padding_idx=0)

        # Input = all card embeddings flattened + 1 game-round feature
        self.fc1 = nn.Linear(embedding_dim * num_card_slots + 1, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, output_dim)

    def forward(self, card_indices, game_round):
        """Return Q-values of shape ``(batch, output_dim)``.

        Args:
            card_indices: Integer tensor whose first dimension is the batch and
                whose remaining dimensions hold ``num_card_slots`` card indices
                per sample (e.g. ``(batch, 7)`` or ``(batch, 1, 7)``).
            game_round: Tensor with one round value per sample, shaped
                ``(batch, 1)`` or ``(batch,)``; any numeric dtype.
        """
        # Shape: (batch, ..., embedding_dim)
        embedded_cards = self.card_embedding(card_indices)

        # Flatten everything except the batch dimension:
        # (batch, num_card_slots * embedding_dim)
        flat_cards = embedded_cards.reshape(embedded_cards.size(0), -1)

        # Accept a plain (batch,) vector of rounds as well as (batch, 1)
        if game_round.dim() == 1:
            game_round = game_round.unsqueeze(1)
        # Cast explicitly so integer round tensors concatenate with the float
        # embeddings without depending on implicit type promotion
        round_feature = game_round.to(flat_cards.dtype)

        # Combine cards with game state
        x = torch.cat([flat_cards, round_feature], dim=1)

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
# function for training the model
def train_model(model, target_model, optimizer, loss_fn, num_of_games, buffer, 
                EPSILON = .2, starting_round = 0, batch_size = 32,
                target_update = 500, train_freq = 1, gamma = .9):
    """Train ``model`` with Q-learning on freshly generated games.

    At the start of every episode the target network is re-synchronised if the
    episode index is a multiple of ``target_update``, and one mini-batch update
    is run if it is a multiple of ``train_freq``. A new game is then generated
    and played from ``starting_round`` with an epsilon-greedy policy; every
    transition is stored in ``buffer``. Action 1 (check/fold) advances the
    round; the episode ends when action 0 is chosen or the round reaches 3.

    Args:
        model: Network being trained; called as ``model(cards, round)``.
        target_model: Lagging copy of ``model`` used for bootstrap targets.
            Its weights are overwritten with ``model``'s on entry.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Loss called as ``loss_fn(q_values, target_q_values)``.
        num_of_games: Number of episodes to play.
        buffer: Replay buffer exposing ``size()``, ``sample(n)`` and
            ``add(action, reward, cards, round, next_cards, next_round, end)``.
        EPSILON: Probability of picking a uniformly random action.
        starting_round: Round index every episode starts from.
        batch_size: Mini-batch size; no update happens while the buffer holds
            fewer transitions than this.
        target_update: Episodes between target-network synchronisations.
        train_freq: Episodes between mini-batch updates.
        gamma: Discount factor applied to the next state's best Q-value.
    """

    def update_model_weights():
        """Run one optimisation step on a mini-batch sampled from the buffer."""
        if buffer.size() < batch_size:
            return  # not enough stored transitions yet

        # Sample a mini-batch
        (actions, rewards, cards, rounds,
         next_cards, next_rounds, ended) = buffer.sample(batch_size)

        # Convert to PyTorch tensors
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        cards = torch.tensor(cards, dtype=torch.long)
        rounds = torch.tensor(rounds, dtype=torch.long).squeeze(1)
        next_cards = torch.tensor(next_cards, dtype=torch.long)
        next_rounds = torch.tensor(next_rounds, dtype=torch.long).squeeze(1)
        ended = torch.tensor(ended, dtype=torch.float32)

        # Q-values of the actions that were actually taken
        q_values = model(cards, rounds).squeeze(1).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bellman target: reward + gamma * max_a' Q_target(s', a'); the
        # bootstrap term is masked out for transitions that ended the episode
        with torch.no_grad():
            next_q_values = target_model(next_cards, next_rounds).squeeze(1).max(1)[0]
            target_q_values = rewards + gamma * next_q_values * (1 - ended)

        loss = loss_fn(q_values, target_q_values)

        # Backpropagation & gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Set model to train mode
    model.train()

    # Start the target network from the current weights; it is only ever
    # evaluated, never optimised
    target_model.load_state_dict(model.state_dict())
    target_model.eval()

    for episode in range(num_of_games):

        if episode % target_update == 0:
            target_model.load_state_dict(model.state_dict())  # Sync weights for target model

        if episode % train_freq == 0:  # Train every train_freq episodes
            update_model_weights()

        # `game_round` rather than `round` so the builtin is not shadowed
        game_round = starting_round
        done = False

        # generate a training game
        whole_game = NN.generate_game()

        while not done:
            # get input data for this round
            cards_input_tensor, round_input_tensor = NN.state_to_tensor_embedding(game_round, whole_game)

            # Epsilon-greedy action selection, we will explore with probability EPSILON
            if np.random.rand() < EPSILON:
                action = np.random.choice([0, 1])
            else:
                with torch.no_grad():
                    action = model(cards_input_tensor, round_input_tensor).argmax().item()

            # Reward for taking `action` in the current round of this game
            reward = NN.reward_function(game_round, whole_game, action)

            if action == 1:
                game_round += 1  # move to the next state if the action is check/fold

            # Terminal when action 0 was taken or the last round has been passed
            done = bool(action == 0 or game_round == 3)

            # TODO: get expected reward and save game only if it deviates a lot from the q value

            # Next state; for terminal transitions it is masked out of the target
            next_cards_input_tensor, next_round_input_tensor = NN.state_to_tensor_embedding(game_round, whole_game)

            # add transition to buffer (terminal flag stored as 0/1)
            buffer.add(action, reward, cards_input_tensor, round_input_tensor,
                       next_cards_input_tensor, next_round_input_tensor, int(done))

    print("Training complete!")

In [None]:
# make the model - layer dimensions
num_cards = 52  # distinct cards; DQNEmbedding adds one extra embedding row for the padding token
embedding = 8  # length of each card's embedding vector (passed to DQNEmbedding as embedding_dim)
hidden_1 = 64 # hidden layer 1 size
hidden_2 = 32 # hidden layer 2 size
output_dim = 2  # Two actions: Bet (0) and Check/Fold (1)

In [13]:
# Online Q-network: the one whose weights the optimizer updates
emb_model = DQNEmbedding(
    num_cards=num_cards,
    embedding_dim=embedding,
    hidden_1=hidden_1,
    hidden_2=hidden_2,
    output_dim=output_dim,
)
# Target Q-network: same architecture, lags behind the online network
emb_target_model = DQNEmbedding(
    num_cards=num_cards,
    embedding_dim=embedding,
    hidden_1=hidden_1,
    hidden_2=hidden_2,
    output_dim=output_dim,
)

# Adam on the online network only, with a mean-squared-error loss
ALPHA = 0.0001          # Learning rate
optimizer = optim.Adam(emb_model.parameters(), lr=ALPHA)
loss_fn = nn.MSELoss()

In [6]:
# initialise the replay buffer shared by every train_model call below
# NOTE(review): 5000 is presumably the maximum number of stored transitions — confirm in NN_functions
buffer = NN.ReplayBufferEmbedding(5000)

## Training model

In [None]:
#save model
#torch.save(emb_model.state_dict(), "embedded.pth")  # commented out so I don't accidentally overwrite my saved model

In [15]:
# load model
# weights_only=True limits unpickling to tensors and plain containers, which is all
# a state_dict contains; map_location="cpu" keeps the weights on the CPU regardless
# of the device they were saved from (this notebook trains on CPU only).
emb_model.load_state_dict(torch.load("embedded.pth", map_location="cpu", weights_only=True))

  emb_model.load_state_dict(torch.load("embedded.pth"))


<All keys matched successfully>

Different trainings: high EPSILON with low THRESHOLD, low EPSILON with high THRESHOLD;
set to train on late game or early game

In [None]:
# train the model
# Hyperparameters, epsilon control is important
num_of_games = 10000
EPSILON = 0          # Exploration probability
GAMMA = 0.9          # Discount factor (informational: it is not passed to train_model below)

trainings = 10

# One (starting_round, number_of_games) entry per pass: rounds 2 and 1 get
# num_of_games episodes each, full games from round 0 get twice as many
training_schedule = [
    (2, num_of_games),
    (1, num_of_games),
    (0, num_of_games * 2),
]

start = time.time()
for i in range(trainings):
    print(f"training{i}")
    for first_round, games in training_schedule:
        train_model(emb_model, emb_target_model, optimizer, loss_fn, games, buffer, EPSILON, first_round)
end = time.time()
print(end - start)

training0
Training complete!
Training complete!
Training complete!
training1
Training complete!
Training complete!
Training complete!
training2
Training complete!
Training complete!
Training complete!
training3
Training complete!
Training complete!
Training complete!
training4
Training complete!
Training complete!
Training complete!
training5
Training complete!
Training complete!
Training complete!
training6
Training complete!
Training complete!
Training complete!
training7
Training complete!
Training complete!
Training complete!
training8
Training complete!
Training complete!
Training complete!
training9
Training complete!
Training complete!
Training complete!
1465.5833735466003


In [None]:
# testing the model
# Evaluate emb_model, the network built and trained in this notebook
# (the name `dqn` is never defined here and raises NameError on a fresh kernel).
NN.testing_embedding(emb_model)

Total Budget: -4159.0
Total Betted: 29095
Folded: 906-times
Betted 4x: 0-times
Betted 2x: 1-times
Betted 1x: 9093-times
