## NN-4_13_encoding

We represent each card as a (suit, rank) pair.

Layer sizes: 15 -> [252, 128] -> 2

batch size: 32

In [1]:
# import libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random 
import ultimate_for_NN as ultimate
from collections import deque
import NN_functions as NN

In [2]:
# Report the CUDA setup. The device name is only queried when a GPU is
# present, because torch.cuda.get_device_name(0) raises on CPU-only machines.
print(torch.cuda.is_available())  # True if a GPU is available
print(torch.cuda.device_count())  # Number of GPUs available
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Name of the first GPU
else:
    print("No CUDA device available")

True
1
NVIDIA GeForce GTX 1650 Ti


In [None]:
# make model
class DQN(nn.Module):
    """Feed-forward Q-network with two ReLU hidden layers and a linear head.

    The output has ``output_dim`` entries per input row, one per action,
    with no activation applied to the last layer.
    """

    def __init__(self, input_dim, hidden_1, hidden_2, output_dim):
        """Create the three linear layers ``fc1`` -> ``fc2`` -> ``fc3``."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, output_dim)

    def forward(self, x):
        """Return the raw network output for input ``x``."""
        activations = x
        # Both hidden layers are followed by a ReLU non-linearity.
        for hidden_layer in (self.fc1, self.fc2):
            activations = F.relu(hidden_layer(activations))
        # The output layer stays linear.
        return self.fc3(activations)

In [None]:
# function for training the model
def train_model(model, target_model, optimizer, loss_fn, num_of_games, buffer,
                EPSILON = .2, GAMMA = .9, starting_round = 0,
                batch_size = 32, target_update = 500, train_freq = 4):
    """Train ``model`` with Q-learning on freshly generated games.

    Each episode plays one game from ``NN.generate_game()`` beginning at
    ``starting_round``. Actions are chosen epsilon-greedily and every
    transition is stored in ``buffer``. Once every ``train_freq`` episodes a
    single mini-batch gradient step is taken, and every ``target_update``
    episodes ``target_model`` is re-synchronised with ``model``.

    Args:
        model: Network being trained; called on state tensors and expected to
            produce one value per action.
        target_model: Lagging copy of ``model`` used for the bootstrap target.
            Its weights are overwritten with ``model``'s on entry.
        optimizer: Optimizer updating ``model``'s parameters.
        loss_fn: Loss applied to (predicted Q, target Q).
        num_of_games: Number of episodes to play.
        buffer: Replay buffer exposing ``size()``, ``sample(batch_size)`` and
            ``add(action, reward, state, next_state, end)``.
        EPSILON: Probability of picking a random action instead of the greedy one.
        GAMMA: Discount factor applied to the next-state value.
        starting_round: Round index at which every episode starts.
        batch_size: Number of transitions sampled per gradient step.
        target_update: Episodes between target-network synchronisations.
        train_freq: Episodes between gradient steps.

    Returns:
        None. Prints ``"Training complete!"`` when all episodes are done.
    """
    final_round = 3  # an episode ends once the round index reaches this value

    # Build tensors on the device the model lives on, so a model that has been
    # moved to the GPU does not hit a device mismatch. For a CPU model this
    # changes nothing.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def update_model_weights():
        """Run one gradient step on a mini-batch sampled from ``buffer``."""
        if buffer.size() < batch_size:
            return  # not enough stored transitions yet

        # Sample a mini-batch
        actions, rewards, states, next_states, ended = buffer.sample(batch_size)

        # Convert to PyTorch tensors
        actions = torch.tensor(actions, dtype=torch.int64, device=device)
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
        states = torch.tensor(states, dtype=torch.float32, device=device)
        next_states = torch.tensor(next_states, dtype=torch.float32, device=device)
        ended = torch.tensor(ended, dtype=torch.float32, device=device)

        # Q-values of the actions that were actually taken.
        # NOTE(review): squeeze(1) presumes sampled states are shaped
        # (batch, 1, features) — confirm against NN.ReplayBuffer.sample.
        q_values = model(states).squeeze(1).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bellman target: reward plus the discounted best next-state value.
        # The future term is dropped for transitions that ended the episode.
        # Built as a new tensor so `rewards` is not modified in place.
        with torch.no_grad():
            next_q_values = target_model(next_states).squeeze(1).max(1)[0]
            target_q_values = rewards + GAMMA * next_q_values * (1 - ended)

        loss = loss_fn(q_values, target_q_values)

        # Backpropagation & gradient update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Set model to train mode
    model.train()

    # Start the target network from the current weights; it is only ever
    # updated by copying, so it is kept in eval mode.
    target_model.load_state_dict(model.state_dict())
    target_model.eval()

    for episode in range(num_of_games):

        if episode % target_update == 0:
            target_model.load_state_dict(model.state_dict())  # Sync weights for target model

        if episode % train_freq == 0:  # One gradient step every `train_freq` episodes
            update_model_weights()

        round_idx = starting_round
        done = False

        # generate a training game
        whole_game = NN.generate_game()

        while not done:
            # get input data for this round
            round_input_tensor = NN.state_to_tensor(round_idx, whole_game)

            # Epsilon-greedy action selection, we will explore with probability EPSILON
            if np.random.rand() < EPSILON:
                action = np.random.choice([0, 1])
            else:
                with torch.no_grad():
                    q_values = model(round_input_tensor.to(device))
                    action = q_values.argmax().item()

            # The reward is computed for the round the action was taken in,
            # i.e. before the round index advances.
            reward = NN.reward_function(round_idx, whole_game, action)

            if action == 1:
                round_idx += 1  # action 1 moves on to the next round

            # Action 0 or reaching the final round terminates the episode.
            done = bool(action == 0 or round_idx == final_round)

            # Next state; for terminal transitions it is masked out of the target.
            next_round_input_tensor = NN.state_to_tensor(round_idx, whole_game)

            # Store the transition; the last field flags a terminal step (1) or not (0).
            buffer.add(action, reward, round_input_tensor, next_round_input_tensor, int(done))

    print("Training complete!")

In [None]:
# Build the online and target networks.
# Layer dimensions:
#   input_dim  - 1 value for the game state [0, 1, 2] plus 7 cards, each
#                encoded as (suit 1-4, rank 1-13); 0 means "not known"
#   hidden_1   - width of the first hidden layer
#   hidden_2   - width of the second hidden layer
#   output_dim - two actions: Bet (0) and Check/Fold (1)
input_dim, hidden_1, hidden_2, output_dim = 15, 252, 128, 2

# `dqn` is the network being trained; `targetDQN` is the lagging copy used
# for target rewards. They are constructed in this order.
dqn, targetDQN = (
    DQN(input_dim, hidden_1, hidden_2, output_dim) for _ in range(2)
)

# Prefer the GPU when one is available.
# NOTE(review): `device` is not used anywhere in the visible code, so the
# networks stay on the CPU unless they are moved explicitly.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Initialise the replay buffer that is handed to every train_model call below.
# NOTE(review): 5000 is presumably the buffer capacity — confirm against NN.ReplayBuffer.
buffer = NN.ReplayBuffer(5000)

In [None]:
# Run the training schedule.
# Hyperparameters
num_of_games = 10000  # Episodes per train_model call
EPSILON = 0.2         # Exploration probability
ALPHA = 0.0001        # Learning rate
GAMMA = 0.9           # Discount factor
loss_fn = nn.MSELoss()
optimizer = optim.Adam(dqn.parameters(), lr=ALPHA)
trainings = 5

# Every pass trains on episodes starting at round 2, then round 1, then
# round 0, reusing the same networks, optimizer and replay buffer.
for i in range(trainings):
    print(f"training{i}")
    for first_round in (2, 1, 0):
        train_model(dqn, targetDQN, optimizer, loss_fn, num_of_games, buffer,
                    EPSILON, GAMMA, first_round)