In [15]:
!pip install ../pokerrl_env-0.1.3-py3-none-any.whl --force-reinstall

Processing d:\poker project\poker_project\pokerrl_env-0.1.3-py3-none-any.whl
Collecting numpy<2.0.0,>=1.19.5 (from pokerrl-env==0.1.3)
  Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl.metadata (61 kB)
Collecting pytest<7.0.0,>=6.2.4 (from pokerrl-env==0.1.3)
  Using cached pytest-6.2.5-py3-none-any.whl.metadata (7.5 kB)
Collecting setuptools-rust<2.0.0,>=1.5.2 (from pokerrl-env==0.1.3)
  Using cached setuptools_rust-1.9.0-py3-none-any.whl.metadata (9.3 kB)
Collecting attrs>=19.2.0 (from pytest<7.0.0,>=6.2.4->pokerrl-env==0.1.3)
  Using cached attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting iniconfig (from pytest<7.0.0,>=6.2.4->pokerrl-env==0.1.3)
  Using cached iniconfig-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting packaging (from pytest<7.0.0,>=6.2.4->pokerrl-env==0.1.3)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Collecting pluggy<2.0,>=0.12 (from pytest<7.0.0,>=6.2.4->pokerrl-env==0.1.3)
  Using cached pluggy-1.5.0-py3-none-any.whl.metadata

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.13.0 requires numpy<=1.24.3,>=1.22, but you have numpy 1.26.4 which is incompatible.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.10.0 which is incompatible.

[notice] A new release of pip is available: 23.3.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from pokerrl_env import Config, Game, BetLimits, GameTypes

FileNotFoundError: Could not find module 'd:\PokemonDL\deepLearning\lib\site-packages\rusteval\target\release\librusteval.dll' (or one of its dependencies). Try using the full path with constructor syntax.

In [None]:
class PokerNN(nn.Module):
    """Fully connected network with two 128-unit ReLU hidden layers and a linear output layer."""

    def __init__(self, input_size, output_size):
        """Create the three linear layers.

        Args:
            input_size: Number of features in each input vector.
            output_size: Number of values produced for each input vector.
        """
        super().__init__()
        hidden_units = 128
        # Layers are created in forward order (fc1, fc2, fc3).
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> ReLU -> fc3 and return the raw (unactivated) outputs."""
        first_hidden = self.fc1(x).relu()
        second_hidden = self.fc2(first_hidden).relu()
        return self.fc3(second_hidden)

In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of experience tuples with uniform random sampling.

    Every entry is a tuple ``(state, action, *extra)``. The same class therefore
    serves both two-field ``(state, action)`` records and wider transitions such
    as ``(state, action, reward, next_state, done)``. All entries pushed into one
    buffer are expected to have the same number of fields.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of entries retained; must be positive.

        Raises:
            ValueError: If ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        self.buffer = []

    def __len__(self):
        """Return the number of entries currently stored."""
        return len(self.buffer)

    def push(self, state, action, *extra):
        """Append one entry, evicting the oldest entry when the buffer is full.

        Args:
            state: Observation the action was taken in.
            action: Action that was taken.
            *extra: Optional additional fields (for example reward, next_state,
                done) stored alongside ``state`` and ``action``.
        """
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, *extra))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct entries uniformly at random.

        Args:
            batch_size: Number of entries to draw; must not exceed ``len(self)``.

        Returns:
            A tuple with one element per stored field; each element is a tuple of
            length ``batch_size`` holding that field for every sampled entry. For
            two-field entries this unpacks as ``states, actions``.

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``batch_size`` exceeds
                the number of stored entries.
        """
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        batch = [self.buffer[idx] for idx in indices]
        # Transpose the list of entries into per-field tuples.
        return tuple(zip(*batch))

In [None]:
# Build the environment and read the observation/action dimensions from it.
# NOTE(review): PokerEnv is not among the names imported from pokerrl_env in this
# notebook (Config, Game, BetLimits, GameTypes) and is not defined in the visible
# cells — confirm where it is supposed to come from.
env = PokerEnv()
# presumably a Gym-style environment exposing observation_space / action_space — TODO confirm
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Two networks with the same architecture; the target copy starts from the
# policy network's weights.
policy_net = PokerNN(state_size, action_size)
target_net = PokerNN(state_size, action_size)
target_net.load_state_dict(policy_net.state_dict())

# Adam with default hyperparameters, optimising only policy_net's parameters.
policy_optimizer = optim.Adam(policy_net.parameters())
# Separate replay buffers (capacity 10000 each) for the supervised and RL updates.
sl_buffer = ReplayBuffer(10000)
rl_buffer = ReplayBuffer(10000)

In [None]:
def train_sl(batch_size):
    """Run one supervised (cross-entropy) update of ``policy_net`` from ``sl_buffer``.

    Uses the module-level ``sl_buffer``, ``policy_net`` and ``policy_optimizer``.
    The update is skipped while the buffer holds fewer than ``batch_size`` entries.

    Args:
        batch_size: Number of (state, action) pairs to sample for the update.

    Returns:
        The loss of this update as a Python float, or ``None`` if the update
        was skipped.
    """
    if len(sl_buffer.buffer) < batch_size:
        return None
    states, actions = sl_buffer.sample(batch_size)
    # Stack through NumPy first: building a tensor directly from a tuple of
    # per-step arrays is the slow path PyTorch warns about.
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)

    policy_optimizer.zero_grad()
    logits = policy_net(states)
    loss = nn.CrossEntropyLoss()(logits, actions)
    loss.backward()
    policy_optimizer.step()
    return loss.item()

def train_rl(batch_size, gamma=0.99):
    """Run one Q-learning (MSE temporal-difference) update of ``policy_net``.

    Uses the module-level ``rl_buffer``, ``policy_net``, ``target_net`` and
    ``policy_optimizer``. ``rl_buffer.sample`` is expected to return five
    parallel sequences: states, actions, rewards, next_states, dones. The update
    is skipped while the buffer holds fewer than ``batch_size`` entries.

    Args:
        batch_size: Number of transitions to sample for the update.
        gamma: Discount factor applied to the bootstrapped next-state value.

    Returns:
        The loss of this update as a Python float, or ``None`` if the update
        was skipped.
    """
    if len(rl_buffer.buffer) < batch_size:
        return None
    states, actions, rewards, next_states, dones = rl_buffer.sample(batch_size)
    # Stack through NumPy first: building a tensor directly from a tuple of
    # per-step arrays is the slow path PyTorch warns about.
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64)
    rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)
    dones = torch.as_tensor(np.asarray(dones), dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The bootstrap target is treated as a constant: without no_grad, backward()
    # would also flow into target_net and accumulate gradients there that are
    # never used or cleared.
    with torch.no_grad():
        next_q_values = target_net(next_states).max(1)[0]
        # (1 - dones) removes the bootstrap term on terminal transitions.
        expected_q_values = rewards + (gamma * next_q_values * (1 - dones))

    loss = nn.MSELoss()(q_values, expected_q_values)

    policy_optimizer.zero_grad()
    loss.backward()
    policy_optimizer.step()
    return loss.item()

In [None]:
# Training configuration.
num_episodes = 1000
batch_size = 32
target_sync_interval = 10  # episodes between copies of policy_net into target_net

for episode in range(num_episodes):
    state = env.reset()
    done = False

    while not done:
        # Action selection needs no gradients; no_grad avoids building an
        # autograd graph on every environment step.
        # NOTE(review): selection is purely greedy (argmax) with no exploration
        # term — confirm this is intended.
        with torch.no_grad():
            state_tensor = torch.as_tensor(np.asarray(state), dtype=torch.float32)
            action = policy_net(state_tensor).argmax().item()
        # assumes env.step returns (next_state, reward, done, info) — TODO confirm
        next_state, reward, done, _ = env.step(action)

        # Record the step for both the supervised and the RL update.
        sl_buffer.push(state, action)
        rl_buffer.push(state, action, reward, next_state, done)

        state = next_state

        train_sl(batch_size)
        train_rl(batch_size)

    # Periodically refresh the target network and report progress.
    if episode % target_sync_interval == 0:
        target_net.load_state_dict(policy_net.state_dict())
        print(f'Episode {episode} completed')