In [1]:
%load_ext autoreload
%autoreload 1

import numpy as np
from wordle_env import WordleEnv

# Intended number of parallel environments; raise it on machines with more than
# 8 CPU cores. NOTE(review): no visible cell consumes `nenvs` — confirm it is still needed.
nenvs = 8    # change this if you have more than 8 CPU ;)
# Single environment instance used by the inspection cells below.
env = WordleEnv()




In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Display the environment's vocabulary matrix. In the saved output it is an
# int32 array with 5 columns per row — presumably one encoded 5-letter word per row; confirm.
env.game_voc_matrix

array([[22, 26, 18, 21,  7],
       [ 6, 21,  4, 17,  8],
       [19, 15,  4, 23,  8]], dtype=int32)

In [4]:
# Submit one guess, given as an array of 5 integer codes. The saved output is a
# 4-tuple: an int32 array of shape (2, 6, 5) whose first plane holds the guess
# in row 0, followed by 0.0, False and {} — presumably
# (observation, reward, done, info); confirm against WordleEnv.step.
env.step(np.array([24, 24, 14, 5, 7]))

(array([[[24, 24, 14,  5,  7],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]],
 
        [[ 1,  1,  1,  1,  1],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0],
         [ 0,  0,  0,  0,  0]]], dtype=int32),
 0.0,
 False,
 {})

In [5]:
# from wrappers import nature_dqn_env

# nenvs = 2
# env = nature_dqn_env(nenvs=nenvs)





In [7]:
# Decode integer codes back into letters. The lookup table is fetched once
# instead of constructing a new WordleEnv for every element of the list.
index2letter = WordleEnv().tokenizer.index2letter
[index2letter[idx] for idx in [7, 19, 16, 5, 7]]

['e', 'q', 'n', 'c', 'e']

In [5]:
class Agent(nn.Module):
    '''
    Convolutional actor-critic network.

    A stack of three 3x3, stride-2 convolutions (32, 64, 64 channels) feeds two
    independent fully connected heads: one produces action logits, the other a
    scalar state-value estimate.
    '''

    def __init__(self, state_shape, n_actions):
        '''
        input:
            state_shape - shape of a single observation, (channels, dim1, dim2);
                          a 2-element shape (channels, size) is treated as a
                          square size x size input
            n_actions - number of discrete actions (width of the logits head)
        raises:
            ValueError - if a spatial dimension becomes smaller than the
                         convolution kernel before the stack is complete
        '''
        super().__init__()

        kernel_size, stride = 3, 2
        conv_channels = [state_shape[0], 32, 64, 64]

        # Track both spatial dimensions so non-square inputs get the right
        # flattened feature size.
        height = state_shape[1]
        width = state_shape[2] if len(state_shape) > 2 else height

        layers = []
        for in_channels, out_channels in zip(conv_channels[:-1], conv_channels[1:]):
            if min(height, width) < kernel_size:
                # Fail at construction time with a readable message rather than
                # deep inside the first forward pass.
                raise ValueError(
                    "state_shape {} is too small for {} stride-{} convolutions "
                    "with kernel size {}".format(
                        tuple(state_shape), len(conv_channels) - 1, stride, kernel_size
                    )
                )
            height = self._conv_out_size(height, kernel_size, stride)
            width = self._conv_out_size(width, kernel_size, stride)

            conv2d = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride)
            nn.init.orthogonal_(conv2d.weight, np.sqrt(2))
            nn.init.zeros_(conv2d.bias)
            layers.append(conv2d)
            layers.append(nn.ReLU())
        # nn.Flatten keeps the batch axis and flattens everything after it.
        layers.append(nn.Flatten())
        self.bottom = nn.Sequential(*layers)

        n_features = conv_channels[-1] * height * width

        self.logits_head = nn.Sequential(
            nn.Linear(n_features, 256), nn.ReLU(), nn.Linear(256, n_actions)
        )
        self.V_head = nn.Sequential(
            nn.Linear(n_features, 128), nn.ReLU(), nn.Linear(128, 1)
        )

    @staticmethod
    def _conv_out_size(size, kernel_size, stride):
        '''Output length of an unpadded, undilated convolution along one axis.'''
        return (size - kernel_size) // stride + 1

    def forward(self, states):
        '''
        input:
            states - tensor, (batch_size x channels x dim1 x dim2)
        output:
            logits - tensor, logits of action probabilities for your actor policy, (batch_size x num_actions)
            V - tensor, critic estimation, (batch_size)
        '''
        features = self.bottom(states)
        logits = self.logits_head(features)
        # Drop only the trailing singleton axis; a bare squeeze() would also
        # remove the batch axis when batch_size == 1.
        V = self.V_head(features).squeeze(-1)
        return logits, V

In [9]:
from torch.distributions import Categorical

class Policy:
    '''
    Stochastic policy that samples actions from the categorical distribution
    defined by the logits of the wrapped actor-critic agent.
    '''

    def __init__(self, agent):
        '''
        input:
            agent - callable (typically an nn.Module) mapping a batch of states
                    to a pair (logits, values)
        '''
        self.agent = agent

    def _agent_device(self):
        '''Device of the agent's first parameter, or CPU if it has none.'''
        parameters = getattr(self.agent, "parameters", None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return first.device if first is not None else torch.device("cpu")

    def act(self, inputs):
        '''
        input:
            inputs - numpy array, (batch_size x channels x width x height)
        output: dict containing keys ['actions', 'logits', 'log_probs', 'values']:
            'actions' - selected actions, numpy, (batch_size)
            'logits' - actions logits, tensor, (batch_size x num_actions)
            'log_probs' - log probs of selected actions, tensor, (batch_size)
            'values' - critic estimations, tensor, (batch_size)
        '''
        # Place the batch on the same device as the wrapped network instead of
        # relying on a notebook-level DEVICE constant.
        inputs = torch.as_tensor(inputs, dtype=torch.float32, device=self._agent_device())

        # Use the agent this policy was constructed with, not a global `agent`.
        logits, values = self.agent(inputs)

        # Building the distribution from logits uses log-softmax internally, so
        # no manual clipping is needed, and log_prob keeps the batch axis even
        # when batch_size == 1.
        distribution = Categorical(logits=logits)
        actions = distribution.sample()
        log_probs = distribution.log_prob(actions)

        return {
            "actions": actions.cpu().numpy(),
            "logits": logits,
            "log_probs": log_probs,
            "values": values,
        }

In [None]:
# NOTE(review): `Model`, `obs`, `n_actions` and `DEVICE` are not defined in any
# visible cell (the network class defined in this notebook is `Agent`) —
# confirm they exist before running this cell on a fresh kernel.
# The network is built from the shape of `obs` without its leading axis,
# moved to DEVICE, and wrapped in a Policy.
model = Model(obs.shape[1:], n_actions).to(DEVICE)
policy = Policy(model)

In [None]:
# NOTE(review): `EnvRunner` and `simulator` are not defined in any visible cell
# — confirm where they come from. nsteps=5 is presumably the number of
# environment steps collected per rollout; verify against EnvRunner.
runner = EnvRunner(simulator, policy, nsteps=5)