# Training a Q-learning agent with an epsilon-greedy policy


In [7]:
from Hangman_env import HangedManEnv
import nltk

### Let's create an environment

In [8]:
# Maximum word length. It sizes the Q-table (28**max_length rows) and the
# base-28 encoding of state[0] used throughout the training cells.
max_length = 3
# NOTE(review): this call fails as written. The recorded cell output shows
# "TypeError: HangedManEnv.__init__() missing 1 required positional argument:
# 'word_dictionary'". A word dictionary must be passed here; its expected
# type/content is defined in Hangman_env and is not visible in this notebook
# (presumably words of at most max_length letters -- TODO confirm).
env = HangedManEnv( )

TypeError: HangedManEnv.__init__() missing 1 required positional argument: 'word_dictionary'

# La partie apprentissage

## Définition d'une policy

In [None]:
import numpy as np

# Hyperparameters of the tabular Q-learning agent
learning_rate = 0.1      # step size applied to every TD error
discount_factor = 0.99   # weight given to the bootstrapped next-state value
initial_epsilon = 1.0    # exploration rate at the start of training
epsilon_decay = 0.9975   # multiplicative decay applied to epsilon after each batch
min_epsilon = 0.05       # epsilon is never decayed below this floor

# Q-table: one row per encoded state, one column per action.
# Rows are indexed by the base-28 encoding of state[0] (28 symbols per
# position, max_length positions); columns are the 26 possible actions.
n_encoded_states = 28 ** max_length
n_actions = 26
q_table_shape = (n_encoded_states, n_actions)
q_table = np.zeros(shape=q_table_shape, dtype=np.float64)

def epsilon_greedy_policy(state):
    """
    Pick an action for `state` following an epsilon-greedy rule.

    `state` is a pair: `state[0]` is the row of the global `q_table` to read,
    and `state[1]` selects the actions to exclude from the greedy choice
    (the letters already tried). `state[1]` is used directly as a NumPy
    index into the 26 Q-values -- presumably a boolean mask or a collection
    of action indices; confirm against the environment.

    With probability `epsilon` (module-level global) a uniformly random
    action in [0, 26) is returned. Note that this exploratory draw is NOT
    restricted to untried letters. Otherwise the untried action with the
    highest Q-value is returned.
    """
    exploring = np.random.uniform(0, 1) < epsilon
    if exploring:
        # Exploration: uniform draw over all 26 actions
        return np.random.choice(26)

    # Exploitation: work on a copy so the Q-table itself is never altered
    candidate_values = np.copy(q_table[state[0]])
    # Excluded actions can never win the argmax
    candidate_values[state[1]] = -np.inf
    return np.argmax(candidate_values)

## Entraînement

In [None]:
# Training loop
#
# Tabular Q-learning, organised in batches of episodes:
#   1. play `batch_size` training episodes with the epsilon-greedy policy,
#      updating the global `q_table` after every step;
#   2. decay `epsilon` once (never below `min_epsilon`);
#   3. play `n_tests` purely greedy evaluation episodes and record their
#      mean total reward in `batch_rewards`.

# Training parameters
total_episodes = 100000
batch_size = 500  # Number of episodes in a batch
n_tests = 500  # Number of evaluation runs after each batch
batch_rewards = []  # Mean evaluation reward, one entry per batch

epsilon = initial_epsilon  # epsilon is updated after each batch

# Place values that fold state[0] into a single base-28 integer used as the
# Q-table row index. They never change, so they are built once here instead
# of on every environment step.
place_values = [28**i for i in range(max_length)]

for batch in range(total_episodes // batch_size):
    # ---- Training episodes ----
    for episode in range(batch_size):
        state, _ = env.reset()
        state_index = np.dot(state[0], place_values)

        done = False
        while not done:
            # Action is chosen with the epsilon-greedy policy: either
            # exploratory or the best one according to the current Q-table.
            action = epsilon_greedy_policy((state_index, state[1]))
            # NOTE(review): only the third return value ends the episode; the
            # fourth (presumably Gymnasium's `truncated` flag) is ignored --
            # confirm the environment never relies on it.
            next_state, reward, done, _, _ = env.step(action)
            next_state_index = np.dot(next_state[0], place_values)

            # Q-learning update rule.
            # On the terminal step there is no future return, so the target
            # is the reward alone. Bootstrapping there would be wrong: the
            # row index encodes only state[0], so the row reached on the last
            # step may be shared with a non-terminal state and hold non-zero
            # values.
            if done:
                td_target = reward
            else:
                td_target = reward + discount_factor * np.max(q_table[next_state_index])
            td_error = td_target - q_table[state_index][action]
            q_table[state_index][action] += learning_rate * td_error

            state = next_state
            state_index = next_state_index

    # ---- Exploration decay (once per batch, floored at min_epsilon) ----
    epsilon = max(epsilon * epsilon_decay, min_epsilon)

    # ---- Evaluation after each batch ----
    total_test_rewards = []
    for _ in range(n_tests):
        state, _ = env.reset()
        state_index = np.dot(state[0], place_values)
        total_reward = 0
        done = False
        while not done:
            # Always choose the best allowed action
            # (same as using the policy with epsilon = 0).
            masked_q_values = np.copy(q_table[state_index])
            masked_q_values[state[1]] = -np.inf
            action = np.argmax(masked_q_values)
            state, reward, done, _, _ = env.step(action)
            state_index = np.dot(state[0], place_values)
            total_reward += reward

        total_test_rewards.append(total_reward)

    mean_test_reward = np.mean(total_test_rewards)
    batch_rewards.append(mean_test_reward)


In [None]:
# Display the exploration rate reached once training has finished.
epsilon

In [None]:
import matplotlib.pyplot as plt

# Learning curve: mean greedy-evaluation reward recorded after each batch
plt.plot(batch_rewards)
# Label the axes that were just drawn on, in a single call
plt.gca().set(
    xlabel='Batch',
    ylabel='Mean Evaluation Reward',
    title='Evaluation Reward over Time',
)
plt.show()