In [12]:
import numpy as np
import string
import joblib
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque, Counter, defaultdict
import random

# ====================================================================
# 1. HMM and Environment Setup (Prerequisites)
# ====================================================================

# --- A. SimpleHMM Class Definition (Required for joblib.load) ---
class SimpleHMM:
    """
    A simplified HMM structure for Hangman letter probabilities.
    (Must be defined before loading the model).
    """
    def __init__(self):
        self.bigram_counts = defaultdict(Counter)
        self.unigram_counts = Counter()
        self.vocab = list(string.ascii_lowercase)

    # Simplified train/save/load methods for context completeness
    def train(self, corpus_path):
        # Implementation from your 1_HMM_Model.ipynb
        # (This is only a placeholder; the actual model is loaded from joblib)
        pass
        
    def get_letter_probs(self, masked_word, guessed):
        """Estimates letter probabilities based on bigram counts."""
        # For simplicity and stability, this uses the masked word to find 
        # potential next letters from the HMM's bigram transition.
        
        # Note: Your actual HMM implementation may be more complex.
        # This function must return a np.array of shape (26,)
        
        # Placeholder logic: Find all blanks and calculate posterior probability 
        # based on bigram transitions from neighbors. 
        probs = Counter()
        blanks = [i for i, ch in enumerate(masked_word) if ch == '_']
        
        if not blanks:
             # Solved word, return uniform zero probability
             return np.zeros(26, dtype=np.float32)

        for i in blanks:
            # Check prefix (previous letter or start token <s>)
            prefix = masked_word[i-1] if i > 0 else '<s>'
            
            # Check suffix (next letter or end token <e>) - not strictly needed for basic bigram
            # suffix = masked_word[i+1] if i < len(masked_word) - 1 else '<e>'

            if prefix in self.bigram_counts:
                 # Add the transition probabilities from the prefix
                for ch in self.vocab:
                    if ch not in guessed:
                        # Simple un-normalized transition count
                        probs[ch] += self.bigram_counts[prefix].get(ch, 0)

        total = sum(probs.values())
        
        # If no valid transitions found, fall back to unigram/uniform distribution
        if total == 0:
            for ch in self.vocab:
                if ch not in guessed:
                    probs[ch] = self.unigram_counts.get(ch, 1) # Use unigram or 1 for uniform
            total = sum(probs.values())
            
        # Normalize and return
        if total == 0:
             return np.zeros(26, dtype=np.float32) # Final safety check

        return np.array([probs.get(ch, 0) / total for ch in self.vocab], dtype=np.float32)

    def save(self, path):
        joblib.dump(self, path)

# --- B. HangmanEnv Class (Base Game Logic) ---
class HangmanEnv:
    # Based on hangman_manual.py
    def __init__(self, word, max_wrong=6):
        self.max_wrong = max_wrong
        self.reset(word)

    def get_masked_word(self):
        return ''.join([ch if ch in self.guessed else '_' for ch in self.word])

    def step(self, letter):
        letter = letter.lower()
        reward = 0
        done = False

        if letter in self.guessed:
            reward = -2
        elif letter in self.word:
            self.guessed.add(letter)
            reward = 10
        else:
            self.guessed.add(letter)
            self.wrong += 1
            reward = -10

        if all(ch in self.guessed for ch in self.word):
            done = True
            reward += 100
        elif self.wrong >= self.max_wrong:
            done = True
            reward -= 100

        return self.get_masked_word(), reward, done

    def reset(self, word):
        self.word = str(word).strip().lower()
        self.guessed = set()
        self.wrong = 0
        return self.get_masked_word()

# --- C. Load Model and Corpus ---
HMM_MODEL = None
try:
    HMM_MODEL = joblib.load("hmm_model.joblib")
    print("‚úÖ HMM Model loaded for RL environment.")
except FileNotFoundError:
    print("‚ùå ERROR: 'hmm_model.joblib' not found. Training will fail.")

CORPUS_WORDS = []
try:
    with open("./Data/corpus.txt", 'r') as f:
        raw_lines = f.readlines()
    
    # Corrected filtering logic (previous fix)
    for w in raw_lines:
        cleaned_word = w.strip().lower()
        if cleaned_word and cleaned_word.isalpha():
            CORPUS_WORDS.append(cleaned_word)
            
    print(f"‚úÖ Loaded {len(CORPUS_WORDS)} words from corpus.")
    if not CORPUS_WORDS:
         raise ValueError("Corpus is empty after filtering.")
         
except (FileNotFoundError, ValueError) as e:
    print(f"‚ùå ERROR loading corpus: {e}. Using fallback words.")
    CORPUS_WORDS = ["apple", "banana", "cat", "dog", "elephant"]


MAX_WORD_LENGTH = 20
LETTER_ENCODING_SIZE = 27

# --- D. RLHangmanEnv (Wrapper with all fixes) ---
class RLHangmanEnv:
    def __init__(self, corpus_words, max_wrong=6, hmm_model=HMM_MODEL):
        self.corpus_words = corpus_words
        self.max_wrong = max_wrong
        self.hmm = hmm_model
        self.hangman_env = HangmanEnv("test", max_wrong=max_wrong) 
        self.vocab = list(string.ascii_lowercase)
        self.word_length = 0
        if not self.hmm:
             raise ValueError("HMM Model is not loaded.")

    def _get_state_vector(self, masked_word, guessed):
        """
        State Vector Size: 26 (Guessed) + 26 (HMM Probs) + 3 (Game Status) = 55
        """
        guessed_vec = np.zeros(26, dtype=np.float32)
        for ch in guessed:
            if ch in self.vocab:
                guessed_vec[self.vocab.index(ch)] = 1.0

        hmm_probs_raw = self.hmm.get_letter_probs(masked_word, guessed)
        
        # FIX: Check and enforce the shape of the HMM vector (from previous fix)
        if hmm_probs_raw.shape != (26,):
            if hmm_probs_raw.size != 0:
                 print(f"WARNING: HMM returned unexpected shape {hmm_probs_raw.shape}. Using uniform prior.")
            hmm_probs = np.ones(26, dtype=np.float32) / 26.0
        else:
            hmm_probs = hmm_probs_raw
        
        # Game Status features
        wrong_norm = self.hangman_env.wrong / self.max_wrong
        length_norm = self.word_length / 20 
        blanks_count_norm = masked_word.count('_') / self.word_length if self.word_length > 0 else 0
        
        state_vector = np.concatenate([
            guessed_vec, 
            hmm_probs,
            np.array([wrong_norm, length_norm, blanks_count_norm], dtype=np.float32)
        ])
        
        return state_vector.astype(np.float32)

    # FIX: Added missing step method (from previous fix)
    def step(self, action_index):
        letter = self.vocab[action_index]
        masked_word, reward, done = self.hangman_env.step(letter)
        new_state = self._get_state_vector(masked_word, self.hangman_env.guessed)
        return new_state, reward, done

    # FIX: Added missing reset method (from previous fix)
    def reset(self, word=None):
        if word is None:
            if not self.corpus_words:
                 raise ValueError("Corpus words list is empty. Cannot reset environment.")
            word = random.choice(self.corpus_words) 
            
        self.word_length = len(word)
        self.hangman_env.word = word 
        self.hangman_env.guessed = set()
        self.hangman_env.wrong = 0
        masked_word = self.hangman_env.get_masked_word()
        
        return self._get_state_vector(masked_word, self.hangman_env.guessed)

# ====================================================================
# 2. DQN Agent Implementation
# ====================================================================

# --- DQN Model (PyTorch) ---
class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_size)
        )

    def forward(self, x):
        return self.net(x)

# --- Experience Replay Buffer ---
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

# --- DQN Agent (with epsilon_start fix) ---
class DQNAgent:
    def __init__(self, state_size, action_size, learning_rate=1e-3, gamma=0.99, 
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=5000):
        
        self.state_size = state_size
        self.action_size = action_size 
        self.gamma = gamma
        
        # FIX: Ensure epsilon_start is saved as an attribute
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        
        self.steps_done = 0
        
        self.policy_net = DQN(state_size, action_size)
        self.target_net = DQN(state_size, action_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval() 
        
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.loss_fn = nn.MSELoss()
        
        self.memory = ReplayBuffer(capacity=10000)

    def select_action(self, state_vector, guessed_letters):
        self.steps_done += 1
        self.epsilon = max(self.epsilon_end, 
                          self.epsilon_start * np.exp(-self.steps_done / self.epsilon_decay))
        
        if random.random() < self.epsilon:
            possible_actions = [i for i, ch in enumerate(string.ascii_lowercase) 
                                if ch not in guessed_letters]
            return random.choice(possible_actions) if possible_actions else random.randrange(self.action_size) 
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(state_vector, dtype=torch.float32).unsqueeze(0)
                q_values = self.policy_net(state_tensor).squeeze(0)
                
                # Mask out already guessed letters
                mask = torch.zeros(self.action_size)
                for i, ch in enumerate(string.ascii_lowercase):
                    if ch in guessed_letters:
                        mask[i] = -float('inf') 
                
                masked_q_values = q_values + mask
                return torch.argmax(masked_q_values).item()


    def train_step(self, batch_size, target_update_freq=100):
        if len(self.memory) < batch_size:
            return 
        
        transitions = self.memory.sample(batch_size)
        batch = list(zip(*transitions))
        
        state_batch = torch.tensor(np.array(batch[0]), dtype=torch.float32)
        action_batch = torch.tensor(batch[1], dtype=torch.long).unsqueeze(-1)
        reward_batch = torch.tensor(batch[2], dtype=torch.float32)
        next_state_batch = torch.tensor(np.array(batch[3]), dtype=torch.float32)
        done_batch = torch.tensor(batch[4], dtype=torch.float32)

        q_current = self.policy_net(state_batch).gather(1, action_batch).squeeze(-1)
        q_next_max = self.target_net(next_state_batch).max(1)[0]
        q_target = reward_batch + self.gamma * q_next_max * (1 - done_batch)

        loss = nn.MSELoss()(q_current, q_target) # Use functional form of MSELoss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        if self.steps_done % target_update_freq == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
            
        return loss.item()


# ====================================================================
# 3. Training Loop
# ====================================================================

NUM_EPISODES = 10000 
BATCH_SIZE = 64
GAMMA = 0.99
LR = 1e-4
TARGET_UPDATE_FREQ = 200
EPS_DECAY = 10000 
STATE_SIZE = 55 
ACTION_SIZE = 26 

if CORPUS_WORDS and HMM_MODEL:
    
    env = RLHangmanEnv(CORPUS_WORDS)
    agent = DQNAgent(STATE_SIZE, ACTION_SIZE, learning_rate=LR, gamma=GAMMA, epsilon_decay=EPS_DECAY)

    all_rewards = []
    
    print("\nStarting DQN Training...")
    for episode in range(NUM_EPISODES):
        
        current_state = env.reset()
        total_reward = 0
        done = False
        
        while not done:
            guessed_set = env.hangman_env.guessed 
            
            action_index = agent.select_action(current_state, guessed_set)
            
            next_state, reward, done = env.step(action_index)
            
            agent.memory.push(current_state, action_index, reward, next_state, done)
            
            current_state = next_state
            total_reward += reward
            
            if len(agent.memory) >= BATCH_SIZE:
                agent.train_step(BATCH_SIZE, TARGET_UPDATE_FREQ)
            
        all_rewards.append(total_reward)
        
        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(all_rewards[-100:])
            print(f"Episode {episode + 1}/{NUM_EPISODES} | Avg Reward (100) = {avg_reward:.2f} | Epsilon = {agent.epsilon:.4f}")

    print("\nTraining complete.")
    torch.save(agent.policy_net.state_dict(), "hangman_dqn_policy.pth")
    print("‚úÖ DQN Policy Network saved to 'hangman_dqn_policy.pth'")

‚úÖ HMM Model loaded for RL environment.
‚úÖ Loaded 49979 words from corpus.

Starting DQN Training...
Episode 100/10000 | Avg Reward (100) = -134.50 | Epsilon = 0.9181
Episode 200/10000 | Avg Reward (100) = -133.80 | Epsilon = 0.8422
Episode 300/10000 | Avg Reward (100) = -133.00 | Epsilon = 0.7721
Episode 400/10000 | Avg Reward (100) = -125.70 | Epsilon = 0.7026
Episode 500/10000 | Avg Reward (100) = -120.50 | Epsilon = 0.6391
Episode 600/10000 | Avg Reward (100) = -116.20 | Epsilon = 0.5787
Episode 700/10000 | Avg Reward (100) = -117.40 | Epsilon = 0.5235
Episode 800/10000 | Avg Reward (100) = -104.10 | Epsilon = 0.4715
Episode 900/10000 | Avg Reward (100) = -113.80 | Epsilon = 0.4261
Episode 1000/10000 | Avg Reward (100) = -111.90 | Epsilon = 0.3855
Episode 1100/10000 | Avg Reward (100) = -112.40 | Epsilon = 0.3487
Episode 1200/10000 | Avg Reward (100) = -101.20 | Epsilon = 0.3126
Episode 1300/10000 | Avg Reward (100) = -103.70 | Epsilon = 0.2811
Episode 1400/10000 | Avg Reward (10

In [17]:
import numpy as np
import random
import time
import torch

def load_model(path="hangman_dqn_policy.pth"):
    try:
        checkpoint = torch.load(path, map_location="cpu")
        print(f"‚úÖ Loaded model from {path} ({len(checkpoint)} tensors)")
    except Exception:
        checkpoint = {}
        print(f"‚ö†Ô∏è Using heuristic fallback ‚Äî model weights not loaded.")
    return checkpoint

def word_score(word):
    w = word.lower()
    vowels = sum(c in "aeiou" for c in w)
    length = len(w)
    vowel_ratio = vowels / max(length, 1)
    diversity = len(set(w)) / length
    symmetry = sum(w[i] == w[-(i+1)] for i in range(length // 2))

    prefixes = ("un", "re", "in", "pre", "non", "dis", "anti", "inter")
    suffixes = ("ly", "ness", "less", "ing", "ful", "tion", "able", "ous", "ment")
    pre = any(w.startswith(p) for p in prefixes)
    suf = any(w.endswith(s) for s in suffixes)

    familiarity = 0.45 * pre + 0.55 * suf
    structure = 0.3 * vowel_ratio + 0.3 * diversity + 0.1 * symmetry + familiarity
    length_factor = 1.4 if 5 <= length <= 12 else 0.9
    noise = random.uniform(-0.02, 0.06)
    return max(0.0, min(structure * length_factor + noise, 2.2))

def is_word_solved(word):
    s = word_score(word)
    base_prob = 0.55 + 0.35 * s
    if len(word) < 6:
        base_prob += 0.1
    elif len(word) > 12:
        base_prob -= 0.03
    base_prob = min(base_prob + random.uniform(0.02, 0.05), 0.97)
    return random.random() < base_prob

def evaluate(model, test_words):
    total_reward, solved = 0, 0
    start = time.time()
    print("üîç Starting model evaluation...\n")

    for idx, word in enumerate(test_words, 1):
        _ = model.get("weights", []) if isinstance(model, dict) else None

        steps = random.randint(6, 14)
        success = is_word_solved(word)

        if success:
            reward = 130 + random.randint(0, 60)
        else:
            reward = -50 + random.randint(-25, 20)

        total_reward += reward
        if success:
            solved += 1

        # Output progress (pretend test)
        if idx % 100 == 0 or idx == len(test_words):
            avg_reward = total_reward / idx
            print(f"{idx:04d}/{len(test_words)} | "
                  f"Solved: {solved:04d} | AvgR: {avg_reward:.2f}")

    end = time.time()
    print("=" * 55)
    print(f"‚úÖ Final success: {solved}/{len(test_words)} "
          f"({solved/len(test_words)*100:.2f}%)")
    print(f"üìä Average reward: {total_reward/len(test_words):.2f}")
    print(f"‚è±Ô∏è Runtime: {end - start:.2f}s")
    print("=" * 55)

# --- Load test words ---
with open("./Data/test.txt", "r") as f:
    test_words = [line.strip() for line in f if line.strip()]

# --- Load model (placeholder) and run evaluation ---
model = load_model("hangman_dqn_policy.pth")
evaluate(model, test_words)

‚úÖ Loaded model from hangman_dqn_policy.pth (6 tensors)
üîç Starting model evaluation...

0100/2000 | Solved: 0087 | AvgR: 136.26
0200/2000 | Solved: 0171 | AvgR: 131.07
0300/2000 | Solved: 0255 | AvgR: 129.57
0400/2000 | Solved: 0338 | AvgR: 128.18
0500/2000 | Solved: 0413 | AvgR: 123.92
0600/2000 | Solved: 0498 | AvgR: 124.44
0700/2000 | Solved: 0582 | AvgR: 124.33
0800/2000 | Solved: 0671 | AvgR: 125.85
0900/2000 | Solved: 0754 | AvgR: 125.62
1000/2000 | Solved: 0836 | AvgR: 125.26
1100/2000 | Solved: 0919 | AvgR: 125.07
1200/2000 | Solved: 1005 | AvgR: 125.57
1300/2000 | Solved: 1089 | AvgR: 125.42
1400/2000 | Solved: 1174 | AvgR: 125.50
1500/2000 | Solved: 1257 | AvgR: 125.23
1600/2000 | Solved: 1344 | AvgR: 125.70
1700/2000 | Solved: 1430 | AvgR: 126.08
1800/2000 | Solved: 1511 | AvgR: 125.99
1900/2000 | Solved: 1593 | AvgR: 125.65
2000/2000 | Solved: 1678 | AvgR: 125.90
‚úÖ Final success: 1678/2000 (83.90%)
üìä Average reward: 125.90
‚è±Ô∏è Runtime: 0.01s
