In [None]:
!ls

new_ai_player.py  sample_data  words.txt


In [None]:
"""
Junfeng Li

Project Overview (Non-Parallel Model):
1. This implementation trains a deep learning model for the Hangman game using PyTorch on Google Colab.
2. Training data is generated dynamically by simulating the game process, encoding word states and guessed letters.
3. The model utilizes an LSTM architecture to predict the next letter during the game.

Key Learnings and Thought Process:
- Data Generation: Experimented with methods to generate balanced training samples, ensuring a diverse dataset for effective learning.
- Training Pipeline: Gained experience in setting up PyTorch pipelines, including tensor creation, DataLoader usage, and batch processing.
- Model Design: Used LSTM to capture the sequence nature of the game, learning the relationship between word states and guessed letters.

Challenges and Future Improvements:
1. The data generation process is time-intensive for larger datasets, motivating the need for optimization.
2. Further work could involve exploring alternative architectures or hyperparameter tuning to improve accuracy and efficiency.

References:
1.https://docs.python.org/3/library/multiprocessing.html
2.https://pytorch.org/tutorials/recipes/recipes/custom_dataset_transforms_loader.html
3.https://pytorch.org/tutorials/recipes/recipes/loading_data_recipe.html
4.https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
5.https://www.geeksforgeeks.org/loading-data-in-pytorch/
6.https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

Note: Luke managed the local testing and might modify something based on his own preference

"""
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import time
from new_ai_player import HangmanLSTM

def generate_training_data(word_list, num_samples, max_word_length=10):
    """
    Generate training data by simulating Hangman games over a word list.

    Each simulated game picks a random word and reveals its letters one at a
    time in random order. Every guess yields one example whose inputs describe
    the board *before* the guess is applied and whose target is the letter
    guessed next, so the target is never already visible in the inputs.

    Args:
        word_list (list): Words to sample from. Only the letters a-z are used
            as guess targets; any other character is shown from the start.
        num_samples (int): The number of games to simulate. Each game
            contributes several examples, so the returned list is longer.
        max_word_length (int): The maximum length of words to handle; passed
            through to encode_word_state.

    Returns:
        list: Tuples of (word state matrix, guessed-letter vector, target
        letter index in 0-25).
    """
    print("Generating training data...")
    data = []  # This list will store all the training samples

    for _ in range(num_samples):  # One simulated game per iteration
        word = random.choice(word_list)  # Randomly select a word from the list
        # Only a-z can be turned into a 0-25 target index.
        word_letters = {char for char in word if 'a' <= char <= 'z'}
        guessed_letters = set()  # Start with an empty set of guessed letters
        # Hide the guessable letters; anything else (e.g. a hyphen) stays visible.
        obscured_word = ['_' if char in word_letters else char for char in word]

        # Keep guessing letters until all unique letters are guessed
        while len(guessed_letters) < len(word_letters):
            # Sorting makes the pick reproducible under random.seed(); set
            # iteration order alone is not stable between interpreter runs.
            next_letter = random.choice(sorted(word_letters - guessed_letters))

            # Encode the board as the player sees it BEFORE this guess.
            # Encoding after the reveal would leak the target into the inputs.
            word_input = encode_word_state(''.join(obscured_word), max_word_length)
            guessed_input = encode_guessed_letters(guessed_letters)
            target_letter = ord(next_letter) - ord('a')  # Convert the letter to an index (0-25)

            # Apply the guess: record it and reveal every occurrence.
            guessed_letters.add(next_letter)
            for i, char in enumerate(word):
                if char == next_letter:
                    obscured_word[i] = char

            # Keep the original weighting: a guess that leaves the word
            # incomplete is stored three times, the finishing guess once.
            repeats = 3 if len(guessed_letters) < len(word_letters) else 1
            data.extend([(word_input, guessed_input, target_letter)] * repeats)

    print(f"Generated {len(data)} training samples.")  # Print the total number of samples
    return data  # Return the training examples

def encode_word_state(word_display, max_word_length):
    """
    Build the one-hot matrix describing the visible state of the word.

    Rows are character positions and columns 0-25 are the letters a-z, with
    column 26 reserved for a hidden position ('_'). Characters beyond
    max_word_length are dropped, unused rows stay zero, and any character that
    is neither '_' nor a-z leaves its row zero.

    Args:
        word_display (str): The current view of the word (e.g., "_ppl_" for "apple").
        max_word_length (int): Number of rows in the returned matrix.

    Returns:
        np.ndarray: Matrix of shape (max_word_length, 27).
    """
    blank_column = 26
    first_letter = ord('a')
    encoded = np.zeros((max_word_length, 27))
    visible = word_display[:max_word_length]

    for row in range(len(visible)):
        symbol = visible[row]
        if symbol == '_':
            encoded[row, blank_column] = 1  # Hidden position
            continue
        if 'a' <= symbol <= 'z':
            encoded[row, ord(symbol) - first_letter] = 1  # Revealed letter

    return encoded

def encode_guessed_letters(guessed_letters):
    """
    Convert guessed letters into a multi-hot vector over a-z.

    Entries that are not a single lowercase letter a-z are ignored, the same
    rule encode_word_state applies to the board. Without that check a
    character below 'a' gives a negative offset, which either wraps around
    and marks the wrong letter or raises IndexError.

    Args:
        guessed_letters (set): Letters guessed so far.

    Returns:
        np.ndarray: A vector of size 26 with 1 at the position of every
        guessed letter and 0 elsewhere.
    """
    guessed_vector = np.zeros(26)  # Create a zero vector of size 26
    for letter in guessed_letters:  # Process each guessed letter
        if isinstance(letter, str) and len(letter) == 1 and 'a' <= letter <= 'z':
            guessed_vector[ord(letter) - ord('a')] = 1  # Mark the corresponding position
    return guessed_vector  # Return the encoded vector

def train_model(word_list, model_path='large_hangman_model_normal_parallel.pth', num_samples=10000, epochs=25, batch_size=32, lr=0.001):
    """
    Train the HangmanLSTM model and save its weights.

    Training data is produced by generate_training_data, packed into tensors,
    and fed to the model in shuffled mini-batches. The model is optimized with
    Adam against a cross-entropy loss over the 26 letters.

    Args:
        word_list (list): A list of words for training.
        model_path (str): File path to save the trained model. Missing parent
            directories are created before training starts.
        num_samples (int): Passed to generate_training_data.
        epochs (int): Number of passes over the generated data.
        batch_size (int): Number of samples in each training batch.
        lr (float): Learning rate for optimization.

    Returns:
        None

    Raises:
        ValueError: If no training samples were generated.
    """
    import os  # Local import: only needed to prepare the output directory

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Choose GPU if available
    print(f"Using device: {device}")

    # Create the output directory now, so a missing folder cannot make
    # torch.save fail after the whole training run has finished.
    if isinstance(model_path, (str, os.PathLike)):
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

    # Generate training data
    print("Generating training data...")
    start_time_data = time.time()
    data = generate_training_data(word_list, num_samples)  # Create training examples
    end_time_data = time.time()
    print(f"Data generation took {end_time_data - start_time_data:.2f} seconds.")

    # An empty dataset would otherwise surface as a ZeroDivisionError when
    # the average loss is computed.
    if not data:
        raise ValueError("No training samples were generated; check word_list and num_samples.")

    # Prepare tensors
    print("Preparing data tensors...")
    inputs_word = torch.tensor(np.array([item[0] for item in data]), dtype=torch.float32)  # Word inputs
    inputs_guessed = torch.tensor(np.array([item[1] for item in data]), dtype=torch.float32)  # Guessed letters
    targets = torch.tensor(np.array([item[2] for item in data]), dtype=torch.long)  # Target letters

    # Create DataLoader
    print("Creating DataLoader...")
    dataset = torch.utils.data.TensorDataset(inputs_word, inputs_guessed, targets)  # Package data into a dataset
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Initialize model, optimizer, and loss function
    model = HangmanLSTM().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    print("Starting training loop...")
    model.train()
    for epoch in range(epochs):  # Iterate through epochs
        total_loss = 0
        for word_batch, guessed_batch, target_batch in dataloader:  # Process each batch
            word_batch = word_batch.to(device)
            guessed_batch = guessed_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()  # Reset gradients
            outputs = model(word_batch, guessed_batch)  # Get predictions
            loss = criterion(outputs, target_batch)  # Calculate loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update model parameters
            total_loss += loss.item()  # Accumulate total loss

        avg_loss = total_loss / len(dataloader)  # Mean of the per-batch losses
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")  # Print loss for this epoch

    # Save the trained model
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

if __name__ == "__main__":
    # Load the word list from file: one word per line, blank lines skipped,
    # everything lowercased.
    words_file = "words.txt"
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(words_file, 'r', encoding='utf-8') as f:
            word_list = [line.strip().lower() for line in f if line.strip()]  # Read words from file
    except FileNotFoundError:
        print(f"Error: {words_file} not found.")  # Handle missing file
        # SystemExit works everywhere; the bare exit() helper is only
        # installed by the site module.
        raise SystemExit(1) from None
    else:
        print(f"Loaded {len(word_list)} words from {words_file}.")

    # Train the model and report the total wall-clock time
    print("Starting non-parallel training...")
    start_time = time.time()
    train_model(word_list, model_path="trained_models/large_hangman_model_normal_parallel.pth", num_samples=20000, epochs=25, batch_size=32, lr=0.001)
    end_time = time.time()
    print(f"Non-parallel training completed in {end_time - start_time:.2f} seconds.")


Loaded 852 words from words.txt.
Starting non-parallel training...
Using device: cuda
Generating training data...
Generating training data...
Generated 245249 training samples.
Data generation took 0.74 seconds.
Preparing data tensors...
Creating DataLoader...
Starting training loop...
Epoch 1/25, Loss: 2.9377
Epoch 2/25, Loss: 2.8326
Epoch 3/25, Loss: 2.8113
Epoch 4/25, Loss: 2.8065
Epoch 5/25, Loss: 2.8026
Epoch 6/25, Loss: 2.7997
Epoch 7/25, Loss: 2.7980
Epoch 8/25, Loss: 2.7950
Epoch 9/25, Loss: 2.7932
Epoch 10/25, Loss: 2.7908
Epoch 11/25, Loss: 2.7880
Epoch 12/25, Loss: 2.7850
Epoch 13/25, Loss: 2.7821
Epoch 14/25, Loss: 2.7799
Epoch 15/25, Loss: 2.7759
Epoch 16/25, Loss: 2.7728
Epoch 17/25, Loss: 2.7695
Epoch 18/25, Loss: 2.7662
Epoch 19/25, Loss: 2.7627
Epoch 20/25, Loss: 2.7589
Epoch 21/25, Loss: 2.7559
Epoch 22/25, Loss: 2.7519
Epoch 23/25, Loss: 2.7474
Epoch 24/25, Loss: 2.7439
Epoch 25/25, Loss: 2.7405
Model saved to large_hangman_model_normal_parallel.pth
Non-parallel trai

In [None]:
"""
Junfeng Li

Project Overview (Parallel Model):
1. This implementation extends the non-parallel Hangman training model by introducing parallelized data generation on Google Colab.
2. Training data generation is distributed across multiple processes using Python's multiprocessing module.
3. The model architecture, training pipeline, and dataset handling remain consistent with the non-parallel version.

Key Learnings and Thought Process:
- Parallel Processing: Applied multiprocessing to accelerate training data generation and measured whether it significantly reduces execution time on Google Colab.
- Scalability: Designed the parallelization to handle larger datasets efficiently.
- Performance Comparison: Demonstrated the advantages of parallel processing over sequential data generation, especially for time-intensive tasks.

Challenges and Future Improvements:
1. Managing inter-process communication and memory usage for even larger datasets can be further optimized.
2. While the focus here was on data generation, future iterations could explore parallelism in training or model evaluation.

References:
1.https://docs.python.org/3/library/multiprocessing.html
2.https://pytorch.org/tutorials/recipes/recipes/custom_dataset_transforms_loader.html
3.https://pytorch.org/tutorials/recipes/recipes/loading_data_recipe.html
4.https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
5.https://www.geeksforgeeks.org/loading-data-in-pytorch/
6.https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

Note: Luke managed the local testing and might modify something based on his own preference

"""

import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import time
from multiprocessing import Pool
from new_ai_player import HangmanLSTM

def generate_training_data(word_list, num_samples, max_word_length=10):
    """
    Generate training data by simulating Hangman games over a word list.

    Each simulated game picks a random word and reveals its letters one at a
    time in random order. Every guess yields one example whose inputs describe
    the board *before* the guess is applied and whose target is the letter
    guessed next, so the target is never already visible in the inputs.

    Args:
        word_list (list): Words to sample from. Only the letters a-z are used
            as guess targets; any other character is shown from the start.
        num_samples (int): Number of games to simulate. Each game contributes
            several examples, so the returned list is longer.
        max_word_length (int): Maximum word length to consider; passed through
            to encode_word_state.

    Returns:
        list: Tuples of (word state matrix, guessed-letter vector, target
        letter index in 0-25).
    """
    data = []  # This list will store all training samples

    for _ in range(num_samples):  # One simulated game per iteration
        word = random.choice(word_list)  # Randomly select a word from the list
        # Only a-z can be turned into a 0-25 target index.
        word_letters = {char for char in word if 'a' <= char <= 'z'}
        guessed_letters = set()  # Start with no letters guessed
        # Hide the guessable letters; anything else (e.g. a hyphen) stays visible.
        obscured_word = ['_' if char in word_letters else char for char in word]

        # Keep guessing letters until all unique letters are guessed
        while len(guessed_letters) < len(word_letters):
            # Sorting makes the pick reproducible under random.seed(); set
            # iteration order alone is not stable between interpreter runs.
            next_letter = random.choice(sorted(word_letters - guessed_letters))

            # Encode the board as the player sees it BEFORE this guess.
            # Encoding after the reveal would leak the target into the inputs.
            word_input = encode_word_state(''.join(obscured_word), max_word_length)
            guessed_input = encode_guessed_letters(guessed_letters)
            target_letter = ord(next_letter) - ord('a')  # Convert letter to index (0-25)

            # Apply the guess: record it and reveal every occurrence.
            guessed_letters.add(next_letter)
            for i, char in enumerate(word):
                if char == next_letter:
                    obscured_word[i] = char

            # Keep the original weighting: a guess that leaves the word
            # incomplete is stored three times, the finishing guess once.
            repeats = 3 if len(guessed_letters) < len(word_letters) else 1
            data.extend([(word_input, guessed_input, target_letter)] * repeats)

    return data  # Return the generated training examples

def encode_word_state(word_display, max_word_length):
    """
    Encode the visible word as a (max_word_length, 27) one-hot matrix.

    Column 26 marks a hidden position ('_'); columns 0-25 mark the letters
    a-z. Positions past max_word_length are ignored, and characters that are
    neither '_' nor a-z produce an all-zero row.

    Args:
        word_display (str): Current view of the word (e.g., "_ppl_" for "apple").
        max_word_length (int): Maximum number of characters to encode.

    Returns:
        np.ndarray: The one-hot encoding of the word state.
    """
    matrix = np.zeros((max_word_length, 27))

    # zip stops at the shorter input, which caps the word at max_word_length.
    for position, glyph in zip(range(max_word_length), word_display):
        if glyph == '_':
            column = 26
        elif 'a' <= glyph <= 'z':
            column = ord(glyph) - ord('a')
        else:
            column = None  # Unsupported character: leave the row empty

        if column is not None:
            matrix[position, column] = 1

    return matrix

def encode_guessed_letters(guessed_letters):
    """
    Convert guessed letters into a multi-hot vector over a-z.

    Entries that are not a single lowercase letter a-z are ignored, the same
    rule encode_word_state applies to the board. Without that check a
    character below 'a' gives a negative offset, which either wraps around
    and marks the wrong letter or raises IndexError.

    Args:
        guessed_letters (set): Set of guessed letters.

    Returns:
        np.ndarray: A vector of size 26 with 1 at the position of every
        guessed letter and 0 elsewhere.
    """
    guessed_vector = np.zeros(26)  # Initialize a zero vector of size 26
    for letter in guessed_letters:  # Iterate through the guessed letters
        is_lowercase_letter = isinstance(letter, str) and len(letter) == 1 and 'a' <= letter <= 'z'
        if is_lowercase_letter:
            guessed_vector[ord(letter) - ord('a')] = 1  # Mark the corresponding position
    return guessed_vector  # Return the encoded vector

def parallel_generate_training_data(args):
    """
    Worker entry point that unpacks one argument tuple and generates data.

    The pool hands each worker a single object, so the three parameters of
    generate_training_data travel packed in one tuple.

    Args:
        args (tuple): (word list, sample count, max word length).

    Returns:
        list: Training data generated by the worker process.
    """
    words, sample_count, length_limit = args
    chunk = generate_training_data(words, sample_count, length_limit)
    return chunk

def train_model_parallel(word_list, model_path='large_hangman_model_super_parallel.pth', num_samples=10000, epochs=25, batch_size=32, lr=0.001, num_workers=4):
    """
    Train the HangmanLSTM model using parallel data generation.

    The requested number of samples is split across a pool of worker
    processes, each running generate_training_data on its share. The combined
    data is then packed into tensors and used to train the model with Adam
    and a cross-entropy loss. Only data generation is parallel; training
    itself runs in this process.

    Args:
        word_list (list): List of words for training.
        model_path (str): Path to save the trained model. Missing parent
            directories are created before training starts.
        num_samples (int): Total passed to generate_training_data, summed
            over all workers.
        epochs (int): Number of training epochs.
        batch_size (int): Number of samples per batch.
        lr (float): Learning rate for the optimizer.
        num_workers (int): Number of worker processes for parallel data generation.

    Returns:
        None

    Raises:
        ValueError: If num_workers is less than 1 or no training samples
            were generated.
    """
    import os  # Local import: only needed to prepare the output directory

    if num_workers < 1:
        raise ValueError("num_workers must be at least 1.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
    print(f"Using device: {device}")

    # Create the output directory now, so a missing folder cannot make
    # torch.save fail after the whole training run has finished.
    if isinstance(model_path, (str, os.PathLike)):
        model_dir = os.path.dirname(model_path)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

    # Generate training data in parallel
    print("Generating training data in parallel...")
    # Split the workload as evenly as possible. Plain floor division would
    # drop the remainder, so the first `remainder` workers take one extra.
    base_size, remainder = divmod(num_samples, num_workers)
    chunk_sizes = [base_size + (1 if index < remainder else 0) for index in range(num_workers)]
    pool_args = [(word_list, size, 10) for size in chunk_sizes]  # Prepare arguments for workers

    with Pool(num_workers) as pool:  # Create a pool of worker processes
        data_chunks = pool.map(parallel_generate_training_data, pool_args)  # Generate data in parallel

    # Combine all chunks into a single dataset
    data = [item for chunk in data_chunks for item in chunk]
    print(f"Generated {len(data)} training samples.")

    # An empty dataset would otherwise surface as a ZeroDivisionError when
    # the average loss is computed.
    if not data:
        raise ValueError("No training samples were generated; check word_list and num_samples.")

    # Prepare data tensors for training
    print("Preparing data tensors...")
    inputs_word = torch.tensor(np.array([item[0] for item in data]), dtype=torch.float32)
    inputs_guessed = torch.tensor(np.array([item[1] for item in data]), dtype=torch.float32)
    targets = torch.tensor(np.array([item[2] for item in data]), dtype=torch.long)

    # Create a DataLoader to handle batching
    print("Creating DataLoader...")
    dataset = torch.utils.data.TensorDataset(inputs_word, inputs_guessed, targets)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Initialize the model, optimizer, and loss function
    model = HangmanLSTM().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    print("Starting training loop...")
    model.train()
    for epoch in range(epochs):  # Iterate through all epochs
        total_loss = 0
        for word_batch, guessed_batch, target_batch in dataloader:  # Iterate through all batches
            word_batch = word_batch.to(device)
            guessed_batch = guessed_batch.to(device)
            target_batch = target_batch.to(device)

            optimizer.zero_grad()  # Clear previous gradients
            outputs = model(word_batch, guessed_batch)  # Forward pass
            loss = criterion(outputs, target_batch)  # Calculate loss
            loss.backward()  # Backward pass
            optimizer.step()  # Update model parameters
            total_loss += loss.item()  # Accumulate total loss

        avg_loss = total_loss / len(dataloader)  # Mean of the per-batch losses
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")  # Display epoch loss

    # Save the trained model to a file
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}")

if __name__ == "__main__":
    # Load the word list from the file: one word per line, blank lines
    # skipped, everything lowercased.
    words_file = "words.txt"
    try:
        # Explicit encoding so the result does not depend on the platform default.
        with open(words_file, 'r', encoding='utf-8') as f:
            word_list = [line.strip().lower() for line in f if line.strip()]  # Read and clean words
    except FileNotFoundError:
        print(f"Error: {words_file} not found.")  # Handle file not found error
        # SystemExit works everywhere; the bare exit() helper is only
        # installed by the site module.
        raise SystemExit(1) from None
    else:
        print(f"Loaded {len(word_list)} words from {words_file}.")

    # Train the model with parallel data generation and report the total
    # wall-clock time
    print("Starting parallel training...")
    start_time = time.time()
    train_model_parallel(word_list, model_path="trained_models/large_hangman_model_super_parallel.pth", num_samples=20000, epochs=25, batch_size=32, lr=0.001, num_workers=4)
    end_time = time.time()
    print(f"Parallel training completed in {end_time - start_time:.2f} seconds.")


Loaded 852 words from words.txt.
Starting parallel training...
Using device: cuda
Generating training data in parallel...
Generated 244091 training samples.
Preparing data tensors...
Creating DataLoader...
Starting training loop...
Epoch 1/25, Loss: 2.9479
Epoch 2/25, Loss: 2.8439
Epoch 3/25, Loss: 2.8109
Epoch 4/25, Loss: 2.8061
Epoch 5/25, Loss: 2.8026
Epoch 6/25, Loss: 2.7988
Epoch 7/25, Loss: 2.7963
Epoch 8/25, Loss: 2.7958
Epoch 9/25, Loss: 2.7946
Epoch 10/25, Loss: 2.7910
Epoch 11/25, Loss: 2.7888
Epoch 12/25, Loss: 2.7840
Epoch 13/25, Loss: 2.7802
Epoch 14/25, Loss: 2.7763
Epoch 15/25, Loss: 2.7718
Epoch 16/25, Loss: 2.7674
Epoch 17/25, Loss: 2.7634
Epoch 18/25, Loss: 2.7574
Epoch 19/25, Loss: 2.7520
Epoch 20/25, Loss: 2.7473
Epoch 21/25, Loss: 2.7417
Epoch 22/25, Loss: 2.7378
Epoch 23/25, Loss: 2.7330
Epoch 24/25, Loss: 2.7285
Epoch 25/25, Loss: 2.7239
Model saved to large_hangman_model_super_parallel.pth
Parallel training completed in 604.43 seconds.
