In [4]:
import numpy as np
import random

# Define the vocabulary: each word is one action, identified by its list index.
vocab = ["hello", "world", "foo", "bar", "baz"]
# Number of available actions (columns of the Q-table).
vocab_size = len(vocab)

# Initialize Q-table
def initialize_q_table(vocab_size, seq_length):
    """Create an all-zero Q-table.

    Rows are positions in the sequence (states) and columns are vocabulary
    indices (actions).

    Args:
        vocab_size: Number of columns (one per word).
        seq_length: Number of rows (one per sequence position).

    Returns:
        A ``numpy.ndarray`` of shape ``(seq_length, vocab_size)`` filled
        with zeros.
    """
    table_shape = (seq_length, vocab_size)
    return np.zeros(table_shape)

def get_reward(sequence):
    """Return the reward for a generated sequence.

    The reward is a constant ``1.0`` for any non-empty sequence (the
    original ratio ``len(sequence) / len(sequence)`` always evaluates to
    1.0), and ``0.0`` for an empty sequence instead of raising
    ``ZeroDivisionError``.

    NOTE: because the reward does not depend on the words chosen, it gives
    the learner no signal to prefer one word over another.

    Args:
        sequence: The words generated so far (any sized container).

    Returns:
        ``1.0`` if ``sequence`` has at least one element, otherwise ``0.0``.
    """
    # Guard the empty case: an episode with sequence length 0 produces [].
    if len(sequence) == 0:
        return 0.0
    return 1.0

def choose_action(state, q_table, epsilon, vocab_size):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    Args:
        state: Row index into ``q_table``.
        q_table: 2-D array of Q-values indexed as ``[state, action]``.
        epsilon: Probability threshold for taking a random action.
        vocab_size: Number of actions; random picks fall in
            ``[0, vocab_size - 1]``.

    Returns:
        The chosen action index: a uniformly random one when the drawn
        number is below ``epsilon``, otherwise the index of the largest
        Q-value in the row for ``state``.
    """
    should_explore = random.uniform(0, 1) < epsilon
    if not should_explore:
        # Greedy branch: take the best-known action for this state.
        return np.argmax(q_table[state])
    # Exploration branch: any action, each equally likely.
    return random.randint(0, vocab_size - 1)

def update_q_table(state, action, reward, next_state, q_table, alpha, gamma):
    """Apply one Q-learning update to ``q_table`` in place.

    The entry ``q_table[state, action]`` is moved towards
    ``reward + gamma * max(q_table[next_state])`` by a fraction ``alpha``.

    Args:
        state: Row index of the state the action was taken in.
        action: Column index of the action taken.
        reward: Reward observed for the transition.
        next_state: Row index of the resulting state.
        q_table: 2-D array of Q-values; modified in place.
        alpha: Learning rate.
        gamma: Discount factor applied to the next state's best value.

    Returns:
        None.
    """
    # Value of the greedy action in the successor state.
    greedy_next_value = q_table[next_state, np.argmax(q_table[next_state])]
    target = reward + gamma * greedy_next_value
    current = q_table[state, action]
    q_table[state, action] += alpha * (target - current)

def generate_sequence(q_table, vocab, seq_length, epsilon, alpha, gamma):
    """Run one training episode and return the generated word sequence.

    At each position (state) an action is chosen epsilon-greedily, the
    matching word is appended, and the Q-table is updated in place.

    Changes from the earlier version:
      * The number of actions comes from ``len(vocab)`` rather than the
        module-level ``vocab_size`` global, so the function honours the
        vocabulary it is actually given.
      * The final step is now learned too. Previously the loop exited
        before updating, leaving the last Q-table row permanently zero.
        The terminal transition has no successor state, so its target is
        just the immediate reward.

    Args:
        q_table: 2-D array indexed ``[state, action]``; modified in place.
        vocab: Sequence of words; an action is an index into it.
        seq_length: Number of words to generate (rows of ``q_table``).
        epsilon: Exploration probability passed to ``choose_action``.
        alpha: Learning rate.
        gamma: Discount factor for non-terminal updates.

    Returns:
        List of the ``seq_length`` words chosen (empty if
        ``seq_length <= 0``).
    """
    n_actions = len(vocab)
    sequence = []
    for state in range(seq_length):
        action = choose_action(state, q_table, epsilon, n_actions)
        sequence.append(vocab[action])
        reward = get_reward(sequence)
        next_state = state + 1
        if next_state < seq_length:
            update_q_table(state, action, reward, next_state, q_table, alpha, gamma)
        else:
            # Terminal step: no next state to bootstrap from, so the TD
            # target is the reward alone.
            q_table[state, action] += alpha * (reward - q_table[state, action])
    return sequence

def interactive_q_learning():
    """Prompt for hyperparameters, train the Q-table, and print the results.

    Reads the sequence length, learning rate, discount factor, exploration
    rate and episode count from standard input (in that order), runs that
    many episodes via ``generate_sequence``, printing each episode's
    sequence and reward, and finally prints the learned Q-table.

    Side effects:
        Rebinds the module-level ``vocab_size`` to ``len(vocab)``.

    Raises:
        ValueError: If any entered value cannot be parsed as the expected
            number type.
    """
    global vocab_size
    # Re-sync the global in case the vocabulary changed since import time.
    vocab_size = len(vocab)

    # Hyperparameters, asked for in a fixed order.
    max_len = int(input("Enter maximum sequence length: "))
    learning_rate = float(input("Enter learning rate (alpha): "))
    discount = float(input("Enter discount factor (gamma): "))
    exploration = float(input("Enter exploration rate (epsilon): "))
    total_episodes = int(input("Enter number of training episodes: "))

    q_values = initialize_q_table(vocab_size, max_len)

    # One generated sequence per episode; the table is updated in place.
    for episode_number in range(1, total_episodes + 1):
        generated = generate_sequence(
            q_values, vocab, max_len, exploration, learning_rate, discount
        )
        episode_reward = get_reward(generated)
        print(f"Episode {episode_number}: Generated sequence {generated} with reward {episode_reward}")

    print("Training completed.")
    print("Final Q-table:")
    print(q_values)

# Start the interactive training session only when this code is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    interactive_q_learning()


Enter maximum sequence length: 5
Enter learning rate (alpha): 0.5
Enter discount factor (gamma): 0.5
Enter exploration rate (epsilon): 0.5
Enter number of training episodes: 10
Episode 1: Generated sequence ['foo', 'hello', 'hello', 'bar', 'hello'] with reward 1.0
Episode 2: Generated sequence ['bar', 'bar', 'baz', 'bar', 'hello'] with reward 1.0
Episode 3: Generated sequence ['bar', 'bar', 'baz', 'bar', 'hello'] with reward 1.0
Episode 4: Generated sequence ['baz', 'world', 'baz', 'bar', 'bar'] with reward 1.0
Episode 5: Generated sequence ['bar', 'foo', 'bar', 'hello', 'hello'] with reward 1.0
Episode 6: Generated sequence ['hello', 'world', 'world', 'bar', 'hello'] with reward 1.0
Episode 7: Generated sequence ['bar', 'bar', 'baz', 'bar', 'hello'] with reward 1.0
Episode 8: Generated sequence ['bar', 'bar', 'bar', 'bar', 'world'] with reward 1.0
Episode 9: Generated sequence ['bar', 'bar', 'hello', 'bar', 'foo'] with reward 1.0
Episode 10: Generated sequence ['bar', 'bar', 'hello', 