# In [None]:
import numpy as np
import tensorflow as tf
from collections import deque
import random

# --- Environment dimensions ---
# A state is the vector [difficulty, success_rate, avg_time, attempts].
state_size = 4
# The available actions are [Increase, Maintain, Decrease].
action_size = 3

# --- Learning hyperparameters ---
gamma = 0.95  # discount factor applied to the estimated next-state value
learning_rate = 0.001  # step size handed to the Adam optimizer
batch_size = 32  # number of transitions sampled per replay() call

# --- Epsilon-greedy exploration schedule ---
epsilon = 1.0  # current exploration rate (probability of a random action)
epsilon_decay = 0.995  # multiplicative factor applied to epsilon by replay()
epsilon_min = 0.01  # decay stops once epsilon is no longer above this value

# --- Replay buffer ---
# Bounded FIFO of (state, action, reward, next_state, done) tuples; once it
# holds memory_size entries, appending a new one evicts the oldest.
memory_size = 2000
memory = deque(maxlen=memory_size)

def build_model():
    """Build and compile a fresh Q-network.

    The network takes inputs of width ``state_size``, passes them through two
    hidden ReLU layers of 24 units each, and emits one linear output per
    action (``action_size`` outputs). It is compiled with the Adam optimizer
    at ``learning_rate`` and a mean-squared-error loss.

    Returns:
        A compiled ``tf.keras.Sequential`` model with newly initialised
        weights.
    """
    model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input object instead of the
        # legacy ``input_dim`` layer argument, which newer Keras releases warn
        # against. The resulting layers and weight shapes are the same.
        tf.keras.Input(shape=(state_size,)),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(action_size, activation='linear'),
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
    )
    return model

# Online Q-network: queried by act() and trained by replay().
model = build_model()
# Target network: same architecture, used by replay() to value next states.
target_model = build_model()
# Start the target network from the exact weights of the online network.
target_model.set_weights(model.get_weights())

def remember(state, action, reward, next_state, done):
    """Record one transition in the replay buffer.

    The five arguments are packed, in the order given, into a tuple that is
    appended to the module-level ``memory`` deque.
    """
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

def act(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A uniform random number is compared against the module-level ``epsilon``:
    when it does not exceed ``epsilon`` a random action index in
    ``range(action_size)`` is returned; otherwise the index of the largest
    value in the first row of ``model``'s prediction for ``state`` is returned.
    """
    explore = np.random.rand() <= epsilon
    if not explore:
        # Greedy branch: only the first row of the prediction is consulted,
        # so ``state`` is presumably a single-row batch -- confirm with callers.
        predicted = model.predict(state, verbose=0)
        first_row = predicted[0]
        return np.argmax(first_row)
    return random.randrange(action_size)

def replay():
    """Train the online network on one random minibatch from the replay buffer.

    Does nothing until ``memory`` holds at least ``batch_size`` transitions.
    Otherwise ``batch_size`` transitions are sampled without replacement and
    the Q-learning target for each one is computed:

        target = reward                                   if done
        target = reward + gamma * max_a Q_target(s', a)   otherwise

    Only the Q-value of the action actually taken is moved toward its target;
    the other outputs keep the online network's current predictions so they
    contribute zero error. The whole minibatch is evaluated with a single
    prediction per network and fitted with a single gradient step, rather than
    one predict/fit round-trip per transition.

    After training, the module-level ``epsilon`` is decayed by
    ``epsilon_decay`` and never allowed to drop below ``epsilon_min``.
    """
    global epsilon
    if len(memory) < batch_size:
        return

    minibatch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*minibatch)

    # Stack the stored states into one matrix per field so each network is
    # queried once. Assumes each stored state is a row vector of width
    # state_size (shape (1, state_size) or (state_size,)).
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.asarray(actions, dtype=int)
    rewards = np.asarray(rewards, dtype=float)
    terminal = np.asarray(dones, dtype=bool)

    # Value next states with the target network; terminal transitions get
    # the bare reward with no bootstrapped term.
    next_q = target_model.predict(next_states, verbose=0)
    best_next = np.max(next_q, axis=1)
    targets = np.where(terminal, rewards, rewards + gamma * best_next)

    # Start from the online network's own predictions and overwrite only the
    # entry of the action that was taken in each sampled transition.
    q_values = model.predict(states, verbose=0)
    q_values[np.arange(batch_size), actions] = targets
    model.fit(states, q_values, batch_size=batch_size, epochs=1, verbose=0)

    if epsilon > epsilon_min:
        # Clamp so the final decay step cannot undershoot the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

def update_target_model():
    """Synchronise the target network with the online network.

    Reads the current weights of ``model`` and loads them into
    ``target_model``.
    """
    online_weights = model.get_weights()
    target_model.set_weights(online_weights)

# Training driver. The "environment" here is synthetic: states are uniform
# random row vectors and rewards are drawn at random from a fixed set, so the
# loop exercises the agent's plumbing rather than a real task.
num_episodes = 1000
max_steps = 200

for episode in range(num_episodes):
    state = np.random.rand(1, state_size)
    for t in range(max_steps):
        action = act(state)
        next_state = np.random.rand(1, state_size)
        reward = np.random.choice([10, -5, 5])
        # An episode ends only on its final step.
        done = t == max_steps - 1
        remember(state, action, reward, next_state, done)
        state = next_state
        if not done:
            continue
        # End of episode: refresh the target network before leaving the loop.
        update_target_model()
        break
    # One replay (training) pass per episode.
    replay()
