Reinforcement learning program that reduces entropy in a closed system


In [None]:
import numpy as np

class GridEnvironment:
    """Square grid of 0/1 cells; an episode is done once every cell is 0.

    An action is a flat cell index that ``step`` maps to ``(row, col)`` with
    ``divmod(action, size)``.
    """

    def __init__(self, size=4):
        """Create a ``size`` x ``size`` grid filled with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Re-randomise the grid and return it flattened to 1-D."""
        self.state = np.random.randint(2, size=(self.size, self.size))
        return self.state.flatten()

    def step(self, action):
        """Clear the cell addressed by ``action``.

        Returns:
            A ``(observation, reward, done)`` tuple: the flattened grid after
            the move, ``1`` if a set cell was cleared or ``-1`` if the cell
            was already 0, and whether the whole grid is now 0.
        """
        row, col = divmod(action, self.size)
        # Decide the reward from the cell value *before* clearing it.  The
        # previous code tested the cell after the write, when it was always
        # 0, so the -1 branch could never be taken and no-ops earned +1.
        if self.state[row, col] == 1:
            self.state[row, col] = 0
            reward = 1
        else:
            reward = -1
        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)

# Smoke-test the environment: build a 4x4 grid, randomise it and show it.
env = GridEnvironment(size=4)
state = env.reset()
env.render()


[[0 1 1 1]
 [0 1 0 1]
 [0 1 0 1]
 [0 0 0 0]]


In [None]:
import tensorflow as tf

class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(16,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)

# Build the agent: 16 inputs (the flattened grid) and 16 actions.
agent = QLearningAgent((16,), 16)


In [None]:
# Training hyper-parameters
episodes = 10
epsilon = 1.0  # Exploration rate
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)




Episode 1/10 - Steps: 47 - Epsilon: 1.00
Episode 2/10 - Steps: 35 - Epsilon: 0.99
Episode 3/10 - Steps: 30 - Epsilon: 0.99
Episode 4/10 - Steps: 34 - Epsilon: 0.99
Episode 5/10 - Steps: 33 - Epsilon: 0.98
Episode 6/10 - Steps: 12 - Epsilon: 0.98
Episode 7/10 - Steps: 25 - Epsilon: 0.97
Episode 8/10 - Steps: 34 - Epsilon: 0.97
Episode 9/10 - Steps: 42 - Epsilon: 0.96
Episode 10/10 - Steps: 46 - Epsilon: 0.96


In [None]:
def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    """Evaluate the greedy policy of ``agent`` on ``env`` and print a summary.

    Args:
        agent: Object whose ``model.predict(batch, verbose=0)`` returns one
            row of Q-values per state in ``batch``.
        env: Environment providing ``reset()``, ``step(action)`` (returning
            ``(next_state, reward, done)``) and ``render()``.
        episodes: Number of evaluation episodes to run.
        max_steps_per_episode: Step budget per episode.  The greedy policy is
            deterministic, so once it picks an action that leaves the state
            unchanged it would repeat that action forever; the budget makes
            the evaluation terminate, and such episodes count as failures.
    """
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            # Predict the best action using the trained model
            action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
            state, _, done = env.step(action)
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            # Reached only when the loop ended without `break`, i.e. the
            # step budget ran out before the episode finished.
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    # Guard the division so `episodes=0` reports 0% instead of raising.
    success_rate = success_count / episodes * 100 if episodes else 0.0
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Evaluate the trained agent with the default evaluation settings.
test_agent(agent=agent, env=env)


KeyboardInterrupt: 

In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Square grid of 0/1 cells; an episode is done once every cell is 0.

    An action is a flat cell index that ``step`` maps to ``(row, col)`` with
    ``divmod(action, size)``.
    """

    def __init__(self, size=4):
        """Create a ``size`` x ``size`` grid filled with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Re-randomise the grid and return it flattened to 1-D."""
        self.state = np.random.randint(2, size=(self.size, self.size))
        return self.state.flatten()

    def step(self, action):
        """Clear the cell addressed by ``action``.

        Returns:
            A ``(observation, reward, done)`` tuple: the flattened grid after
            the move, ``1`` if a set cell was cleared or ``-1`` if the cell
            was already 0, and whether the whole grid is now 0.
        """
        row, col = divmod(action, self.size)
        # The reward must be decided before the cell is cleared; checking
        # afterwards (as before) always saw a 0 and therefore always paid +1.
        if self.state[row, col] == 1:
            self.state[row, col] = 0
            reward = 1
        else:
            reward = -1
        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)

class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(16,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)

def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    """Evaluate the greedy policy of ``agent`` on ``env`` and print a summary.

    Args:
        agent: Object whose ``model.predict(batch, verbose=0)`` returns one
            row of Q-values per state in ``batch``.
        env: Environment providing ``reset()``, ``step(action)`` (returning
            ``(next_state, reward, done)``) and ``render()``.
        episodes: Number of evaluation episodes to run.
        max_steps_per_episode: Step budget per episode.  The greedy policy is
            deterministic, so once it picks an action that leaves the state
            unchanged it would repeat that action forever; the budget makes
            the evaluation terminate, and such episodes count as failures.
    """
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            # Greedy action: index of the largest predicted Q-value.
            action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
            state, _, done = env.step(action)
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            # Reached only when the loop ended without `break`, i.e. the
            # step budget ran out before the episode finished.
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    # Guard the division so `episodes=0` reports 0% instead of raising.
    success_rate = success_count / episodes * 100 if episodes else 0.0
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Initialize environment and agent
env = GridEnvironment()
agent = QLearningAgent(state_shape=(16,), action_size=16)

# Training the agent
episodes = 1000
epsilon = 1.0  # Exploration rate
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Test the trained agent
test_agent(agent, env)


In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Square grid of 0/1 cells; an episode is done once every cell is 0.

    An action is a flat cell index that ``step`` maps to ``(row, col)`` with
    ``divmod(action, size)``.
    """

    def __init__(self, size=2):
        """Create a ``size`` x ``size`` grid filled with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Re-randomise the grid and return it flattened to 1-D."""
        self.state = np.random.randint(2, size=(self.size, self.size))
        return self.state.flatten()

    def step(self, action):
        """Clear the cell addressed by ``action``.

        Returns:
            A ``(observation, reward, done)`` tuple: the flattened grid after
            the move, ``1`` if a set cell was cleared or ``-1`` if the cell
            was already 0, and whether the whole grid is now 0.
        """
        row, col = divmod(action, self.size)
        # Decide the reward from the cell value *before* clearing it.  The
        # previous code tested the cell after the write, when it was always
        # 0, so the -1 branch could never be taken and no-ops earned +1.
        if self.state[row, col] == 1:
            self.state[row, col] = 0
            reward = 1
        else:
            reward = -1
        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)

class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(4,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)

def test_agent(agent, env, episodes=1, max_steps_per_episode=100):
    """Evaluate the greedy policy of ``agent`` on ``env`` and print a summary.

    Args:
        agent: Object whose ``model.predict(batch, verbose=0)`` returns one
            row of Q-values per state in ``batch``.
        env: Environment providing ``reset()``, ``step(action)`` (returning
            ``(next_state, reward, done)``) and ``render()``.
        episodes: Number of evaluation episodes to run.
        max_steps_per_episode: Step budget per episode.  The greedy policy is
            deterministic, so once it picks an action that leaves the state
            unchanged it would repeat that action forever; the budget makes
            the evaluation terminate, and such episodes count as failures.
    """
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            # Greedy action: index of the largest predicted Q-value.
            action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
            state, _, done = env.step(action)
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            # Reached only when the loop ended without `break`, i.e. the
            # step budget ran out before the episode finished.
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    # Guard the division so `episodes=0` reports 0% instead of raising.
    success_rate = success_count / episodes * 100 if episodes else 0.0
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Initialize environment and agent
env = GridEnvironment(size=2)
agent = QLearningAgent(state_shape=(4,), action_size=4)

# Training the agent
episodes = 10
epsilon = 1.0  # Exploration rate
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Test the trained agent
test_agent(agent, env)


Episode 1/10 - Steps: 10 - Epsilon: 1.00
Episode 2/10 - Steps: 7 - Epsilon: 0.99
Episode 3/10 - Steps: 8 - Epsilon: 0.99
Episode 4/10 - Steps: 3 - Epsilon: 0.99
Episode 5/10 - Steps: 8 - Epsilon: 0.98
Episode 6/10 - Steps: 3 - Epsilon: 0.98
Episode 7/10 - Steps: 7 - Epsilon: 0.97
Episode 8/10 - Steps: 3 - Epsilon: 0.97
Episode 9/10 - Steps: 3 - Epsilon: 0.96
Episode 10/10 - Steps: 13 - Epsilon: 0.96


KeyboardInterrupt: 

In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Square grid of 0/1 cells; an episode is done once every cell is 0.

    An action is a flat cell index that ``step`` maps to ``(row, col)`` with
    ``divmod(action, size)``.
    """

    def __init__(self, size=2):
        """Create a ``size`` x ``size`` grid filled with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Re-randomise the grid and return it flattened to 1-D."""
        self.state = np.random.randint(2, size=(self.size, self.size))
        return self.state.flatten()

    def step(self, action):
        """Clear the cell addressed by ``action``.

        Returns:
            A ``(observation, reward, done)`` tuple: the flattened grid after
            the move, ``1`` if a set cell was cleared or ``-1`` if the cell
            was already 0, and whether the whole grid is now 0.
        """
        row, col = divmod(action, self.size)
        # The reward must be decided before the cell is cleared; checking
        # afterwards (as before) always saw a 0 and therefore always paid +1.
        if self.state[row, col] == 1:
            self.state[row, col] = 0
            reward = 1
        else:
            reward = -1
        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)

class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(4,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)

def test_agent(agent, env, episodes=1, max_steps_per_episode=100):
    """Evaluate the greedy policy of ``agent`` on ``env`` and print a summary.

    An exception raised while predicting or stepping is printed and ends the
    current episode; evaluation then continues with the next episode.

    Args:
        agent: Object whose ``model.predict(batch, verbose=0)`` returns one
            row of Q-values per state in ``batch``.
        env: Environment providing ``reset()``, ``step(action)`` (returning
            ``(next_state, reward, done)``) and ``render()``.
        episodes: Number of evaluation episodes to run.
        max_steps_per_episode: Step budget per episode.  The greedy policy is
            deterministic, so once it picks an action that leaves the state
            unchanged it would repeat that action forever; the budget makes
            the evaluation terminate, and such episodes count as failures.
    """
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            # Keep the guarded region to the two calls that can fail.
            try:
                action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
                next_state, reward, done = env.step(action)
            except Exception as ex:
                print(f"An error occurred: {ex}")
                break
            state = next_state
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            # Reached only when the loop ended without `break`, i.e. the
            # step budget ran out before the episode finished.
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    # Guard the division so `episodes=0` reports 0% instead of raising.
    success_rate = success_count / episodes * 100 if episodes else 0.0
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Initialize environment and agent
env = GridEnvironment(size=2)
agent = QLearningAgent(state_shape=(4,), action_size=4)

# Training the agent
episodes = 10
epsilon = 1.0  # Exploration rate
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Test the trained agent
test_agent(agent, env)


Episode 1/10 - Steps: 2 - Epsilon: 1.00
Episode 2/10 - Steps: 10 - Epsilon: 0.99
Episode 3/10 - Steps: 2 - Epsilon: 0.99
Episode 4/10 - Steps: 13 - Epsilon: 0.99
Episode 5/10 - Steps: 5 - Epsilon: 0.98
Episode 6/10 - Steps: 1 - Epsilon: 0.98
Episode 7/10 - Steps: 8 - Epsilon: 0.97
Episode 8/10 - Steps: 5 - Epsilon: 0.97
Episode 9/10 - Steps: 3 - Epsilon: 0.96
Episode 10/10 - Steps: 5 - Epsilon: 0.96
Test Episode 1/1 - Steps: 1
[[0 0]
 [0 0]]
Success rate: 100.00%
Average steps per successful episode: 1.0


In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Square grid of 0/1 cells; an episode is done once every cell is 0.

    An action is a flat cell index that ``step`` maps to ``(row, col)`` with
    ``divmod(action, size)``.
    """

    def __init__(self, size=2):
        """Create a ``size`` x ``size`` grid filled with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Re-randomise the grid and return it flattened to 1-D."""
        self.state = np.random.randint(2, size=(self.size, self.size))
        return self.state.flatten()

    def step(self, action):
        """Clear the cell addressed by ``action``.

        Returns:
            A ``(observation, reward, done)`` tuple: the flattened grid after
            the move, ``1`` if a set cell was cleared or ``-1`` if the cell
            was already 0, and whether the whole grid is now 0.
        """
        row, col = divmod(action, self.size)
        # Decide the reward from the cell value *before* clearing it.  The
        # previous code tested the cell after the write, when it was always
        # 0, so the -1 branch could never be taken and no-ops earned +1.
        if self.state[row, col] == 1:
            self.state[row, col] = 0
            reward = 1
        else:
            reward = -1
        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)

class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(4,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)

def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    """Evaluate the greedy policy step by step, printing a trace and a summary.

    Args:
        agent: Object whose ``model.predict(batch, verbose=0)`` returns one
            row of Q-values per state in ``batch``.
        env: Environment providing ``size``, ``reset()``, ``step(action)``
            (returning ``(next_state, reward, done)``) and ``render()``.
        episodes: Number of evaluation episodes to run.
        max_steps_per_episode: Step budget per episode.  The greedy policy is
            deterministic, so once it picks an action that leaves the state
            unchanged it would repeat that action forever; the budget makes
            the evaluation terminate, and such episodes count as failures.
    """
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            print(f"Test Episode {e+1}/{episodes} - Step {steps}")
            print(f"Current State:\n{state.reshape(env.size, env.size)}")
            # Greedy action: index of the largest predicted Q-value.
            action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
            print(f"Chosen Action: {action}")
            next_state, reward, done = env.step(action)
            print(f"Next State:\n{next_state.reshape(env.size, env.size)}")
            print(f"Reward: {reward}, Done: {done}")
            state = next_state
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            # Reached only when the loop ended without `break`, i.e. the
            # step budget ran out before the episode finished.
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    # Guard the division so `episodes=0` reports 0% instead of raising.
    success_rate = success_count / episodes * 100 if episodes else 0.0
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Initialize environment and agent
env = GridEnvironment(size=2)
agent = QLearningAgent(state_shape=(4,), action_size=4)

# Training the agent
episodes = 10
epsilon = 1.0  # Exploration rate
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Test the trained agent
test_agent(agent, env, episodes=2)


Episode 1/10 - Steps: 3 - Epsilon: 1.00
Episode 2/10 - Steps: 3 - Epsilon: 0.99
Episode 3/10 - Steps: 15 - Epsilon: 0.99
Episode 4/10 - Steps: 7 - Epsilon: 0.99
Episode 5/10 - Steps: 17 - Epsilon: 0.98
Episode 6/10 - Steps: 3 - Epsilon: 0.98
Episode 7/10 - Steps: 1 - Epsilon: 0.97
Episode 8/10 - Steps: 6 - Epsilon: 0.97
Episode 9/10 - Steps: 6 - Epsilon: 0.96
Episode 10/10 - Steps: 6 - Epsilon: 0.96
Test Episode 1/2 - Step 0
Current State:
[[1 0]
 [0 0]]
Chosen Action: 0
Next State:
[[0 0]
 [0 0]]
Reward: 1, Done: True
Test Episode 1/2 - Steps: 1
[[0 0]
 [0 0]]
Test Episode 2/2 - Step 0
Current State:
[[0 0]
 [1 0]]
Chosen Action: 0
Next State:
[[0 0]
 [1 0]]
Reward: 1, Done: False
Test Episode 2/2 - Step 1
Current State:
[[0 0]
 [1 0]]
Chosen Action: 0
Next State:
[[0 0]
 [1 0]]
Reward: 1, Done: False
Test Episode 2/2 - Step 2
Current State:
[[0 0]
 [1 0]]
Chosen Action: 0
Next State:
[[0 0]
 [1 0]]
Reward: 1, Done: False
Test Episode 2/2 - Step 3
Current State:
[[0 0]
 [1 0]]
Chosen 

KeyboardInterrupt: 

In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Square grid of 0/1 cells; an episode is done once every cell is 0.

    An action is a flat cell index that ``step`` maps to ``(row, col)`` with
    ``divmod(action, size)``.
    """

    def __init__(self, size=2):
        """Create a ``size`` x ``size`` grid filled with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Re-randomise the grid and return it flattened to 1-D."""
        self.state = np.random.randint(2, size=(self.size, self.size))
        return self.state.flatten()

    def step(self, action):
        """Clear the cell addressed by ``action``.

        Returns:
            A ``(observation, reward, done)`` tuple: the flattened grid after
            the move, ``1`` if a set cell was cleared or ``-1`` if the cell
            was already 0, and whether the whole grid is now 0.
        """
        row, col = divmod(action, self.size)
        # The reward must be decided before the cell is cleared; checking
        # afterwards (as before) always saw a 0 and therefore always paid +1.
        if self.state[row, col] == 1:
            self.state[row, col] = 0
            reward = 1
        else:
            reward = -1
        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)

class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(4,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)

def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    """Trace the greedy policy over several episodes and print a summary.

    Each step prints the state, the chosen action, the resulting state and
    the reward.  An episode stops when ``env.step`` reports ``done`` or after
    ``max_steps_per_episode`` steps, whichever comes first.
    """
    steps_list = []
    success_count = 0

    for e in range(episodes):
        done = False
        steps = 0
        state = env.reset()

        while steps < max_steps_per_episode and not done:
            print(f"Test Episode {e+1}/{episodes} - Step {steps}")
            print(f"Current State:\n{state.reshape(env.size, env.size)}")
            q_row = agent.model.predict(state[np.newaxis], verbose=0)[0]
            action = np.argmax(q_row)
            print(f"Chosen Action: {action}")
            next_state, reward, done = env.step(action)
            print(f"Next State:\n{next_state.reshape(env.size, env.size)}")
            print(f"Reward: {reward}, Done: {done}")
            steps += 1
            state = next_state

            if not done:
                continue

            # Episode finished inside the budget: record it and stop stepping.
            steps_list.append(steps)
            if np.all(state == 0):
                success_count += 1
            print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
            env.render()
            break
        else:
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    success_rate = success_count / episodes * 100
    if steps_list:
        avg_steps = np.mean(steps_list)
    else:
        avg_steps = 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Initialize environment and agent
env = GridEnvironment(size=2)
agent = QLearningAgent(state_shape=(4,), action_size=4)

# Training the agent
episodes = 100
epsilon = 1.0  # Exploration rate
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Test the trained agent with max steps limit
test_agent(agent, env, episodes=2, max_steps_per_episode=100)


Episode 1/100 - Steps: 4 - Epsilon: 1.00
Episode 2/100 - Steps: 3 - Epsilon: 0.99
Episode 3/100 - Steps: 2 - Epsilon: 0.99
Episode 4/100 - Steps: 14 - Epsilon: 0.99
Episode 5/100 - Steps: 7 - Epsilon: 0.98
Episode 6/100 - Steps: 5 - Epsilon: 0.98
Episode 7/100 - Steps: 6 - Epsilon: 0.97
Episode 8/100 - Steps: 3 - Epsilon: 0.97
Episode 9/100 - Steps: 18 - Epsilon: 0.96
Episode 10/100 - Steps: 6 - Epsilon: 0.96
Episode 11/100 - Steps: 7 - Epsilon: 0.95
Episode 12/100 - Steps: 2 - Epsilon: 0.95
Episode 13/100 - Steps: 20 - Epsilon: 0.94
Episode 14/100 - Steps: 11 - Epsilon: 0.94
Episode 15/100 - Steps: 9 - Epsilon: 0.93
Episode 16/100 - Steps: 1 - Epsilon: 0.93
Episode 17/100 - Steps: 5 - Epsilon: 0.92
Episode 18/100 - Steps: 3 - Epsilon: 0.92
Episode 19/100 - Steps: 3 - Epsilon: 0.91
Episode 20/100 - Steps: 2 - Epsilon: 0.91
Episode 21/100 - Steps: 5 - Epsilon: 0.90
Episode 22/100 - Steps: 12 - Epsilon: 0.90
Episode 23/100 - Steps: 4 - Epsilon: 0.90
Episode 24/100 - Steps: 1 - Epsilon: 0

In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Square 0/1 grid whose episode is done when every cell equals 0."""

    def __init__(self, size=2):
        """Remember the side length and fill the grid with random 0/1 values."""
        self.size = size
        self.state = np.random.randint(2, size=(size, size))

    def reset(self):
        """Draw a fresh random grid and return it as a flat vector."""
        shape = (self.size, self.size)
        self.state = np.random.randint(2, size=shape)
        return self.state.flatten()

    def step(self, action):
        """Clear the addressed cell; reward 1 if it was set, otherwise 0."""
        row, col = divmod(action, self.size)
        if self.state[row, col] == 1:
            # A set cell was hit: clear it and pay the reward.
            self.state[row, col] = 0
            reward = 1
        else:
            reward = 0
        observation = self.state.flatten()
        return observation, reward, np.all(self.state == 0)

    def render(self):
        """Print the grid as a 2-D array."""
        print(self.state)


In [None]:
class QLearningAgent:
    """Online Q-learning agent backed by a small dense Keras network.

    The network maps a flat state vector to one Q-value per action and is
    fitted on a single transition after every environment step.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_shape: Shape tuple of one state vector, e.g. ``(4,)``.
            action_size: Number of discrete actions (network outputs).
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount applied to the bootstrapped next-state value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return a compiled MLP: two 64-unit ReLU layers, linear head, MSE loss."""
        model = tf.keras.Sequential([
            # tf.keras.Input replaces InputLayer(input_shape=...), whose
            # `input_shape` argument is deprecated in Keras 3.
            tf.keras.Input(shape=self.state_shape),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return model

    def _q_values(self, states):
        """Return Q-values for a batch of states as a writable NumPy array.

        The model is called directly instead of through ``predict`` because
        ``predict`` is designed for large inputs and its per-call overhead
        dominates when it runs on one sample per environment step.
        """
        batch = np.asarray(states, dtype=np.float32)
        return np.array(self.model(batch, training=False))

    def act(self, state, epsilon):
        """Pick a uniformly random action with probability ``epsilon``, else the greedy one."""
        if np.random.rand() <= epsilon:
            return np.random.randint(self.action_size)
        return int(np.argmax(self._q_values(state[np.newaxis])[0]))

    def train(self, state, action, reward, next_state, done):
        """Fit the network on one transition using the one-step Q-learning target."""
        # A single forward pass scores both the current and the next state.
        q_both = self._q_values(np.stack([state, next_state]))
        target = reward
        if not done:
            target += self.gamma * np.amax(q_both[1])
        # Only the taken action's output is moved towards the target; the
        # other outputs keep their current predictions.
        target_f = q_both[:1]
        target_f[0][action] = target
        self.model.fit(state[np.newaxis], target_f, epochs=1, verbose=0)


In [None]:
# Build the 2x2 environment and a matching agent (4 inputs, 4 actions).
env = GridEnvironment(size=2)
agent = QLearningAgent(state_shape=(4,), action_size=4)

episodes = 100
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.995

for e in range(episodes):
    state = env.reset()
    done = False
    steps = 0
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    # The inner loop only exits once the episode is done, so report here.
    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    # Decay exploration but clamp at the floor: the former
    # `if epsilon > epsilon_min: epsilon *= epsilon_decay` could end below it.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)


Episode 1/100 - Steps: 8 - Epsilon: 1.00
Episode 2/100 - Steps: 5 - Epsilon: 0.99
Episode 3/100 - Steps: 3 - Epsilon: 0.99
Episode 4/100 - Steps: 16 - Epsilon: 0.99
Episode 5/100 - Steps: 15 - Epsilon: 0.98
Episode 6/100 - Steps: 7 - Epsilon: 0.98
Episode 7/100 - Steps: 6 - Epsilon: 0.97
Episode 8/100 - Steps: 3 - Epsilon: 0.97
Episode 9/100 - Steps: 6 - Epsilon: 0.96
Episode 10/100 - Steps: 8 - Epsilon: 0.96
Episode 11/100 - Steps: 12 - Epsilon: 0.95
Episode 12/100 - Steps: 6 - Epsilon: 0.95
Episode 13/100 - Steps: 7 - Epsilon: 0.94
Episode 14/100 - Steps: 2 - Epsilon: 0.94
Episode 15/100 - Steps: 9 - Epsilon: 0.93
Episode 16/100 - Steps: 8 - Epsilon: 0.93
Episode 17/100 - Steps: 8 - Epsilon: 0.92
Episode 18/100 - Steps: 1 - Epsilon: 0.92
Episode 19/100 - Steps: 7 - Epsilon: 0.91
Episode 20/100 - Steps: 10 - Epsilon: 0.91
Episode 21/100 - Steps: 5 - Epsilon: 0.90
Episode 22/100 - Steps: 4 - Epsilon: 0.90
Episode 23/100 - Steps: 1 - Epsilon: 0.90
Episode 24/100 - Steps: 8 - Epsilon: 0.

In [None]:
def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    """Run the greedy policy for several episodes and print a summary.

    An episode stops when ``env.step`` reports ``done`` or after
    ``max_steps_per_episode`` steps, whichever comes first.  Finished
    episodes contribute their length to the reported average.
    """
    steps_list = []
    success_count = 0

    for e in range(episodes):
        done = False
        steps = 0
        state = env.reset()

        while steps < max_steps_per_episode and not done:
            q_row = agent.model.predict(state[np.newaxis], verbose=0)[0]
            state, _, done = env.step(np.argmax(q_row))
            steps += 1

            if not done:
                continue

            # Episode finished inside the budget: record it and stop stepping.
            steps_list.append(steps)
            if np.all(state == 0):
                success_count += 1
            print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
            env.render()
            break
        else:
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    success_rate = success_count / episodes * 100
    if steps_list:
        avg_steps = np.mean(steps_list)
    else:
        avg_steps = 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Evaluate the trained agent on two episodes, each capped at 100 steps.
test_agent(agent=agent, env=env, episodes=2, max_steps_per_episode=100)


Test Episode 1/2 reached max steps limit
Test Episode 2/2 reached max steps limit
Success rate: 0.00%
Average steps per successful episode: N/A


In [None]:
class QLearningAgent:
    """Q-learning agent whose action values come from a small dense network.

    The network maps a state vector of shape ``state_shape`` to one value per
    action and is updated online, one transition at a time.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the value network.

        Args:
            state_shape: Shape of a single state vector, e.g. ``(4,)``.
            action_size: Number of discrete actions.
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount factor applied to the next state's best value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return the compiled network: 128-128-64 ReLU layers, linear output."""
        layers = tf.keras.layers
        hidden = [layers.Dense(units, activation='relu') for units in (128, 128, 64)]
        network = tf.keras.Sequential([
            layers.InputLayer(input_shape=self.state_shape),
            *hidden,
            layers.Dense(self.action_size, activation='linear'),
        ])
        network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return network

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily.

        A uniform random action is returned when a fresh random draw is at
        most ``epsilon``; otherwise the action with the highest predicted
        value for ``state`` is returned.
        """
        explore = np.random.rand() <= epsilon
        if explore:
            return np.random.randint(self.action_size)
        batch = state[np.newaxis]
        predicted = self.model.predict(batch, verbose=0)
        return np.argmax(predicted[0])

    def train(self, state, action, reward, next_state, done):
        """Fit the network on a single transition.

        The regression target for ``action`` is ``reward`` for a terminal
        transition, otherwise ``reward`` plus ``gamma`` times the best
        predicted value of ``next_state``. The targets of all other actions
        are the network's current predictions, so only ``action`` is pulled
        toward a new value.
        """
        if done:
            target = reward
        else:
            future = self.model.predict(next_state[np.newaxis], verbose=0)[0]
            target = reward + self.gamma * np.amax(future)
        batch = state[np.newaxis]
        q_values = self.model.predict(batch, verbose=0)
        q_values[0][action] = target
        self.model.fit(batch, q_values, epochs=1, verbose=0)


In [None]:
# Train a fresh agent online on the 2x2 grid (4 cells, one action per cell).
env = GridEnvironment(size=2)
agent = QLearningAgent(state_shape=(4,), action_size=4)

episodes = 1000
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.99

for e in range(episodes):
    state, done, steps = env.reset(), False, 0

    # An episode only ends once the environment reports the grid as cleared.
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1

    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")

    # Shrink exploration after every episode until it drops to the floor.
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay


Episode 1/1000 - Steps: 10 - Epsilon: 1.00
Episode 2/1000 - Steps: 4 - Epsilon: 0.99
Episode 3/1000 - Steps: 14 - Epsilon: 0.98
Episode 4/1000 - Steps: 11 - Epsilon: 0.97
Episode 5/1000 - Steps: 7 - Epsilon: 0.96
Episode 6/1000 - Steps: 4 - Epsilon: 0.95
Episode 7/1000 - Steps: 10 - Epsilon: 0.94
Episode 8/1000 - Steps: 11 - Epsilon: 0.93
Episode 9/1000 - Steps: 6 - Epsilon: 0.92
Episode 10/1000 - Steps: 3 - Epsilon: 0.91
Episode 11/1000 - Steps: 4 - Epsilon: 0.90
Episode 12/1000 - Steps: 5 - Epsilon: 0.90
Episode 13/1000 - Steps: 16 - Epsilon: 0.89
Episode 14/1000 - Steps: 2 - Epsilon: 0.88
Episode 15/1000 - Steps: 5 - Epsilon: 0.87
Episode 16/1000 - Steps: 8 - Epsilon: 0.86
Episode 17/1000 - Steps: 8 - Epsilon: 0.85
Episode 18/1000 - Steps: 5 - Epsilon: 0.84
Episode 19/1000 - Steps: 4 - Epsilon: 0.83
Episode 20/1000 - Steps: 11 - Epsilon: 0.83
Episode 21/1000 - Steps: 5 - Epsilon: 0.82
Episode 22/1000 - Steps: 8 - Epsilon: 0.81
Episode 23/1000 - Steps: 7 - Epsilon: 0.80
Episode 24/10

In [None]:
def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
            next_state, reward, done = env.step(action)
            state = next_state
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    success_rate = success_count / episodes * 100
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Evaluate the agent's greedy policy for 10 episodes of at most 100 steps each.
test_agent(agent, env, episodes=10, max_steps_per_episode=100)


Test Episode 1/10 - Steps: 4
[[0 0]
 [0 0]]
Test Episode 2/10 - Steps: 3
[[0 0]
 [0 0]]
Test Episode 3/10 - Steps: 3
[[0 0]
 [0 0]]
Test Episode 4/10 - Steps: 2
[[0 0]
 [0 0]]
Test Episode 5/10 - Steps: 3
[[0 0]
 [0 0]]
Test Episode 6/10 - Steps: 2
[[0 0]
 [0 0]]
Test Episode 7/10 - Steps: 3
[[0 0]
 [0 0]]
Test Episode 8/10 - Steps: 1
[[0 0]
 [0 0]]
Test Episode 9/10 - Steps: 1
[[0 0]
 [0 0]]
Test Episode 10/10 - Steps: 3
[[0 0]
 [0 0]]
Success rate: 100.00%
Average steps per successful episode: 2.5


https://www.tensorflow.org/quantum/tutorials/quantum_reinforcement_learning

https://arxiv.org/abs/2211.03464

In [None]:
import numpy as np
import tensorflow as tf

class GridEnvironment:
    """Three-dimensional binary grid that an agent clears cell by cell.

    An action is a flat index into the grid. The episode is finished once
    every cell holds 0.
    """

    def __init__(self, shape=(3, 3, 6)):
        """Create a grid of the given 3-D ``shape`` filled with random 0/1."""
        self.shape = shape
        self.state = np.random.randint(2, size=shape)

    def reset(self):
        """Re-randomise the grid and return it as a flat vector."""
        self.state = np.random.randint(2, size=self.shape)
        return self.state.flatten()

    def step(self, action):
        """Act on the cell at flat index ``action``.

        A cell holding 1 is cleared for a reward of 10; any other cell is
        left alone and costs -1. When the cell lies on a boundary of an
        axis, its resulting value is also written to the cell on the
        opposite boundary of that axis.

        Returns:
            ``(flat_state, reward, done)``.
        """
        depth, row, col = np.unravel_index(action, self.shape)
        cell = (depth, row, col)

        if self.state[cell] == 1:
            self.state[cell] = 0
            reward = 10
        else:
            reward = -1

        # Handle toroidal boundary conditions: collect, axis by axis, the
        # opposite-edge index to mirror into (first edge before last edge).
        value = self.state[cell]
        mirrors = []
        for axis, (index, length) in enumerate(zip(cell, self.shape)):
            if index == 0:
                mirrors.append((axis, -1))
            if index == length - 1:
                mirrors.append((axis, 0))

        for axis, wrapped in mirrors:
            target = list(cell)
            target[axis] = wrapped
            self.state[tuple(target)] = value

        done = np.all(self.state == 0)
        return self.state.flatten(), reward, done

    def render(self):
        """Print the grid."""
        print(self.state)

# Testing the environment
# Build the default (3, 3, 6) grid, randomise it and print it.
env = GridEnvironment()
state = env.reset()
env.render()
# Act on flat index 0, then print the grid again followed by the step results.
next_state, reward, done = env.step(0)
env.render()
print(next_state, reward, done)


[[[0 0 1 0 0 0]
  [0 1 1 1 1 1]
  [1 1 0 0 0 0]]

 [[0 0 1 1 1 0]
  [1 0 0 0 1 1]
  [0 0 0 1 0 0]]

 [[1 0 0 1 1 1]
  [1 1 1 0 1 0]
  [1 0 1 1 0 1]]]
[[[0 0 1 0 0 0]
  [0 1 1 1 1 1]
  [0 1 0 0 0 0]]

 [[0 0 1 1 1 0]
  [1 0 0 0 1 1]
  [0 0 0 1 0 0]]

 [[0 0 0 1 1 1]
  [1 1 1 0 1 0]
  [1 0 1 1 0 1]]]
[0 0 1 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 0
 0 0 1 1 1 1 1 1 0 1 0 1 0 1 1 0 1] -1 False


In [None]:
class QLearningAgent:
    """Q-learning agent whose action values come from a small dense network.

    The network maps a state vector of shape ``state_shape`` to one value per
    action and is updated online, one transition at a time.
    """

    def __init__(self, state_shape, action_size, learning_rate=0.001, gamma=0.99):
        """Store the hyper-parameters and build the value network.

        Args:
            state_shape: Shape of a single state vector, e.g. ``(54,)``.
            action_size: Number of discrete actions.
            learning_rate: Step size handed to the Adam optimizer.
            gamma: Discount factor applied to the next state's best value.
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.gamma = gamma
        self.model = self.build_model(learning_rate)

    def build_model(self, learning_rate):
        """Return the compiled network: 128-128-64 ReLU layers, linear output."""
        layers = tf.keras.layers
        hidden = [layers.Dense(units, activation='relu') for units in (128, 128, 64)]
        network = tf.keras.Sequential([
            layers.InputLayer(input_shape=self.state_shape),
            *hidden,
            layers.Dense(self.action_size, activation='linear'),
        ])
        network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')
        return network

    def act(self, state, epsilon):
        """Pick an action epsilon-greedily.

        A uniform random action is returned when a fresh random draw is at
        most ``epsilon``; otherwise the action with the highest predicted
        value for ``state`` is returned.
        """
        explore = np.random.rand() <= epsilon
        if explore:
            return np.random.randint(self.action_size)
        batch = state[np.newaxis]
        predicted = self.model.predict(batch, verbose=0)
        return np.argmax(predicted[0])

    def train(self, state, action, reward, next_state, done):
        """Fit the network on a single transition.

        The regression target for ``action`` is ``reward`` for a terminal
        transition, otherwise ``reward`` plus ``gamma`` times the best
        predicted value of ``next_state``. The targets of all other actions
        are the network's current predictions, so only ``action`` is pulled
        toward a new value.
        """
        if done:
            target = reward
        else:
            future = self.model.predict(next_state[np.newaxis], verbose=0)[0]
            target = reward + self.gamma * np.amax(future)
        batch = state[np.newaxis]
        q_values = self.model.predict(batch, verbose=0)
        q_values[0][action] = target
        self.model.fit(batch, q_values, epochs=1, verbose=0)


In [None]:
# Train a fresh agent online on the 3x3x6 grid (54 cells, one action per cell).
env = GridEnvironment(shape=(3, 3, 6))
agent = QLearningAgent(state_shape=(54,), action_size=54)

episodes = 10
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.99

for e in range(episodes):
    state, done, steps = env.reset(), False, 0

    # An episode only ends once the environment reports the grid as cleared.
    while not done:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1

    print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")

    # Shrink exploration after every episode until it drops to the floor.
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay


Episode 1/10 - Steps: 113 - Epsilon: 1.00
Episode 2/10 - Steps: 65 - Epsilon: 0.99
Episode 3/10 - Steps: 195 - Epsilon: 0.98
Episode 4/10 - Steps: 43 - Epsilon: 0.97
Episode 5/10 - Steps: 63 - Epsilon: 0.96
Episode 6/10 - Steps: 233 - Epsilon: 0.95
Episode 7/10 - Steps: 51 - Epsilon: 0.94
Episode 8/10 - Steps: 130 - Epsilon: 0.93
Episode 9/10 - Steps: 103 - Epsilon: 0.92
Episode 10/10 - Steps: 99 - Epsilon: 0.91


In [None]:
def test_agent(agent, env, episodes=10, max_steps_per_episode=100):
    success_count = 0
    steps_list = []

    for e in range(episodes):
        state = env.reset()
        done = False
        steps = 0

        while not done and steps < max_steps_per_episode:
            action = np.argmax(agent.model.predict(state[np.newaxis], verbose=0)[0])
            next_state, reward, done = env.step(action)
            state = next_state
            steps += 1

            if done:
                steps_list.append(steps)
                if np.all(state == 0):
                    success_count += 1
                print(f"Test Episode {e+1}/{episodes} - Steps: {steps}")
                env.render()
                break
        else:
            print(f"Test Episode {e+1}/{episodes} reached max steps limit")

    success_rate = success_count / episodes * 100
    avg_steps = np.mean(steps_list) if steps_list else 'N/A'
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average steps per successful episode: {avg_steps}")

# Evaluate the agent's greedy policy for 10 episodes of at most 100 steps each.
test_agent(agent, env, episodes=10, max_steps_per_episode=100)


Test Episode 1/10 reached max steps limit
Test Episode 2/10 reached max steps limit
Test Episode 3/10 reached max steps limit
Test Episode 4/10 reached max steps limit
Test Episode 5/10 reached max steps limit
Test Episode 6/10 reached max steps limit
Test Episode 7/10 reached max steps limit
Test Episode 8/10 reached max steps limit
Test Episode 9/10 reached max steps limit
Test Episode 10/10 reached max steps limit
Success rate: 0.00%
Average steps per successful episode: N/A


In [None]:
import numpy as np
import tensorflow as tf

class RubiksCubeEnvironment:
    """3x3x3 Rubik's cube with the twelve quarter-turn face moves as actions.

    ``state`` has shape ``(3, 3, 6)``; ``state[:, :, i]`` is the 3x3 grid of
    colour letters on face ``i``. Faces are stored in the order
    U, F, L, R, D, B, so in the solved cube face ``i`` shows ``colors[i]``.
    Row 0 of the four side faces is the row next to the U face.

    Each move is a permutation of sticker positions derived once from the
    cube's geometry. Turning a face moves the 9 stickers on it and the 12
    stickers of the adjacent layer, so a move always preserves the shape of
    ``state``.

    Observations returned by ``reset`` and ``step`` are the flattened array
    of colour letters, not numbers.
    """

    # Move letter and outward unit normal (x, y, z) of every face, in the
    # order the faces are stored along the last axis of ``state``.
    _FACES = (
        ('U', (0, 1, 0)),
        ('F', (0, 0, 1)),
        ('L', (-1, 0, 0)),
        ('R', (1, 0, 0)),
        ('D', (0, -1, 0)),
        ('B', (0, 0, -1)),
    )

    def __init__(self):
        self.size = (3, 3, 6)
        self.colors = ['y', 'r', 'b', 'g', 'w', 'o']
        self.state = self.initialize_cube()
        self.valid_rotations = ['U', 'U\'', 'D', 'D\'', 'L', 'L\'', 'R', 'R\'', 'F', 'F\'', 'B', 'B\'']
        # Move name -> (source indices, target indices) into ``state``.
        self._moves = self._build_move_tables()

    @staticmethod
    def _sticker_index(position, normal):
        """Return (row, col) on its face for the sticker at ``position``.

        ``position`` holds the cubie coordinates in {-1, 0, 1} and ``normal``
        is the outward direction the sticker faces. Each face is read as seen
        from outside the cube, with U above the side faces.
        """
        x, y, z = position
        if normal == (0, 1, 0):       # U: back row first, left to right
            return z + 1, x + 1
        if normal == (0, -1, 0):      # D: front row first, left to right
            return 1 - z, x + 1
        if normal == (0, 0, 1):       # F
            return 1 - y, x + 1
        if normal == (0, 0, -1):      # B
            return 1 - y, 1 - x
        if normal == (1, 0, 0):       # R
            return 1 - y, 1 - z
        return 1 - y, z + 1           # L

    @staticmethod
    def _quarter_turn(axis, vector):
        """Rotate ``vector`` by -90 degrees about the unit vector ``axis``.

        Seen from outside the cube along ``axis`` this is a clockwise turn.
        """
        axis = np.array(axis)
        turned = axis * int(np.dot(axis, vector)) - np.cross(axis, vector)
        return tuple(int(c) for c in turned)

    @classmethod
    def _build_move_tables(cls):
        """Build the index arrays that implement every move in ``state``."""
        face_of = {normal: idx for idx, (_, normal) in enumerate(cls._FACES)}

        def locate(position, normal):
            row, col = cls._sticker_index(position, normal)
            return row, col, face_of[normal]

        # Every sticker is a (cubie position, outward normal) pair; a cubie
        # carries a sticker for a face exactly when it lies on that face.
        stickers = []
        for normal in face_of:
            for cell in np.ndindex(3, 3, 3):
                position = tuple(c - 1 for c in cell)
                if int(np.dot(position, normal)) == 1:
                    stickers.append((position, normal))

        tables = {}
        for letter, axis in cls._FACES:
            sources, targets = [], []
            for position, normal in stickers:
                if int(np.dot(position, axis)) != 1:
                    continue  # sticker is not in the layer being turned
                sources.append(locate(position, normal))
                targets.append(locate(cls._quarter_turn(axis, position),
                                      cls._quarter_turn(axis, normal)))
            src = tuple(np.array(part) for part in zip(*sources))
            dst = tuple(np.array(part) for part in zip(*targets))
            tables[letter] = (src, dst)
            # The counter-clockwise turn is the inverse permutation.
            tables[letter + "'"] = (dst, src)
        return tables

    @staticmethod
    def _outer_layer_move(index, first, last):
        """Map layer index 0 / 2 to the move name ``first`` / ``last``."""
        if index == 0:
            return first
        if index == 2:
            return last
        raise ValueError(f"Only outer layers 0 and 2 can be turned, got {index!r}")

    def initialize_cube(self):
        """Return a solved cube: face ``i`` filled with ``colors[i]``."""
        state = np.empty(self.size, dtype=str)
        for i, color in enumerate(self.colors):
            state[:, :, i] = color
        return state

    def reset(self):
        """Restore the solved cube, scramble it and return the flat state."""
        self.state = self.initialize_cube()
        self.scramble_cube()
        return self.state.flatten()

    def scramble_cube(self, steps=20):
        """Apply ``steps`` moves drawn uniformly from ``valid_rotations``."""
        for _ in range(steps):
            rotation = np.random.choice(self.valid_rotations)
            self.apply_rotation(rotation)

    def apply_rotation(self, rotation):
        """Apply the move named ``rotation`` (e.g. ``'U'`` or ``"U'"``).

        Raises:
            ValueError: If ``rotation`` is not one of the twelve known moves.
        """
        try:
            source, target = self._moves[str(rotation)]
        except KeyError:
            raise ValueError(f"Unknown rotation: {rotation!r}") from None
        # Integer-array indexing on the right-hand side copies the stickers
        # before any of them is overwritten.
        self.state[target] = self.state[source]

    def rotate_face_clockwise(self, layer):
        """Turn U (``layer`` 0) or D (``layer`` 2) clockwise."""
        self.apply_rotation(self._outer_layer_move(layer, 'U', 'D'))

    def rotate_face_counterclockwise(self, layer):
        """Turn U (``layer`` 0) or D (``layer`` 2) counter-clockwise."""
        self.apply_rotation(self._outer_layer_move(layer, "U'", "D'"))

    def rotate_side_clockwise(self, col):
        """Turn L (``col`` 0) or R (``col`` 2) clockwise."""
        self.apply_rotation(self._outer_layer_move(col, 'L', 'R'))

    def rotate_side_counterclockwise(self, col):
        """Turn L (``col`` 0) or R (``col`` 2) counter-clockwise."""
        self.apply_rotation(self._outer_layer_move(col, "L'", "R'"))

    def rotate_front_clockwise(self):
        """Turn the F face clockwise."""
        self.apply_rotation('F')

    def rotate_front_counterclockwise(self):
        """Turn the F face counter-clockwise."""
        self.apply_rotation("F'")

    def rotate_back_clockwise(self):
        """Turn the B face clockwise."""
        self.apply_rotation('B')

    def rotate_back_counterclockwise(self):
        """Turn the B face counter-clockwise."""
        self.apply_rotation("B'")

    def step(self, action):
        """Apply ``valid_rotations[action]``.

        Returns:
            ``(flat_state, reward, done)`` where ``done`` is True once every
            face shows a single colour.
        """
        prev_state = self.state.copy()
        self.apply_rotation(self.valid_rotations[action])
        reward = self.calculate_reward(prev_state, self.state)
        done = self.check_if_solved()
        return self.state.flatten(), reward, done

    @staticmethod
    def _uniformity(state):
        """Count, over all faces, the stickers equal to their face's [0, 0]."""
        return np.sum(state == state[0:1, 0:1, :])

    def calculate_reward(self, prev_state, current_state):
        """Return 100 for a solved cube, else the uniformity change minus 1."""
        # Check if the cube is solved
        if self.check_if_solved():
            return 100  # High positive reward for solving the cube

        # Incremental reward based on face uniformity, with a small penalty
        # for each move to encourage fewer moves.
        return self._uniformity(current_state) - self._uniformity(prev_state) - 1

    def check_if_solved(self):
        """Return True when every face holds a single colour."""
        return bool(np.all(self.state == self.state[0:1, 0:1, :]))

    def render(self):
        """Print each face's 3x3 grid under its solved-state colour letter."""
        for i, color in enumerate(self.colors):
            print(f'Face {color}:')
            print(self.state[:, :, i])

# Testing the environment
# reset() rebuilds the solved cube and scrambles it; render() prints each face.
env = RubiksCubeEnvironment()
state = env.reset()
env.render()


ValueError: could not broadcast input array from shape (6,3) into shape (3,6)

In [None]:
# Train a fresh agent online on the Rubik's cube (54 stickers, 12 moves).
env = RubiksCubeEnvironment()
agent = QLearningAgent(state_shape=(54,), action_size=12)

episodes = 1000
# Step budget per episode: a mostly random policy almost never solves a
# scrambled cube, so without a cap a single episode may never finish.
max_steps_per_episode = 200
epsilon = 1.0  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.99

# The environment reports stickers as colour letters; the network needs numbers.
color_codes = {color: code for code, color in enumerate(env.colors)}


def encode_state(flat_state):
    """Map a flat array of colour letters to a float32 vector of colour indices."""
    return np.array([color_codes[color] for color in flat_state], dtype=np.float32)


for e in range(episodes):
    state = encode_state(env.reset())
    done = False
    steps = 0
    while not done and steps < max_steps_per_episode:
        action = agent.act(state, epsilon)
        next_state, reward, done = env.step(action)
        next_state = encode_state(next_state)
        agent.train(state, action, reward, next_state, done)
        state = next_state
        steps += 1
    if done:
        print(f"Episode {e+1}/{episodes} - Steps: {steps} - Epsilon: {epsilon:.2f}")
    else:
        print(f"Episode {e+1}/{episodes} - Step limit ({max_steps_per_episode}) reached - Epsilon: {epsilon:.2f}")
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
