In [1]:
import os
print(os.getcwd())
import sys
sys.path.append('C:\\Users\\theal\\PycharmProjects')
sys.path.append('C:\\Users\\theal\\PycharmProjects\\stockgym')

C:\Users\theal\PycharmProjects\stockgym\src\agents


In [2]:
import numpy as np
import gym
import random
import time
from stockgym.src.envs.gym_up_and_to_the_right.up_and_to_the_right_env import UpAndToTheRightEnv

In [3]:
import numpy as np
import gym

class RandomActionAgent:
    def __init__(self, env):
        self.env = env

    def get_action(self):
        valid_actions = self.env.controller.get_valid_actions()
        action = np.random.choice(valid_actions)
        return action

    def run_episode(self):
        state = self.env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.get_action()
            next_state, reward, done, truncated, info = self.env.step(action)
            total_reward += reward
            state = next_state

        return total_reward

# Create the environment
env = UpAndToTheRightEnv()

# Create and run the agent
agent = RandomActionAgent(env)

for i in range(10):
    total_reward = agent.run_episode()
    print(f"Total reward from the episode {i + 1}: {total_reward}")

Total reward from the episode 1: -13.869236094534076
Total reward from the episode 2: 24.24800247489269
Total reward from the episode 3: 33.58330584728564
Total reward from the episode 4: 24.78078373091788
Total reward from the episode 5: -12.555015803396582
Total reward from the episode 6: -46.66014719738796
Total reward from the episode 7: 31.64858940591711
Total reward from the episode 8: 56.60386855002021
Total reward from the episode 9: -36.550058308394625
Total reward from the episode 10: 2.0498161724264823


In [4]:
class QLearningAgent:
    def __init__(self, env, learning_rate=.1, discount_factor=0.999, exploration_rate=1, exploration_decay=.99):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = {}  # Initialize an empty Q-table

    def get_action(self, state):
        state_key = self.state_to_key(state)
        valid_actions = self.env.controller.get_valid_actions()
        # print(state)
        self.initialize_state_in_q_table(state_key)

        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(valid_actions)  # Explore
        else:
            # Exploit by selecting the best valid action based on Q-values
            q_values = np.array([self.q_table[state_key][a] for a in range(self.env.action_space.n)])
            # Mask q_values of invalid actions with a very low number
            mask = np.ones(len(q_values)) * -np.inf
            mask[valid_actions] = 0
            q_values_masked = q_values + mask
            return np.argmax(q_values_masked)  # Exploit

    def update_q_table(self, state, action, reward, next_state):
        state_key = self.state_to_key(state)
        next_state_key = self.state_to_key(next_state)

        # Ensure both the current and next states are in the Q-table
        self.initialize_state_in_q_table(state_key)
        self.initialize_state_in_q_table(next_state_key)

        # Find the best next action from Q-table
        best_next_action = np.argmax(self.q_table[next_state_key])
        td_target = reward + self.discount_factor * self.q_table[next_state_key][best_next_action]
        td_error = td_target - self.q_table[state_key][action]
        self.q_table[state_key][action] += self.learning_rate * td_error


    def initialize_state_in_q_table(self, state_key):
        if state_key not in self.q_table:
            self.q_table[state_key] = [0 for _ in range(self.env.action_space.n)]


    def state_to_key(self, state):
        return tuple(state)


    def run_episode(self):
        state, info = self.env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.get_action(state)
            next_state, reward, done, truncated, info = self.env.step(action)
            self.update_q_table(state, action, reward, next_state)

            total_reward += reward
            state = next_state

        self.exploration_rate *= self.exploration_decay
        return total_reward
    
    def run_episode_w_render(self):
        state, info = self.env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.get_action(state)
            next_state, reward, done, truncated, info = self.env.step(action)
            self.update_q_table(state, action, reward, next_state)

            total_reward += reward
            state = next_state
            # self.env.render()

        self.exploration_rate *= self.exploration_decay
        return total_reward

In [5]:
env = UpAndToTheRightEnv()

agent = QLearningAgent(env)

In [6]:
for i in range(1000):
    total_reward = agent.run_episode()
    print(f"Total reward from the episode {i + 1}: {total_reward}")

Total reward from the episode 1: -9.136039838553891
Total reward from the episode 2: 20.722919425167163
Total reward from the episode 3: 45.58780987437248
Total reward from the episode 4: -90.311153497487
Total reward from the episode 5: -38.614197477193564
Total reward from the episode 6: 35.255995283410925
Total reward from the episode 7: 78.5458186027584
Total reward from the episode 8: 1.961329816167933
Total reward from the episode 9: -19.63115875753332
Total reward from the episode 10: 41.33706055913137
Total reward from the episode 11: -10.837814220474058
Total reward from the episode 12: 55.258137542608054
Total reward from the episode 13: -53.4354849900535
Total reward from the episode 14: -13.968620638661237
Total reward from the episode 15: -13.591322531747302
Total reward from the episode 16: -51.41340159178368
Total reward from the episode 17: -4.097356035398677
Total reward from the episode 18: 24.273990769673915
Total reward from the episode 19: -43.85820260713777
Total 

Total reward from the episode 218: 1374.1588943907784
Total reward from the episode 219: 582.3238927129979
Total reward from the episode 220: 870.7077481081014
Total reward from the episode 221: 843.6321672795468
Total reward from the episode 222: 2565.2215268789973
Total reward from the episode 223: 2017.8371929172697
Total reward from the episode 224: 678.0091058532246
Total reward from the episode 225: 1221.139388703362
Total reward from the episode 226: 654.008071619077
Total reward from the episode 227: 1488.5259171459008
Total reward from the episode 228: 360.25011874212896
Total reward from the episode 229: 704.0531836232182
Total reward from the episode 230: 1116.9503400554795
Total reward from the episode 231: 683.6571209738904
Total reward from the episode 232: 0.7700671925269411
Total reward from the episode 233: 138.78138794908284
Total reward from the episode 234: 590.1388912457486
Total reward from the episode 235: 393.1310420336112
Total reward from the episode 236: 621.

Total reward from the episode 419: 1483.1166427186802
Total reward from the episode 420: 3650.4379705157125
Total reward from the episode 421: 2795.3739047798035
Total reward from the episode 422: 2118.86627061403
Total reward from the episode 423: 3592.9217309193527
Total reward from the episode 424: 3071.64000985996
Total reward from the episode 425: 3649.187556182163
Total reward from the episode 426: 3602.7333907326356
Total reward from the episode 427: 3679.5640727582054
Total reward from the episode 428: 3626.4515469826415
Total reward from the episode 429: 1889.696564205883
Total reward from the episode 430: 3652.2969176242786
Total reward from the episode 431: 2646.63106414489
Total reward from the episode 432: 3607.7817184193545
Total reward from the episode 433: 3660.069658062158
Total reward from the episode 434: 3638.793979900142
Total reward from the episode 435: 3661.2285791605373
Total reward from the episode 436: 2837.9950061954924
Total reward from the episode 437: 142

Total reward from the episode 616: 3663.79793621585
Total reward from the episode 617: 3666.0053327767164
Total reward from the episode 618: 3619.9629454402593
Total reward from the episode 619: 3623.7227832943217
Total reward from the episode 620: 3680.9927185347324
Total reward from the episode 621: 3634.0302023277136
Total reward from the episode 622: 3665.8854810850385
Total reward from the episode 623: 3653.00713961829
Total reward from the episode 624: 3681.6418460315144
Total reward from the episode 625: 3632.7229092645666
Total reward from the episode 626: 3629.090135800219
Total reward from the episode 627: 3655.259518606021
Total reward from the episode 628: 3682.4379720866064
Total reward from the episode 629: 3643.9468031127767
Total reward from the episode 630: 3673.5565669794737
Total reward from the episode 631: 3630.3205112180276
Total reward from the episode 632: 3640.059232225811
Total reward from the episode 633: 3655.2507113616944
Total reward from the episode 634: 

Total reward from the episode 813: 3642.9141452784747
Total reward from the episode 814: 3628.78221567301
Total reward from the episode 815: 3645.11423862272
Total reward from the episode 816: 3676.0528889738152
Total reward from the episode 817: 3640.30868802022
Total reward from the episode 818: 3672.0881566739367
Total reward from the episode 819: 3674.44044530288
Total reward from the episode 820: 3613.4425244895224
Total reward from the episode 821: 3675.687614877761
Total reward from the episode 822: 3665.4122584826187
Total reward from the episode 823: 3645.9228505999754
Total reward from the episode 824: 3669.3829165065176
Total reward from the episode 825: 3655.3102017009496
Total reward from the episode 826: 3700.9586669774617
Total reward from the episode 827: 3644.086914578001
Total reward from the episode 828: 3648.720018330116
Total reward from the episode 829: 3689.0048217151675
Total reward from the episode 830: 3634.6015027615354
Total reward from the episode 831: 3663

Total reward from the episode 998: 3662.2649407227486
Total reward from the episode 999: 3640.4581224891353
Total reward from the episode 1000: 3658.7335989778067


In [None]:
env_render = UpAndToTheRightEnv(render_mode='human')
agent_render = QLearningAgent(env_render)


In [14]:
for i in range(1):
    total_reward = agent_render.run_episode_w_render()
    print(f"Total reward from the episode {i + 1}: {total_reward}")
    
# env.close()

Total reward from the episode 1: -43.19117364930888


In [25]:
env.close()

In [10]:
agent.q_table

{(0.01,): [366.2202456951612, 11504.881429360406, 1087.6490088514588, 0, 0],
 (0.01, 1.0): [29.64955190324724,
  780.5633847142296,
  11716.90688103198,
  249.8519792659809,
  46.93757576578879],
 (0.01, 0.505, 1.0): [21.82343006816371,
  948.9885988275026,
  11929.976707023152,
  215.71586494251707,
  4.250745122399426],
 (0.01, 0.33999999999999997, 0.67, 1.0): [50.70541269358468,
  485.4321024852635,
  12143.66914519927,
  743.48219152789,
  15.635010471951222],
 (0.01, 0.2575, 0.505, 0.7525, 1.0): [12359.715360250812,
  1971.0868154992734,
  7437.656008919959,
  12143.66914519927,
  6930.710762133917]}

In [8]:
env.close()

In [27]:
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque

class DQNAgent:
    def __init__(self, env, learning_rate=0.001, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995, min_exploration_rate=0.01, memory_size=100000, batch_size=64):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.memory = deque(maxlen=memory_size)

        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.min_exploration_rate = min_exploration_rate
        self.batch_size = batch_size

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Builds a neural network for approximating Q-values."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        """Updates the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        valid_actions = self.env.controller.get_valid_actions()
        state = self.pad_state(state)  # Assuming state padding as discussed earlier

        if np.random.rand() < self.exploration_rate:
            return np.random.choice(valid_actions)  # Explore
        else:
            q_values = self.model.predict(state)[0]
            # Mask q_values of invalid actions with a very low number
            mask = -np.inf * np.ones(len(q_values))
            mask[valid_actions] = 0
            q_values_masked = q_values + mask
            return np.argmax(q_values_masked)  # Exploit

    def remember(self, state, action, reward, next_state, done):
        """Stores experience in replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Trains the model using randomly sampled experiences from the memory."""
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.discount_factor * np.amax(self.target_model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.exploration_rate > self.min_exploration_rate:
            self.exploration_rate *= self.exploration_decay
            
    def pad_state(self, state):
        """Pads the state vector with zeros if its length is less than the expected state size."""
        if not isinstance(state, np.ndarray):
            state = np.array(state)

        if state.ndim > 1:
            state = state.flatten()  # Flatten the state if it's not already 1D

        padded_state = np.zeros(self.state_size)
        start_index = self.state_size - len(state)
        padded_state[start_index:] = state
        return np.reshape(padded_state, [1, self.state_size])


    def run_episode(self):
        state = self.env.reset()
        state = self.pad_state(state)
        done = False
        total_reward = 0

        while not done:
            action = self.get_action(state)
            print(action)
            next_state, reward, done, truncated, info = self.env.step(action)
            next_state = self.pad_state(next_state)
            self.remember(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward
            self.replay()

        self.exploration_rate *= self.exploration_decay
        return total_reward

        self.exploration_rate *= self.exploration_decay
        return total_reward

In [28]:
dqn_agent = DQNAgent(env)

In [29]:
for i in range(10):
    total_reward = dqn_agent.run_episode()
    print(f"Total reward from the episode {i + 1}: {total_reward}")

0
2
0
2
2
2
2
0
2
4
2
0
2
4
1
2
2
2
1
1
1
1
1
2
2
2
3
0
0
4
0
2
4
0
0
0
0
4
1
2
2
1
1
2
1
1
2
3
0
0
4
2
2
1
3
0
0
4
1
1
3
2
2
0
0
2
0
4
1
3
1
3
2
1
2
3
2
1
1
2
3
2
0
4
2
1
1
2
1
1
3
2
2
2
2
0
2
0
0
0
Total reward from the episode 1: -20.737491436185024
2
2
2
1
3
1


KeyboardInterrupt: 

In [30]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Assuming env is your Gym environment
states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)

def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

ModuleNotFoundError: No module named 'rl'

In [None]:
 from stable_baselines3 import PPO
import gym

# Create environment
env = gym.make('YourCustomEnv-v0')  # Replace with your environment

# Create and train the model
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)
