In [1]:
import os
print(os.getcwd())
import sys
sys.path.append('C:\\Users\\theal\\PycharmProjects')
sys.path.append('C:\\Users\\theal\\PycharmProjects\\stockgym')

C:\Users\theal\PycharmProjects\stockgym\src\agents


In [2]:
import numpy as np
import gym
import random
import time
from stockgym.src.envs.gym_up_and_to_the_right.up_and_to_the_right_env import UpAndToTheRightEnv

In [3]:
import numpy as np
import gym

class RandomActionAgent:
    def __init__(self, env):
        self.env = env

    def get_action(self):
        valid_actions = self.env.controller.get_valid_actions()
        action = np.random.choice(valid_actions)
        return action

    def run_episode(self):
        state = self.env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.get_action()
            next_state, reward, done, info = self.env.step(action)
            total_reward += reward
            state = next_state

        return total_reward

# Create the environment
env = UpAndToTheRightEnv(state_type="Basic", 
                         reward_type="FinalOnly",
                         num_prev_obvs=5,
                         price_movement_type="Linear",
                         offset_scaling=1.0,
                         scale=1.0,
                         slope=1.0,
                         noise=0.1,
                         starting_price=100,
                         num_steps=100,
                         multiple_units=True,
                         render=False)

# Create and run the agent
agent = RandomActionAgent(env)

for i in range(10):
    total_reward = agent.run_episode()
    print(f"Total reward from the episode {i + 1}: {total_reward}")

Total reward from the episode 1: 13.583092541572421
Total reward from the episode 2: -59.60287902968517
Total reward from the episode 3: 38.41056528315692
Total reward from the episode 4: -16.34824966865538
Total reward from the episode 5: 2.843988446780956
Total reward from the episode 6: -56.89730099175364
Total reward from the episode 7: 2.6848243277873562
Total reward from the episode 8: -41.752438634073684
Total reward from the episode 9: -17.645901032367405
Total reward from the episode 10: -50.43563122853185


In [4]:
class QLearningAgent:
    def __init__(self, env, learning_rate=.1, discount_factor=0.999, exploration_rate=1, exploration_decay=.99):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = {}  # Initialize an empty Q-table

    def get_action(self, state):
        state_key = self.state_to_key(state)
        valid_actions = self.env.controller.get_valid_actions()
        self.initialize_state_in_q_table(state_key)

        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(valid_actions)  # Explore
        else:
            # Exploit by selecting the best valid action based on Q-values
            q_values = np.array([self.q_table[state_key][a] for a in range(self.env.action_space.n)])
            # Mask q_values of invalid actions with a very low number
            mask = np.ones(len(q_values)) * -np.inf
            mask[valid_actions] = 0
            q_values_masked = q_values + mask
            return np.argmax(q_values_masked)  # Exploit

    def update_q_table(self, state, action, reward, next_state):
        state_key = self.state_to_key(state)
        next_state_key = self.state_to_key(next_state)

        # Ensure both the current and next states are in the Q-table
        self.initialize_state_in_q_table(state_key)
        self.initialize_state_in_q_table(next_state_key)

        # Find the best next action from Q-table
        best_next_action = np.argmax(self.q_table[next_state_key])
        td_target = reward + self.discount_factor * self.q_table[next_state_key][best_next_action]
        td_error = td_target - self.q_table[state_key][action]
        self.q_table[state_key][action] += self.learning_rate * td_error


    def initialize_state_in_q_table(self, state_key):
        if state_key not in self.q_table:
            self.q_table[state_key] = [0 for _ in range(self.env.action_space.n)]


    def state_to_key(self, state):
        return tuple(state)


    def run_episode(self):
        state = self.env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.get_action(state)
            next_state, reward, done, info = self.env.step(action)
            self.update_q_table(state, action, reward, next_state)

            total_reward += reward
            state = next_state

        self.exploration_rate *= self.exploration_decay
        return total_reward
    
    def run_episode_w_render(self):
        state = self.env.reset()
        done = False
        total_reward = 0

        while not done:
            action = self.get_action(state)
            next_state, reward, done, info = self.env.step(action)
            self.update_q_table(state, action, reward, next_state)

            total_reward += reward
            state = next_state
            self.env.render()
            time.sleep(1)

        self.exploration_rate *= self.exploration_decay
        return total_reward

In [5]:
env = UpAndToTheRightEnv(state_type="Basic", 
                         reward_type="FinalOnly",
                         num_prev_obvs=5,
                         price_movement_type="Linear",
                         offset_scaling=1.0,
                         scale=1.0,
                         slope=1.0,
                         noise=0.1,
                         starting_price=100,
                         num_steps=100,
                         multiple_units=True,
                         render=True)

agent = QLearningAgent(env)

In [6]:
for i in range(10):
    total_reward = agent.run_episode()
    print(f"Total reward from the episode {i + 1}: {total_reward}")

Total reward from the episode 1: -14.0900322542956
Total reward from the episode 2: 85.26661319501335
Total reward from the episode 3: 36.63999018780939
Total reward from the episode 4: 21.667132839599155
Total reward from the episode 5: -131.70016943196237
Total reward from the episode 6: 13.377931901463215
Total reward from the episode 7: 23.72882952961703
Total reward from the episode 8: -2.9218221266281716
Total reward from the episode 9: -8.083957743698889
Total reward from the episode 10: 63.8240008554983


In [7]:
total_reward = agent.run_episode_w_render()


ValueError: Given implementation shouldn't be here

In [8]:
print(total_reward)

3374.5160847884927


In [10]:
agent.q_table

{(0.01,): [366.2202456951612, 11504.881429360406, 1087.6490088514588, 0, 0],
 (0.01, 1.0): [29.64955190324724,
  780.5633847142296,
  11716.90688103198,
  249.8519792659809,
  46.93757576578879],
 (0.01, 0.505, 1.0): [21.82343006816371,
  948.9885988275026,
  11929.976707023152,
  215.71586494251707,
  4.250745122399426],
 (0.01, 0.33999999999999997, 0.67, 1.0): [50.70541269358468,
  485.4321024852635,
  12143.66914519927,
  743.48219152789,
  15.635010471951222],
 (0.01, 0.2575, 0.505, 0.7525, 1.0): [12359.715360250812,
  1971.0868154992734,
  7437.656008919959,
  12143.66914519927,
  6930.710762133917]}

In [8]:
env.close()

In [27]:
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from collections import deque

class DQNAgent:
    def __init__(self, env, learning_rate=0.001, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995, min_exploration_rate=0.01, memory_size=100000, batch_size=64):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.memory = deque(maxlen=memory_size)

        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.min_exploration_rate = min_exploration_rate
        self.batch_size = batch_size

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Builds a neural network for approximating Q-values."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def update_target_model(self):
        """Updates the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        valid_actions = self.env.controller.get_valid_actions()
        state = self.pad_state(state)  # Assuming state padding as discussed earlier

        if np.random.rand() < self.exploration_rate:
            return np.random.choice(valid_actions)  # Explore
        else:
            q_values = self.model.predict(state)[0]
            # Mask q_values of invalid actions with a very low number
            mask = -np.inf * np.ones(len(q_values))
            mask[valid_actions] = 0
            q_values_masked = q_values + mask
            return np.argmax(q_values_masked)  # Exploit

    def remember(self, state, action, reward, next_state, done):
        """Stores experience in replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Trains the model using randomly sampled experiences from the memory."""
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.discount_factor * np.amax(self.target_model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.exploration_rate > self.min_exploration_rate:
            self.exploration_rate *= self.exploration_decay
            
    def pad_state(self, state):
        """Pads the state vector with zeros if its length is less than the expected state size."""
        if not isinstance(state, np.ndarray):
            state = np.array(state)

        if state.ndim > 1:
            state = state.flatten()  # Flatten the state if it's not already 1D

        padded_state = np.zeros(self.state_size)
        start_index = self.state_size - len(state)
        padded_state[start_index:] = state
        return np.reshape(padded_state, [1, self.state_size])


    def run_episode(self):
        state = self.env.reset()
        state = self.pad_state(state)
        done = False
        total_reward = 0

        while not done:
            action = self.get_action(state)
            print(action)
            next_state, reward, done, _ = self.env.step(action)
            next_state = self.pad_state(next_state)
            self.remember(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward
            self.replay()

        self.exploration_rate *= self.exploration_decay
        return total_reward

        self.exploration_rate *= self.exploration_decay
        return total_reward

In [28]:
dqn_agent = DQNAgent(env)

In [29]:
for i in range(10):
    total_reward = dqn_agent.run_episode()
    print(f"Total reward from the episode {i + 1}: {total_reward}")

0
2
0
2
2
2
2
0
2
4
2
0
2
4
1
2
2
2
1
1
1
1
1
2
2
2
3
0
0
4
0
2
4
0
0
0
0
4
1
2
2
1
1
2
1
1
2
3
0
0
4
2
2
1
3
0
0
4
1
1
3
2
2
0
0
2
0
4
1
3
1
3
2
1
2
3
2
1
1
2
3
2
0
4
2
1
1
2
1
1
3
2
2
2
2
0
2
0
0
0
Total reward from the episode 1: -20.737491436185024
2
2
2
1
3
1


KeyboardInterrupt: 

In [30]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Assuming env is your Gym environment
states = env.observation_space.shape
actions = env.action_space.n

def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(24, activation='relu', input_shape=states))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

model = build_model(states, actions)

def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

ModuleNotFoundError: No module named 'rl'

In [None]:
 from stable_baselines3 import PPO
import gym

# Create environment
env = gym.make('YourCustomEnv-v0')  # Replace with your environment

# Create and train the model
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)
