In [1]:
%load_ext autoreload
%autoreload 2

import argparse
import time
import numpy as np
import collections

import torch
import torch.nn as nn
import torch.optim as optim

from trading_session_gym.envs.trading_session_gym import TradingSession

import warnings
warnings.filterwarnings('ignore')

In [2]:
# One environment transition as stored in the replay buffer. The field order
# (state, action, reward, done, new_state) is the order ExperienceBuffer.sample
# unpacks the columns in.
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Items are kept in a ``collections.deque`` created with ``maxlen=capacity``,
    so appending to a full buffer discards the oldest item.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` items."""
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Add one transition at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Each stored item is expected to unpack into five fields in the order
        (state, action, reward, done, new_state).

        Returns:
            A 5-tuple of NumPy arrays ``(states, actions, rewards, dones,
            next_states)``; ``rewards`` is cast to ``float32``, the other
            arrays keep whatever dtype NumPy infers.

        Raises:
            ValueError: propagated from ``np.random.choice`` when
                ``batch_size`` exceeds the number of stored items.
        """
        # replace=False guarantees no transition appears twice in one batch.
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        # Transpose the list of 5-field records into five per-field columns.
        columns = tuple(zip(*(self.buffer[i] for i in picked)))
        states, actions, rewards, dones, next_states = columns
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones),
            np.array(next_states),
        )

In [3]:
class DQN(nn.Module):
    """Fully connected Q-network with one hidden layer.

    The hidden layer has the same width as the input (``n_inputs``); a ReLU
    separates it from the linear output layer, which emits ``n_outputs``
    values. The class holds a single network only; a separate target copy,
    if wanted, is a second instance created by the caller.
    """

    def __init__(self, n_inputs, n_outputs):
        """Build the two linear layers and wrap them in ``self.fc``."""
        super().__init__()
        # Layers are created in forward order so seeded initialisation matches
        # the order in which they appear inside the Sequential container.
        hidden = nn.Linear(n_inputs, n_inputs)
        head = nn.Linear(n_inputs, n_outputs)
        self.fc = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return the network output for ``x`` after casting it to float32."""
        # The cast lets integer or float64 inputs match the float32 weights.
        return self.fc(x.float())

In [4]:
class Agent:
    """Plays an environment one step at a time and records every transition.

    The agent keeps the current observation and the running episode reward.
    Each call to ``play_step`` picks an action (epsilon-greedy), steps the
    environment, and appends the resulting ``Experience`` to ``exp_buffer``.
    """

    def __init__(self, env, exp_buffer):
        """Store the environment and buffer, then start the first episode.

        Args:
            env: object providing ``reset()``, ``step(action)`` returning a
                4-tuple ``(new_state, reward, is_done, info)``, and
                ``action_space.sample()``.
            exp_buffer: object with an ``append(experience)`` method.
        """
        self.env = env
        self.exp_buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Reset the agent's own environment and zero the episode reward."""
        # Use self.env, not a module-level `env`: the agent must act on the
        # environment it was constructed with.
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Take one epsilon-greedy step and store the transition.

        Args:
            net: callable mapping a batch of states (tensor with a leading
                batch axis) to per-action values of shape ``(batch, n_actions)``.
            epsilon: probability of choosing a random action instead of the
                arg-max action of ``net``.
            device: device the state tensor is moved to before calling ``net``.

        Returns:
            The total reward of the episode if this step ended it, otherwise
            ``None``. When an episode ends the environment is reset.
        """
        done_reward = None

        if np.random.random() < epsilon:
            # Explore: sample from this agent's environment, not a global one.
            action = self.env.action_space.sample()
        else:
            # Add a leading batch axis. np.expand_dims is used instead of
            # np.array([...], copy=False), which raises on NumPy >= 2.0
            # because building an array from a list always needs a copy.
            state_a = np.expand_dims(np.asarray(self.state), axis=0)
            state_v = torch.tensor(state_a).to(device)
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                q_vals_v = net(state_v)
            _, act_v = torch.max(q_vals_v, dim=1)
            action = int(act_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            # Capture the episode total before _reset() zeroes it.
            done_reward = self.total_reward
            self._reset()
        return done_reward

In [5]:
def calc_loss(batch, net, tgt_net, device="cpu", cuda_async=False, gamma=None):
    """Compute the one-step TD mean-squared-error loss for a sampled batch.

    Args:
        batch: 5-tuple ``(states, actions, rewards, dones, next_states)`` of
            array-likes with a shared leading batch axis.
        net: network being trained; called on ``states`` and expected to
            return per-action values of shape ``(batch, n_actions)``.
        tgt_net: network used to evaluate ``next_states``; no gradient flows
            through it.
        device: device (string or ``torch.device``) the tensors are moved to.
        cuda_async: passed as ``non_blocking`` when moving tensors to
            ``device``.
        gamma: discount factor. ``None`` (the default) falls back to the
            module-level ``GAMMA`` constant.

    Returns:
        Scalar tensor: MSE between ``Q(s, a)`` from ``net`` and
        ``reward + gamma * max_a' Q_tgt(s', a')``, where the bootstrapped term
        is zeroed for transitions flagged as done.
    """
    if gamma is None:
        gamma = GAMMA

    states, actions, rewards, dones, next_states = batch

    def _to_device(array, dtype=None):
        # One transfer per tensor; non_blocking is honoured for any device
        # spelling, unlike a `device == "cuda"` string comparison, which is
        # false when a torch.device object is passed.
        return torch.as_tensor(array, dtype=dtype).to(device, non_blocking=cuda_async)

    states_v = _to_device(states)
    next_states_v = _to_device(next_states)
    actions_v = _to_device(actions)
    rewards_v = _to_device(rewards)
    # Boolean mask: indexing/masking with uint8 (ByteTensor) is deprecated.
    done_mask = _to_device(dones, dtype=torch.bool)

    # Q-value of the action actually taken in each sampled state.
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1).long()).squeeze(-1)

    # Targets are constants with respect to the optimiser.
    with torch.no_grad():
        next_state_values = tgt_net(next_states_v).max(1)[0]
        # No future value after a terminal transition.
        next_state_values = next_state_values.masked_fill(done_mask, 0.0)

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [6]:
# Training stops once the mean reward over the last 100 finished episodes
# exceeds this value.
MEAN_REWARD_BOUND = 70

# Discount factor applied to the target network's next-state value in calc_loss.
# NOTE(review): with GAMMA = 0 the TD target reduces to the immediate reward,
# so the target network's output never influences the loss -- confirm intended.
GAMMA = 0
# Number of transitions sampled from the replay buffer per optimisation step.
BATCH_SIZE = 100
# Maximum number of transitions kept in the replay buffer.
REPLAY_SIZE = 10000
# Learning rate passed to the Adam optimiser.
LEARNING_RATE = 1e-4
# The target network is overwritten with the online network's weights whenever
# the step counter is a multiple of this value.
SYNC_TARGET_STEPS = 1000
# No optimisation step is taken until the buffer holds this many transitions.
# Equal to REPLAY_SIZE, so learning starts only once the buffer is full.
REPLAY_START_SIZE = 10000

# Epsilon schedule: epsilon = max(EPSILON_FINAL, EPSILON_START - step / EPSILON_DECAY),
# i.e. a linear decay that reaches EPSILON_FINAL after 98,000 steps.
EPSILON_DECAY = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.02

In [None]:
# Training driver: builds the environment, networks and replay buffer, then
# alternates between playing one environment step and (once the buffer holds
# REPLAY_START_SIZE transitions) one optimisation step, until the mean reward
# of the last 100 episodes exceeds MEAN_REWARD_BOUND.
device = torch.device("cpu")

env = TradingSession(action_space_config = 'discrete')

n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

net = DQN(n_observations, n_actions).to(device)
tgt_net = DQN(n_observations, n_actions).to(device)
# Start the target network as an exact copy of the online network. Without
# this, targets come from an independently initialised network until the
# first periodic sync.
tgt_net.load_state_dict(net.state_dict())

print(net)

buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPSILON_START

optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
total_rewards = []
step_idx = 0
best_mean_reward = None

while True:
    step_idx += 1
    # Linear epsilon decay, clipped at EPSILON_FINAL.
    epsilon = max(EPSILON_FINAL, EPSILON_START - step_idx / EPSILON_DECAY)

    # play_step returns the episode's total reward only on its final step.
    reward = agent.play_step(net, epsilon, device=device)

    if reward is not None:
        total_rewards.append(reward)
        mean_reward = np.mean(total_rewards[-100:])
        print("%d: done %d episodes, mean reward %.3f, eps %.2f" % (step_idx, len(total_rewards), mean_reward, epsilon))
        if best_mean_reward is None or best_mean_reward < mean_reward:
            # Checkpoint whenever the running mean improves.
            torch.save(net.state_dict(), "model.dat")
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
            best_mean_reward = mean_reward
        if mean_reward > MEAN_REWARD_BOUND:
            print("Solved in %d steps!" % step_idx)
            break

    # Keep collecting experience until the buffer is large enough to sample from.
    if len(buffer) < REPLAY_START_SIZE:
        continue

    if step_idx % SYNC_TARGET_STEPS == 0:
        tgt_net.load_state_dict(net.state_dict())

    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    # Non-blocking transfers only make sense when the tensors go to a GPU.
    loss_t = calc_loss(batch, net, tgt_net, device=device, cuda_async=(device.type == "cuda"))
    loss_t.backward()
    optimizer.step()

DQN(
  (fc): Sequential(
    (0): Linear(in_features=24, out_features=24, bias=True)
    (1): ReLU()
    (2): Linear(in_features=24, out_features=13, bias=True)
  )
)
1440: done 1 episodes, mean reward 56.523, eps 0.99
2880: done 2 episodes, mean reward 57.950, eps 0.97
Best mean reward updated 56.523 -> 57.950, model saved
4320: done 3 episodes, mean reward 57.824, eps 0.96
5760: done 4 episodes, mean reward 58.213, eps 0.94
Best mean reward updated 57.950 -> 58.213, model saved
7200: done 5 episodes, mean reward 57.831, eps 0.93
8640: done 6 episodes, mean reward 57.859, eps 0.91
10080: done 7 episodes, mean reward 57.701, eps 0.90
11520: done 8 episodes, mean reward 57.564, eps 0.88
12960: done 9 episodes, mean reward 58.051, eps 0.87
14400: done 10 episodes, mean reward 58.231, eps 0.86
Best mean reward updated 58.213 -> 58.231, model saved
15840: done 11 episodes, mean reward 58.267, eps 0.84
Best mean reward updated 58.231 -> 58.267, model saved
17280: done 12 episodes, mean rewa

201600: done 140 episodes, mean reward 55.447, eps 0.02
203040: done 141 episodes, mean reward 55.448, eps 0.02
204480: done 142 episodes, mean reward 55.428, eps 0.02
205920: done 143 episodes, mean reward 55.372, eps 0.02
207360: done 144 episodes, mean reward 55.296, eps 0.02
208800: done 145 episodes, mean reward 55.277, eps 0.02
210240: done 146 episodes, mean reward 55.238, eps 0.02
211680: done 147 episodes, mean reward 55.241, eps 0.02
213120: done 148 episodes, mean reward 55.226, eps 0.02
214560: done 149 episodes, mean reward 55.268, eps 0.02
216000: done 150 episodes, mean reward 55.238, eps 0.02
217440: done 151 episodes, mean reward 55.238, eps 0.02
218880: done 152 episodes, mean reward 55.253, eps 0.02
220320: done 153 episodes, mean reward 55.249, eps 0.02
221760: done 154 episodes, mean reward 55.276, eps 0.02
223200: done 155 episodes, mean reward 55.266, eps 0.02
224640: done 156 episodes, mean reward 55.251, eps 0.02
226080: done 157 episodes, mean reward 55.253, e

413280: done 287 episodes, mean reward 57.060, eps 0.02
414720: done 288 episodes, mean reward 57.027, eps 0.02
416160: done 289 episodes, mean reward 57.076, eps 0.02
417600: done 290 episodes, mean reward 57.125, eps 0.02
419040: done 291 episodes, mean reward 57.147, eps 0.02
420480: done 292 episodes, mean reward 57.216, eps 0.02
421920: done 293 episodes, mean reward 57.255, eps 0.02
423360: done 294 episodes, mean reward 57.330, eps 0.02
424800: done 295 episodes, mean reward 57.347, eps 0.02
426240: done 296 episodes, mean reward 57.496, eps 0.02
427680: done 297 episodes, mean reward 57.636, eps 0.02
429120: done 298 episodes, mean reward 57.702, eps 0.02
430560: done 299 episodes, mean reward 57.760, eps 0.02
432000: done 300 episodes, mean reward 57.771, eps 0.02
433440: done 301 episodes, mean reward 57.839, eps 0.02
434880: done 302 episodes, mean reward 57.951, eps 0.02
436320: done 303 episodes, mean reward 57.953, eps 0.02
437760: done 304 episodes, mean reward 58.141, e

551520: done 383 episodes, mean reward 62.518, eps 0.02
Best mean reward updated 62.409 -> 62.518, model saved
552960: done 384 episodes, mean reward 62.510, eps 0.02
554400: done 385 episodes, mean reward 62.514, eps 0.02
555840: done 386 episodes, mean reward 62.501, eps 0.02
557280: done 387 episodes, mean reward 62.519, eps 0.02
Best mean reward updated 62.518 -> 62.519, model saved
558720: done 388 episodes, mean reward 62.576, eps 0.02
Best mean reward updated 62.519 -> 62.576, model saved
560160: done 389 episodes, mean reward 62.593, eps 0.02
Best mean reward updated 62.576 -> 62.593, model saved
561600: done 390 episodes, mean reward 62.606, eps 0.02
Best mean reward updated 62.593 -> 62.606, model saved
563040: done 391 episodes, mean reward 62.694, eps 0.02
Best mean reward updated 62.606 -> 62.694, model saved
564480: done 392 episodes, mean reward 62.707, eps 0.02
Best mean reward updated 62.694 -> 62.707, model saved
565920: done 393 episodes, mean reward 62.799, eps 0.02

721440: done 501 episodes, mean reward 64.012, eps 0.02
Best mean reward updated 64.003 -> 64.012, model saved
722880: done 502 episodes, mean reward 64.056, eps 0.02
Best mean reward updated 64.012 -> 64.056, model saved
724320: done 503 episodes, mean reward 63.996, eps 0.02
725760: done 504 episodes, mean reward 63.905, eps 0.02
727200: done 505 episodes, mean reward 63.946, eps 0.02
728640: done 506 episodes, mean reward 63.905, eps 0.02
730080: done 507 episodes, mean reward 63.914, eps 0.02
731520: done 508 episodes, mean reward 63.913, eps 0.02
732960: done 509 episodes, mean reward 63.911, eps 0.02
734400: done 510 episodes, mean reward 63.958, eps 0.02
735840: done 511 episodes, mean reward 64.097, eps 0.02
Best mean reward updated 64.056 -> 64.097, model saved
737280: done 512 episodes, mean reward 64.318, eps 0.02
Best mean reward updated 64.097 -> 64.318, model saved
738720: done 513 episodes, mean reward 64.302, eps 0.02
740160: done 514 episodes, mean reward 64.322, eps 0

908640: done 631 episodes, mean reward 64.009, eps 0.02
910080: done 632 episodes, mean reward 63.996, eps 0.02
911520: done 633 episodes, mean reward 64.043, eps 0.02
912960: done 634 episodes, mean reward 64.047, eps 0.02
914400: done 635 episodes, mean reward 64.065, eps 0.02
915840: done 636 episodes, mean reward 63.894, eps 0.02
917280: done 637 episodes, mean reward 63.834, eps 0.02
918720: done 638 episodes, mean reward 63.857, eps 0.02
920160: done 639 episodes, mean reward 63.908, eps 0.02
921600: done 640 episodes, mean reward 63.888, eps 0.02
923040: done 641 episodes, mean reward 63.862, eps 0.02
924480: done 642 episodes, mean reward 63.780, eps 0.02
925920: done 643 episodes, mean reward 63.810, eps 0.02
927360: done 644 episodes, mean reward 63.947, eps 0.02
928800: done 645 episodes, mean reward 63.922, eps 0.02
930240: done 646 episodes, mean reward 63.967, eps 0.02
931680: done 647 episodes, mean reward 63.897, eps 0.02
933120: done 648 episodes, mean reward 64.046, e

1117440: done 776 episodes, mean reward 64.080, eps 0.02
1118880: done 777 episodes, mean reward 64.167, eps 0.02
1120320: done 778 episodes, mean reward 64.153, eps 0.02
1121760: done 779 episodes, mean reward 64.182, eps 0.02
1123200: done 780 episodes, mean reward 64.164, eps 0.02
1124640: done 781 episodes, mean reward 64.066, eps 0.02
1126080: done 782 episodes, mean reward 64.077, eps 0.02
1127520: done 783 episodes, mean reward 64.096, eps 0.02
1128960: done 784 episodes, mean reward 64.112, eps 0.02
1130400: done 785 episodes, mean reward 64.002, eps 0.02
1131840: done 786 episodes, mean reward 63.923, eps 0.02
1133280: done 787 episodes, mean reward 63.800, eps 0.02
1134720: done 788 episodes, mean reward 63.859, eps 0.02
1136160: done 789 episodes, mean reward 63.847, eps 0.02
1137600: done 790 episodes, mean reward 64.042, eps 0.02
1139040: done 791 episodes, mean reward 64.038, eps 0.02
1140480: done 792 episodes, mean reward 64.052, eps 0.02
1141920: done 793 episodes, mea

1324800: done 920 episodes, mean reward 64.046, eps 0.02
1326240: done 921 episodes, mean reward 64.072, eps 0.02
1327680: done 922 episodes, mean reward 64.084, eps 0.02
1329120: done 923 episodes, mean reward 64.048, eps 0.02
1330560: done 924 episodes, mean reward 64.080, eps 0.02
1332000: done 925 episodes, mean reward 64.061, eps 0.02
1333440: done 926 episodes, mean reward 64.086, eps 0.02
1334880: done 927 episodes, mean reward 64.126, eps 0.02
1336320: done 928 episodes, mean reward 64.140, eps 0.02
1337760: done 929 episodes, mean reward 64.023, eps 0.02
1339200: done 930 episodes, mean reward 64.082, eps 0.02
1340640: done 931 episodes, mean reward 64.105, eps 0.02
1342080: done 932 episodes, mean reward 64.102, eps 0.02
1343520: done 933 episodes, mean reward 64.096, eps 0.02
1344960: done 934 episodes, mean reward 64.185, eps 0.02
1346400: done 935 episodes, mean reward 64.209, eps 0.02
1347840: done 936 episodes, mean reward 64.226, eps 0.02
1349280: done 937 episodes, mea

1520640: done 1056 episodes, mean reward 64.885, eps 0.02
1522080: done 1057 episodes, mean reward 64.929, eps 0.02
1523520: done 1058 episodes, mean reward 64.899, eps 0.02
1524960: done 1059 episodes, mean reward 64.909, eps 0.02
1526400: done 1060 episodes, mean reward 64.802, eps 0.02
1527840: done 1061 episodes, mean reward 64.628, eps 0.02
1529280: done 1062 episodes, mean reward 64.998, eps 0.02
1530720: done 1063 episodes, mean reward 65.005, eps 0.02
1532160: done 1064 episodes, mean reward 64.963, eps 0.02
1533600: done 1065 episodes, mean reward 64.939, eps 0.02
1535040: done 1066 episodes, mean reward 65.045, eps 0.02
1536480: done 1067 episodes, mean reward 65.061, eps 0.02
1537920: done 1068 episodes, mean reward 65.052, eps 0.02
1539360: done 1069 episodes, mean reward 64.974, eps 0.02
1540800: done 1070 episodes, mean reward 64.930, eps 0.02
1542240: done 1071 episodes, mean reward 64.821, eps 0.02
1543680: done 1072 episodes, mean reward 65.010, eps 0.02
1545120: done 

1725120: done 1198 episodes, mean reward 64.149, eps 0.02
1726560: done 1199 episodes, mean reward 64.141, eps 0.02
1728000: done 1200 episodes, mean reward 64.214, eps 0.02
1729440: done 1201 episodes, mean reward 64.268, eps 0.02
1730880: done 1202 episodes, mean reward 64.341, eps 0.02
1732320: done 1203 episodes, mean reward 64.351, eps 0.02
1733760: done 1204 episodes, mean reward 64.394, eps 0.02
1735200: done 1205 episodes, mean reward 64.416, eps 0.02
1736640: done 1206 episodes, mean reward 64.364, eps 0.02
1738080: done 1207 episodes, mean reward 64.278, eps 0.02
1739520: done 1208 episodes, mean reward 64.308, eps 0.02
1740960: done 1209 episodes, mean reward 64.234, eps 0.02
1742400: done 1210 episodes, mean reward 64.263, eps 0.02
1743840: done 1211 episodes, mean reward 64.288, eps 0.02
1745280: done 1212 episodes, mean reward 64.268, eps 0.02
1746720: done 1213 episodes, mean reward 64.352, eps 0.02
1748160: done 1214 episodes, mean reward 64.370, eps 0.02
1749600: done 

1928160: done 1339 episodes, mean reward 64.119, eps 0.02
1929600: done 1340 episodes, mean reward 64.200, eps 0.02
1931040: done 1341 episodes, mean reward 64.253, eps 0.02
1932480: done 1342 episodes, mean reward 64.243, eps 0.02
1933920: done 1343 episodes, mean reward 64.204, eps 0.02
1935360: done 1344 episodes, mean reward 64.184, eps 0.02
1936800: done 1345 episodes, mean reward 64.078, eps 0.02
1938240: done 1346 episodes, mean reward 64.036, eps 0.02
1939680: done 1347 episodes, mean reward 64.032, eps 0.02
1941120: done 1348 episodes, mean reward 64.040, eps 0.02
1942560: done 1349 episodes, mean reward 64.032, eps 0.02
1944000: done 1350 episodes, mean reward 63.979, eps 0.02
1945440: done 1351 episodes, mean reward 63.909, eps 0.02
1946880: done 1352 episodes, mean reward 63.733, eps 0.02
1948320: done 1353 episodes, mean reward 63.707, eps 0.02
1949760: done 1354 episodes, mean reward 63.746, eps 0.02
1951200: done 1355 episodes, mean reward 63.741, eps 0.02
1952640: done 