In [1]:
from ExperienceBuffer import ExperienceBuffer
from envs import *
import pandas as pd
import random
import numpy as np
from QuantConvDQN import QuantConv
import collections
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import datetime
import torch.nn as nn


In [2]:
df = pd.read_csv('complete_df.csv',index_col=0)
df.index = pd.to_datetime(df.index)
close_prices = pd.read_csv('close.csv',index_col=0,header=None).iloc[:,0]
close_prices.index = pd.to_datetime(close_prices.index)
test_inds = df.index[df.index>datetime.datetime(2019,1,1)]
train_df = df.copy().drop(test_inds)
test_df = df.copy().loc[test_inds]
train_close = close_prices.copy().drop(test_inds)
test_close = close_prices.copy().loc[test_inds]

In [4]:
buffer = ExperienceBuffer(100000)
Experience = collections.namedtuple(
    'Experience', field_names=['state', 'action', 'reward',
                               'done', 'new_state'])
tau = torch.Tensor((2 * np.arange(200) + 1) / (2.0 * 200)).view(1, -1)

epsilon = 0.99
increment = 0.98/4000000

lr = 1e-7
gamma= 0.98

env = STEnv(train_df,train_close)
val_env = ValEnv(test_df,test_close)

net = QuantConv(env.state.shape,3).type(torch.float64)
tgt_net = QuantConv(env.state.shape,3).type(torch.float64)
best_net = QuantConv(env.state.shape,3).type(torch.float64)

for i in range(100):
    action = random.choice([0,1,2])
    state = env.state
    new_state, done, reward, _ = env.step(action)
    exp = Experience(state, action, reward, done, new_state)
    buffer.append(exp)

counter = 0
losses = collections.deque(maxlen=500)

optimizer = optim.Adam(net.parameters(), lr=lr)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10, verbose=True)
val_reward = -np.inf


In [None]:
start = i
for i in range(start,4000000):
    state = env.state
    if np.random.random(1) > epsilon:
        state_tensor = torch.tensor(state)
        with torch.no_grad():
            q_val = net(state_tensor.unsqueeze(0))
        action = int(q_val.mean(2).max(1)[1].item())
        
    else:
        action = random.choice([0,1,2])
    epsilon = max(0.01,epsilon-increment)
    new_state, done, reward, _ = env.step(action)
    exp = Experience(state, action, reward, done, new_state)
    buffer.append(exp)
    if len(buffer.buffer)<32:
        continue
    indices =  np.random.choice(len(buffer.buffer),32, replace=False)
    states = torch.tensor(np.array([buffer.buffer[ind][0] for ind in indices]))
    actions = torch.tensor(np.array([buffer.buffer[ind][1] for ind in indices]))
    rewards = torch.tensor(np.array([buffer.buffer[ind][2] for ind in indices]))
    dones = torch.BoolTensor(np.array([buffer.buffer[ind][3] for ind in indices]))
    next_states = torch.tensor(np.array([buffer.buffer[ind][4] for ind in indices]))
    
    Q1 = net(states)[range(32), actions.data]
    with torch.no_grad():
        
        Q2= tgt_net(next_states).detach()
        Q2 = Q2[range(32), Q2.mean(2).max(1)[1]]
        Q2[dones] = 0.0
        Q2 = Q2.detach()
    expected_state_action_values = gamma*Q2 + rewards.unsqueeze(1)
        
    diff = (expected_state_action_values.t().unsqueeze(-1) - Q1).detach()
    l = nn.SmoothL1Loss()(expected_state_action_values,Q1) * (tau-(diff<0).float()).abs()
    loss = l.mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    scheduler.step(loss.item())
    if i % 1000 == 0:
        
        tgt_net.load_state_dict(net.state_dict())
        print('episode: ',i)
        print('training loss: ',np.array(losses).mean())

        done = False
        actions = []
        while done == False:
            state = val_env.state

            state_tensor = torch.tensor(state)
            with torch.no_grad():
                q_val = net(state_tensor.unsqueeze(0))
            action = int(q_val.mean(2).max(1)[1].item())
            new_state, done, reward, _ = val_env.step(action)
            actions.append(action)
        reward /= 2000
        print(reward)
        if reward > val_reward:
            val_reward = reward
            best_net.load_state_dict(net.state_dict())

episode:  1000
training loss:  162.26587875366212
-0.1295387000000019
episode:  2000
training loss:  167.61660326004028
-0.13096525000000075
episode:  3000
training loss:  174.3464457836151
-0.16331840000000353
episode:  4000
training loss:  171.08782088661195
-0.20864105000000246
episode:  5000
training loss:  172.621382232666
-0.01107975000000306
