# Deep Q Network

Train a Q(s, a) function, using trajectories of (s, a, r, s', a', r', s'', ...) from matches.

Using a memory of previous (s, a, r, s'), and double Q networks

In [1]:
# PARAMS
EXPERT_PLAYER_IDX = 0
USE_GPU = False

GAMES = [{"id": gameID} for gameID in ["noBoost1v1_1", "noBoost1v1_2", "noBoost1v1_3"]]

# NOTE: can't use __file__ in jupyter notebook unfortunately, so hard-code it insteadL
ROOT_PROJECT_PATH = 'C:/Users/User/code/CPSC533V/project'

In [2]:
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import sys
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
DEVICE = torch.device("cuda" if USE_GPU else "cpu")

# Load local reusable code within 'lib/'
sys.path.append(ROOT_PROJECT_PATH)
import lib.files as libFiles
import lib.preprocess as libPreprocess
from lib.SAtoVModel import SAtoV_Model

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)

In [3]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

## Load and preprocess all replays

In [4]:
for game in GAMES:
    game['data'] = libFiles.parseReplayToGameData(game['id'])
    libFiles.cleanAndDisplayGameData(game['data'])
    
    game['playerStates'], game['playerActions'] = [], []
    for p in game['data'].players:
        game['playerStates' ].append(libPreprocess.cleanPlayerStates( p.data    ))
        game['playerActions'].append(libPreprocess.cleanPlayerActions(p.controls))
    game['ballStates'] = libPreprocess.cleanBallStates(game['data'].ball)
    print ("Game data from '%s' preprocessed." % game['id'])
    print ("    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys\n")
    
    game['expertStates'], game['expertActions'] = libPreprocess.stateAndActionsForPlayer(game, EXPERT_PLAYER_IDX)
    print ("State and action maps for player %s" % game['data'].players[EXPERT_PLAYER_IDX].name)
    print (game['expertStates'].values.shape, game['expertStates'].values.dtype)
    print (game['expertActions'].values.shape, game['expertActions'].values.dtype)
    print ("=======\n\n")

Loading...
	replays\noBoost1v1_1.replay


Could not find field_of_view in camera settings for Sundown
Could not find height in camera settings for Sundown
Could not find pitch in camera settings for Sundown
Could not find distance in camera settings for Sundown
Could not find stiffness in camera settings for Sundown
Could not find swivel_speed in camera settings for Sundown
Could not find transition_speed in camera settings for Sundown
  rhs[1] / (T_p + np.sign(rhs[1]) * omega[1] * D_p),
  rhs[2] / (T_y - np.sign(rhs[2]) * omega[2] * D_y)


2 players loaded!

Orange team:
	bot

Blue team:
	expert

12921 data points acquired
====


Game data from 'noBoost1v1_1' preprocessed.
    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys

State and action maps for player expert
(12921, 29) float64
(12921, 3) object


Loading...
	replays\noBoost1v1_2.replay


Could not find field_of_view in camera settings for Beast
Could not find height in camera settings for Beast
Could not find pitch in camera settings for Beast
Could not find distance in camera settings for Beast
Could not find stiffness in camera settings for Beast
Could not find swivel_speed in camera settings for Beast
Could not find transition_speed in camera settings for Beast


2 players loaded!

Orange team:
	bot

Blue team:
	expert

12016 data points acquired
====


Game data from 'noBoost1v1_2' preprocessed.
    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys

State and action maps for player expert
(12016, 29) float64
(12016, 3) object


Loading...
	replays\noBoost1v1_3.replay


Could not find field_of_view in camera settings for Middy
Could not find height in camera settings for Middy
Could not find pitch in camera settings for Middy
Could not find distance in camera settings for Middy
Could not find stiffness in camera settings for Middy
Could not find swivel_speed in camera settings for Middy
Could not find transition_speed in camera settings for Middy


2 players loaded!

Orange team:
	bot

Blue team:
	expert

12682 data points acquired
====


Game data from 'noBoost1v1_3' preprocessed.
    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys

State and action maps for player expert
(12682, 29) float64
(12682, 3) object




In [34]:
TOTAL_EPOCHS = 500
BATCH_SZ = 100
#PRINT_INTERVAL = 1000
LOG_INTERVAL = 1000
LEARNING_RATE = 0.0001
REGULARIZER_WEIGHT = 3e-4
GAMMA = 0.7
Q_SWAP_EPOCHS = 20

W_ALOSS = 100
W_DLOSS = 50

writer = SummaryWriter(filename_suffix="dqn")

"""
# Continuous 3D action space, so we can't find the maximum easily.
# Instead, sample over a small subset of actions
#  * Throttle takes values [100% back, nothing, 100% forwards]
#  * Steer takes values [100% left, nothing, 100% right]
def maxQ(state, qModel, nT=3, nS=3):
    batchMax = None
    for tValue in range(nT):
        throttle = 2 * tValue / (nT - 1) - 1 # [-1, 1]
        for sValue in range(nS):
            steer = 2 * sValue / (nS - 1) - 1 # [-1, 1]
            for boost in [0.0, 1.0]:
                aArray = np.repeat(np.array([[tValue, sValue, boost]]), state.shape[0], axis=0)
                sa = torch.cat((state, torch.from_numpy(aArray).float()), dim=1)
                q = qModel(sa).detach().numpy() # Detach, we don't need gradients through here
                if batchMax is None:
                    batchMax = q
                else:
                    batchMax = np.maximum(batchMax, q)
    return batchMax
"""

def train_behavioral_cloning(dataBatches, thisQ, nextQ):   
    # Adam optimizer usually a good default.
    optimizer = torch.optim.Adam(thisQ.parameters(), lr=LEARNING_RATE, weight_decay=REGULARIZER_WEIGHT)
    
    # MSE loss
    loss_function = torch.nn.MSELoss().to(DEVICE)

    gradient_steps = 0

    for epoch in range(1, TOTAL_EPOCHS + 1):
        batchShuffled = random.sample(dataBatches, len(dataBatches))
        lastLoss, lastAverageR, lastAverageQ = -1, -1, -1
        for iteration, data in enumerate(batchShuffled):
            data = {k: v.to(DEVICE) for k, v in data.items()}
            saCombined = torch.cat((data['s'], data['a']), dim=1)
            
            maxQCalc = libRewards.bestQ(nextQ, data['sPrime'], returnAction=False)
            y_j_torch = data['r'] + GAMMA * maxQCalc
            y_pred = thisQ(saCombined)
    
            # Gradient descent on MSE loss between predicted and calculated Q
            loss = loss_function(y_j_torch, y_pred)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if gradient_steps % LOG_INTERVAL == 0:
                writer.add_scalar('loss', loss.item(), gradient_steps)
            
            gradient_steps += 1
            
            if iteration == len(batchShuffled) - 1:
                lastLoss = loss.item()
                lastAverageR = np.mean(data['r'].detach().numpy())
                lastAverageQ = np.mean(maxQCalc)
            
        print ('[epoch {:4d}/{}] [iter {:7d}] [loss {:.5f}] [av. R {:.5f}] [av. Q {:.5f}]'.format(
           epoch, TOTAL_EPOCHS, gradient_steps, lastLoss, lastAverageR, lastAverageQ)
        )
        
        if (epoch + 1) % Q_SWAP_EPOCHS == 0:
            nextQ.load_state_dict(thisQ.state_dict())
            nextQ.eval()
            print ("Copying Q weights")

In [6]:
"""
class QModel(nn.Module):
    def __init__(self, state_size, action_size):
        super(QModel, self).__init__()
        
        self.dimIn          = state_size + action_size # (s, a)
        self.dimH1          = 32 # hidden layer 1 has 32 dimensions
        self.dimH2          = 8  # hidden layer 2 has 32 dimensions
        self.dimOut         = 1 # scalar value, Q(s, a)
     
        self.model = torch.nn.Sequential(
            nn.Linear(self.dimIn, self.dimH1),
            nn.BatchNorm1d(self.dimH1),
            nn.ReLU(),
            nn.Linear(self.dimH1, self.dimH2),
            nn.BatchNorm1d(self.dimH2),
            nn.ReLU(),
            nn.Linear(self.dimH2, self.dimOut),
        )
        self.model.to(device)

    def forward(self, x):
        # input data type needs to be converted to float
        return self.model(x.float())
        
    def save(self, modelID):
        path = os.path.join("models", "%s.pt" % modelID)
        torch.save(self.state_dict(), path)
        print('Saved model!\n\t%s' % path)
        
    def load(self, modelID):
        path = os.path.join("models", "%s.pt" % modelID)
        self.load_state_dict(torch.load(path))
        print('Loaded model!\n\t%s' % path)
""";    

In [22]:
"""
random.seed(1234)

# Return list of batches, each a list of (s, a, r, s') tuples
def dataToBatches(states, actions, batchSz):
    nRows = states.shape[0]
    stateSz = states.shape[1]
    actionSz = actions.shape[1]
    print ("%d rows, %d state dim, %d action dim, into %d batches of size %d" % (
        nRows, stateSz, actionSz, (nRows + batchSz - 1) // batchSz, batchSz
    ))

    tOrder = list(range(0, nRows - 1))
    random.shuffle(tOrder)
    
    dataBatches = []
    for i in range(0, len(tOrder), batchSz):
        nInBatch = min(batchSz, len(tOrder) - i)
        
        s = np.zeros((nInBatch, stateSz))
        a = np.zeros((nInBatch, actionSz))
        r = np.zeros((nInBatch))
        sPrime = np.zeros((nInBatch, stateSz))
        
        for j in range(nInBatch):
            t = tOrder[i + j]
            s[j, :] = states.iloc[t, :].values
            a[j, :] = actions.iloc[t, :].values
            sPrime[j, :] = states.iloc[t+1, :].values
            r[j] = artificialReward(states.iloc[t+1, :], a[j, :]) # reward based off goodness of next state
            
        dataBatches.append({
            's': torch.from_numpy(s).float(),
            'a': torch.from_numpy(a).float(),
            'r': torch.from_numpy(r).float(),
            'sPrime': torch.from_numpy(sPrime).float()
        })
            
    return dataBatches, stateSz, actionSz
""";

In [37]:
import importlib
importlib.reload(libPreprocess)
importlib.reload(libRewards)

def runTraining():    
    combinedStates  = pd.concat([game['expertStates']  for game in GAMES])
    combinedActions = pd.concat([game['expertActions'] for game in GAMES])
    
    dataBatches, stateSz, actionSz = libPreprocess.dataToBatches(combinedStates, combinedActions, BATCH_SZ, includeRewards=True)
    qModel1 = SAtoV_Model(stateSz, actionSz, DEVICE)
    qModel2 = SAtoV_Model(stateSz, actionSz, DEVICE)
    qModel2.load_state_dict(qModel1.state_dict())
    qModel2.eval()
    
    train_behavioral_cloning(dataBatches, qModel1, qModel2)
    #qModel1.save("dqn")
    return qModel1

model = runTraining()


37619 rows, 29 state dim, 3 action dim, into 377 batches of size 100


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


[epoch    1/500] [iter     377] [loss 1.48407] [av. R 0.88126] [av. Q -0.21951]
[epoch    2/500] [iter     754] [loss 1.02454] [av. R 0.86937] [av. Q -0.22306]
[epoch    3/500] [iter    1131] [loss 0.66379] [av. R 0.84855] [av. Q -0.21891]
[epoch    4/500] [iter    1508] [loss 0.44188] [av. R 0.87522] [av. Q -0.22152]
[epoch    5/500] [iter    1885] [loss 0.25523] [av. R 0.87161] [av. Q -0.22144]
[epoch    6/500] [iter    2262] [loss 0.16910] [av. R 0.88283] [av. Q -0.22291]
[epoch    7/500] [iter    2639] [loss 0.10631] [av. R 0.86441] [av. Q -0.22334]
[epoch    8/500] [iter    3016] [loss 0.07023] [av. R 0.86441] [av. Q -0.22334]
[epoch    9/500] [iter    3393] [loss 0.04131] [av. R 0.84238] [av. Q -0.21940]
[epoch   10/500] [iter    3770] [loss 0.04668] [av. R 0.87938] [av. Q -0.22160]
[epoch   11/500] [iter    4147] [loss 0.03632] [av. R 0.87036] [av. Q -0.22144]
[epoch   12/500] [iter    4524] [loss 0.02602] [av. R 0.85567] [av. Q -0.21858]
[epoch   13/500] [iter    4901] [loss 0.

[epoch  104/500] [iter   39208] [loss 0.02550] [av. R 0.86274] [av. Q 2.42104]
[epoch  105/500] [iter   39585] [loss 0.02642] [av. R 0.86349] [av. Q 2.42475]
[epoch  106/500] [iter   39962] [loss 0.03234] [av. R 0.85513] [av. Q 2.41235]
[epoch  107/500] [iter   40339] [loss 0.02906] [av. R 0.86346] [av. Q 2.41446]
[epoch  108/500] [iter   40716] [loss 0.02771] [av. R 0.88066] [av. Q 2.40792]
[epoch  109/500] [iter   41093] [loss 0.02097] [av. R 0.88399] [av. Q 2.41680]
[epoch  110/500] [iter   41470] [loss 0.02778] [av. R 0.86666] [av. Q 2.41721]
[epoch  111/500] [iter   41847] [loss 0.02346] [av. R 0.87109] [av. Q 2.41602]
[epoch  112/500] [iter   42224] [loss 0.02483] [av. R 0.86323] [av. Q 2.41653]
[epoch  113/500] [iter   42601] [loss 0.02949] [av. R 0.89210] [av. Q 2.41456]
[epoch  114/500] [iter   42978] [loss 0.02921] [av. R 0.88156] [av. Q 2.41856]
[epoch  115/500] [iter   43355] [loss 0.02349] [av. R 0.87109] [av. Q 2.41602]
[epoch  116/500] [iter   43732] [loss 0.02448] [av. 

[epoch  207/500] [iter   78039] [loss 0.02381] [av. R 0.87321] [av. Q 2.81417]
[epoch  208/500] [iter   78416] [loss 0.02617] [av. R 0.87029] [av. Q 2.80967]
[epoch  209/500] [iter   78793] [loss 0.02556] [av. R 0.86274] [av. Q 2.81589]
[epoch  210/500] [iter   79170] [loss 0.02852] [av. R 0.88675] [av. Q 2.81117]
[epoch  211/500] [iter   79547] [loss 0.02414] [av. R 0.88440] [av. Q 2.81131]
[epoch  212/500] [iter   79924] [loss 0.02596] [av. R 0.86274] [av. Q 2.81589]
[epoch  213/500] [iter   80301] [loss 0.03258] [av. R 0.86937] [av. Q 2.80881]
[epoch  214/500] [iter   80678] [loss 0.02468] [av. R 0.86192] [av. Q 2.81126]
[epoch  215/500] [iter   81055] [loss 0.03064] [av. R 0.84349] [av. Q 2.80906]
[epoch  216/500] [iter   81432] [loss 0.02030] [av. R 0.88687] [av. Q 2.81732]
[epoch  217/500] [iter   81809] [loss 0.02603] [av. R 0.87029] [av. Q 2.80967]
[epoch  218/500] [iter   82186] [loss 0.02997] [av. R 0.85049] [av. Q 2.81724]
[epoch  219/500] [iter   82563] [loss 0.02651] [av. 

[epoch  310/500] [iter  116870] [loss 0.02501] [av. R 0.90166] [av. Q 2.88715]
[epoch  311/500] [iter  117247] [loss 0.03439] [av. R 0.85294] [av. Q 2.88706]
[epoch  312/500] [iter  117624] [loss 0.02563] [av. R 0.89996] [av. Q 2.88947]
[epoch  313/500] [iter  118001] [loss 0.02640] [av. R 0.88375] [av. Q 2.89199]
[epoch  314/500] [iter  118378] [loss 0.02443] [av. R 0.87196] [av. Q 2.89075]
[epoch  315/500] [iter  118755] [loss 0.02533] [av. R 0.85529] [av. Q 2.88270]
[epoch  316/500] [iter  119132] [loss 0.02864] [av. R 0.86420] [av. Q 2.89152]
[epoch  317/500] [iter  119509] [loss 0.03298] [av. R 0.88352] [av. Q 2.88888]
[epoch  318/500] [iter  119886] [loss 0.02469] [av. R 0.86546] [av. Q 2.88451]
[epoch  319/500] [iter  120263] [loss 0.02706] [av. R 0.88004] [av. Q 2.88950]
Copying Q weights
[epoch  320/500] [iter  120640] [loss 0.02401] [av. R 0.86716] [av. Q 2.88869]
[epoch  321/500] [iter  121017] [loss 0.02129] [av. R 0.88636] [av. Q 2.89045]
[epoch  322/500] [iter  121394] [l

[epoch  413/500] [iter  155701] [loss 0.02929] [av. R 0.86217] [av. Q 2.90040]
[epoch  414/500] [iter  156078] [loss 0.02584] [av. R 0.86359] [av. Q 2.90064]
[epoch  415/500] [iter  156455] [loss 0.02670] [av. R 0.88654] [av. Q 2.89761]
[epoch  416/500] [iter  156832] [loss 0.02620] [av. R 0.89478] [av. Q 2.89663]
[epoch  417/500] [iter  157209] [loss 0.02902] [av. R 0.85571] [av. Q 2.89986]
[epoch  418/500] [iter  157586] [loss 0.02812] [av. R 0.86519] [av. Q 2.90193]
[epoch  419/500] [iter  157963] [loss 0.02977] [av. R 0.85705] [av. Q 2.90184]
Copying Q weights
[epoch  420/500] [iter  158340] [loss 0.03598] [av. R 0.86210] [av. Q 2.90311]
[epoch  421/500] [iter  158717] [loss 0.02772] [av. R 0.86636] [av. Q 2.90050]
[epoch  422/500] [iter  159094] [loss 0.02827] [av. R 0.85902] [av. Q 2.90415]
[epoch  423/500] [iter  159471] [loss 0.02133] [av. R 0.87037] [av. Q 2.90056]
[epoch  424/500] [iter  159848] [loss 0.02676] [av. R 0.87724] [av. Q 2.90129]
[epoch  425/500] [iter  160225] [l