# Deep Q Network

Train a Q(s, a) function, using trajectories of (s, a, r, s', a', r', s'', ...) from matches.

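Training uses a memory of previous (s, a, r, s') transitions and double Q networks: one network is trained, while a copy of it, whose weights are refreshed periodically, is used to compute the target values.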

In [1]:
# PARAMS
import os

# Index (into the replay's player list) of the player treated as the expert.
EXPERT_PLAYER_IDX = 0
USE_GPU = False

GAMES = [{"id": gameID} for gameID in ["noBoost1v1_1", "noBoost1v1_2", "noBoost1v1_3"]]

# NOTE: can't use __file__ in a jupyter notebook unfortunately, so the project root is
# configured explicitly instead. Set the CPSC533V_PROJECT_ROOT environment variable to
# run the notebook from a different checkout; otherwise the original path is used.
ROOT_PROJECT_PATH = os.environ.get('CPSC533V_PROJECT_ROOT', 'C:/Users/User/code/CPSC533V/project')

In [2]:
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import sys
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
# Use CUDA only when it was requested AND is actually available; otherwise run on the CPU
# instead of failing later on the first `.to(DEVICE)` call.
_cudaUsable = USE_GPU and torch.cuda.is_available()
if USE_GPU and not _cudaUsable:
    print("USE_GPU is set but CUDA is not available; falling back to CPU.")
DEVICE = torch.device("cuda" if _cudaUsable else "cpu")

# Load local reusable code within 'lib/'
sys.path.append(ROOT_PROJECT_PATH)
import lib.files as libFiles
import lib.preprocess as libPreprocess
import lib.rewards as libRewards
from lib.SAtoVModel import SAtoV_Model

%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)

In [3]:
%%javascript
// Replace the output area's scroll check with one that always answers "no".
// Presumably this keeps long cell outputs fully expanded rather than in a scroll box.
IPython.OutputArea.prototype._should_scroll = (lines) => false;

<IPython.core.display.Javascript object>

## Load and preprocess all replays

In [4]:
# Parse every replay listed in GAMES and attach the cleaned per-player / ball data
# plus the expert's state and action tables to the same dict.
for game in GAMES:
    replay = libFiles.parseReplayToGameData(game['id'])
    game['data'] = replay
    libFiles.cleanAndDisplayGameData(replay)

    # Clean states and actions player by player, keeping both lists in player order.
    perPlayerStates, perPlayerActions = [], []
    for p in replay.players:
        perPlayerStates.append(libPreprocess.cleanPlayerStates(p.data))
        perPlayerActions.append(libPreprocess.cleanPlayerActions(p.controls))
    game['playerStates'] = perPlayerStates
    game['playerActions'] = perPlayerActions
    game['ballStates'] = libPreprocess.cleanBallStates(replay.ball)
    print("Game data from '%s' preprocessed." % game['id'])
    print("    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys\n")

    # Extract the tables for the player selected by EXPERT_PLAYER_IDX.
    expertStates, expertActions = libPreprocess.stateAndActionsForPlayer(game, EXPERT_PLAYER_IDX)
    game['expertStates'] = expertStates
    game['expertActions'] = expertActions
    print("State and action maps for player %s" % game['data'].players[EXPERT_PLAYER_IDX].name)
    for table in (game['expertStates'], game['expertActions']):
        print(table.values.shape, table.values.dtype)
    print("=======\n\n")

Loading...
	replays\noBoost1v1_1.replay


Could not find field_of_view in camera settings for Sundown
Could not find height in camera settings for Sundown
Could not find pitch in camera settings for Sundown
Could not find distance in camera settings for Sundown
Could not find stiffness in camera settings for Sundown
Could not find swivel_speed in camera settings for Sundown
Could not find transition_speed in camera settings for Sundown
  rhs[1] / (T_p + np.sign(rhs[1]) * omega[1] * D_p),
  rhs[2] / (T_y - np.sign(rhs[2]) * omega[2] * D_y)


2 players loaded!

Orange team:
	bot

Blue team:
	expert

12921 data points acquired
====


Game data from 'noBoost1v1_1' preprocessed.
    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys

State and action maps for player expert
(12921, 29) float64
(12921, 3) object


Loading...
	replays\noBoost1v1_2.replay


Could not find field_of_view in camera settings for Beast
Could not find height in camera settings for Beast
Could not find pitch in camera settings for Beast
Could not find distance in camera settings for Beast
Could not find stiffness in camera settings for Beast
Could not find swivel_speed in camera settings for Beast
Could not find transition_speed in camera settings for Beast


2 players loaded!

Orange team:
	bot

Blue team:
	expert

12016 data points acquired
====


Game data from 'noBoost1v1_2' preprocessed.
    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys

State and action maps for player expert
(12016, 29) float64
(12016, 3) object


Loading...
	replays\noBoost1v1_3.replay


Could not find field_of_view in camera settings for Middy
Could not find height in camera settings for Middy
Could not find pitch in camera settings for Middy
Could not find distance in camera settings for Middy
Could not find stiffness in camera settings for Middy
Could not find swivel_speed in camera settings for Middy
Could not find transition_speed in camera settings for Middy


2 players loaded!

Orange team:
	bot

Blue team:
	expert

12682 data points acquired
====


Game data from 'noBoost1v1_3' preprocessed.
    ... clean data available at 'playerStates', 'playerActions' and 'ballStates' keys

State and action maps for player expert
(12682, 29) float64
(12682, 3) object




In [7]:
# Training hyperparameters
TOTAL_EPOCHS = 300           # number of passes over the shuffled batches
BATCH_SZ = 100               # batch size handed to libPreprocess.dataToBatches
LOG_INTERVAL = 1000          # gradient steps between two logged loss values
LEARNING_RATE = 3e-5         # Adam learning rate
REGULARIZER_WEIGHT = 0.0003  # passed to Adam as weight_decay
GAMMA = 0.7                  # factor applied to the next-state Q value in the target
Q_SWAP_EPOCHS = 20           # epoch interval for copying weights into the second Q network

# TensorBoard writer that the training loop logs its loss values to.
# NOTE(review): `filename_suffix` presumably tags the event file name with "dqn" — confirm
# against the SummaryWriter documentation.
writer = SummaryWriter(filename_suffix="dqn")


def _meanAsScalar(values):
    """Return the mean of `values` as a NumPy scalar.

    Tensors are detached and copied to host memory first, so this also works
    when they live on a CUDA device or are attached to an autograd graph.
    """
    if torch.is_tensor(values):
        values = values.detach().cpu().numpy()
    return np.mean(values)


def train_behavioral_cloning(dataBatches, thisQ, nextQ):
    """Fit `thisQ` to one-step Q targets that are computed with `nextQ`.

    NOTE: the name is historical (kept so existing callers keep working); the
    update performed here is a DQN-style regression, not behavioral cloning.

    For every batch the target is `r + GAMMA * libRewards.bestQ(nextQ, s')`
    and `thisQ(cat(s, a))` is regressed onto it with an MSE loss using Adam.
    The loss is logged to the module-level TensorBoard `writer` every
    LOG_INTERVAL gradient steps, and one summary line is printed per epoch
    using the values of the last batch of that epoch.

    Args:
        dataBatches: sequence of dicts with the keys 's', 'a', 'r' and
            'sPrime'; every value must support `.to(DEVICE)` (presumably
            tensors with one row per sample — confirm against
            libPreprocess.dataToBatches).
        thisQ: network being optimized; updated in place.
        nextQ: network used to evaluate the next state; its weights are
            overwritten with those of `thisQ` every Q_SWAP_EPOCHS epochs.

    Returns:
        None. Both networks are modified in place.
    """
    # Adam optimizer usually a good default.
    optimizer = torch.optim.Adam(thisQ.parameters(), lr=LEARNING_RATE, weight_decay=REGULARIZER_WEIGHT)

    # MSE loss
    loss_function = torch.nn.MSELoss().to(DEVICE)

    gradient_steps = 0

    for epoch in range(1, TOTAL_EPOCHS + 1):
        # Visit the batches in a fresh random order each epoch (the input list is not mutated).
        batchShuffled = random.sample(dataBatches, len(dataBatches))
        # Placeholders that are printed if there was no batch to train on.
        lastLoss, lastAverageR, lastAverageQ = -1, -1, -1
        for iteration, data in enumerate(batchShuffled):
            data = {k: v.to(DEVICE) for k, v in data.items()}
            saCombined = torch.cat((data['s'], data['a']), dim=1)

            maxQCalc = libRewards.bestQ(nextQ, data['sPrime'], returnAction=False)
            y_j_torch = data['r'] + GAMMA * maxQCalc
            y_pred = thisQ(saCombined)

            # Gradient descent on MSE loss between predicted and calculated Q
            # NOTE(review): MSELoss broadcasts when the two shapes differ (e.g. (B,) vs (B, 1));
            # confirm that the target and the prediction have the same shape.
            loss = loss_function(y_j_torch, y_pred)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if gradient_steps % LOG_INTERVAL == 0:
                writer.add_scalar('loss', loss.item(), gradient_steps)

            gradient_steps += 1

            # Remember the statistics of the epoch's final batch for the summary line.
            if iteration == len(batchShuffled) - 1:
                lastLoss = loss.item()
                # Go through the host-copy helper so this also works when DEVICE is CUDA.
                lastAverageR = _meanAsScalar(data['r'])
                lastAverageQ = _meanAsScalar(maxQCalc)

        print ('[epoch {:4d}/{}] [iter {:7d}] [loss {:.5f}] [av. R {:.5f}] [av. Q {:.5f}]'.format(
           epoch, TOTAL_EPOCHS, gradient_steps, lastLoss, lastAverageR, lastAverageQ)
        )

        # `epoch` is 1-based, so this fires after every Q_SWAP_EPOCHS completed epochs.
        if epoch % Q_SWAP_EPOCHS == 0:
            nextQ.load_state_dict(thisQ.state_dict())
            nextQ.eval()
            print ("Copying Q weights")

    # Make sure the last logged scalars reach disk while the kernel is still running.
    writer.flush()

In [8]:
def runTraining(coefOverrides):
    """Train a Q network on the expert data of every game in GAMES.

    The expert state and action tables of all games are concatenated, split
    into batches (with rewards) by `libPreprocess.dataToBatches`, and used to
    train one network while a second, identically initialized network is
    passed along as the `nextQ` argument of the training loop.

    Args:
        coefOverrides: forwarded unchanged to `libPreprocess.dataToBatches`
            as its `coefOverrides` argument.

    Returns:
        The trained network (the first of the two that were created).
    """
    expertStateTables = [game['expertStates'] for game in GAMES]
    expertActionTables = [game['expertActions'] for game in GAMES]
    allStates = pd.concat(expertStateTables)
    allActions = pd.concat(expertActionTables)

    batches, nStateDims, nActionDims = libPreprocess.dataToBatches(
        allStates,
        allActions,
        BATCH_SZ,
        includeRewards=True,
        coefOverrides=coefOverrides,
    )

    # Build both networks, then start the second one as an exact copy of the first.
    trainedQ = SAtoV_Model(nStateDims, nActionDims, DEVICE)
    copiedQ = SAtoV_Model(nStateDims, nActionDims, DEVICE)
    copiedQ.load_state_dict(trainedQ.state_dict())
    copiedQ.eval()

    train_behavioral_cloning(batches, trainedQ, copiedQ)
    return trainedQ

# Train without any coefficient overrides, then save the resulting network.
model = runTraining(coefOverrides={})
model.save("dqn.all3Replays")


37619 rows, 29 state dim, 3 action dim, into 377 batches of size 100
[epoch    1/300] [iter     377] [loss 0.82059] [av. R 0.87967] [av. Q 0.21608]
[epoch    2/300] [iter     754] [loss 0.66440] [av. R 0.86693] [av. Q 0.21321]
[epoch    3/300] [iter    1131] [loss 0.52452] [av. R 0.84589] [av. Q 0.21603]
[epoch    4/300] [iter    1508] [loss 0.46243] [av. R 0.87347] [av. Q 0.22392]
[epoch    5/300] [iter    1885] [loss 0.37066] [av. R 0.86981] [av. Q 0.21471]
[epoch    6/300] [iter    2262] [loss 0.32905] [av. R 0.88221] [av. Q 0.22181]
[epoch    7/300] [iter    2639] [loss 0.24929] [av. R 0.86284] [av. Q 0.21883]
[epoch    8/300] [iter    3016] [loss 0.19997] [av. R 0.86284] [av. Q 0.21883]
[epoch    9/300] [iter    3393] [loss 0.14913] [av. R 0.83955] [av. Q 0.21623]
[epoch   10/300] [iter    3770] [loss 0.14165] [av. R 0.87774] [av. Q 0.21549]
[epoch   11/300] [iter    4147] [loss 0.11263] [av. R 0.86946] [av. Q 0.21760]
[epoch   12/300] [iter    4524] [loss 0.08587] [av. R 0.85320]

[epoch  103/300] [iter   38831] [loss 0.03553] [av. R 0.84608] [av. Q 2.69608]
[epoch  104/300] [iter   39208] [loss 0.03195] [av. R 0.86105] [av. Q 2.72548]
[epoch  105/300] [iter   39585] [loss 0.02838] [av. R 0.86121] [av. Q 2.71175]
[epoch  106/300] [iter   39962] [loss 0.04158] [av. R 0.85442] [av. Q 2.72222]
[epoch  107/300] [iter   40339] [loss 0.03185] [av. R 0.86171] [av. Q 2.68935]
[epoch  108/300] [iter   40716] [loss 0.03137] [av. R 0.87957] [av. Q 2.70890]
[epoch  109/300] [iter   41093] [loss 0.02390] [av. R 0.88340] [av. Q 2.69548]
[epoch  110/300] [iter   41470] [loss 0.03320] [av. R 0.86400] [av. Q 2.72300]
[epoch  111/300] [iter   41847] [loss 0.02865] [av. R 0.86900] [av. Q 2.71391]
[epoch  112/300] [iter   42224] [loss 0.02757] [av. R 0.86060] [av. Q 2.71361]
[epoch  113/300] [iter   42601] [loss 0.03333] [av. R 0.89087] [av. Q 2.72442]
[epoch  114/300] [iter   42978] [loss 0.03447] [av. R 0.87928] [av. Q 2.71272]
[epoch  115/300] [iter   43355] [loss 0.02871] [av. 

[epoch  206/300] [iter   77662] [loss 0.03869] [av. R 0.84132] [av. Q 2.98370]
[epoch  207/300] [iter   78039] [loss 0.02623] [av. R 0.87155] [av. Q 2.97897]
[epoch  208/300] [iter   78416] [loss 0.03092] [av. R 0.87022] [av. Q 2.95872]
[epoch  209/300] [iter   78793] [loss 0.02815] [av. R 0.86105] [av. Q 2.97011]
[epoch  210/300] [iter   79170] [loss 0.03173] [av. R 0.88580] [av. Q 2.97465]
[epoch  211/300] [iter   79547] [loss 0.02796] [av. R 0.88372] [av. Q 2.96911]
[epoch  212/300] [iter   79924] [loss 0.02815] [av. R 0.86105] [av. Q 2.97011]
[epoch  213/300] [iter   80301] [loss 0.03428] [av. R 0.86815] [av. Q 2.96002]
[epoch  214/300] [iter   80678] [loss 0.02677] [av. R 0.86024] [av. Q 2.97321]
[epoch  215/300] [iter   81055] [loss 0.03150] [av. R 0.84076] [av. Q 2.95854]
[epoch  216/300] [iter   81432] [loss 0.02182] [av. R 0.88553] [av. Q 2.97814]
[epoch  217/300] [iter   81809] [loss 0.03031] [av. R 0.87022] [av. Q 2.95872]
[epoch  218/300] [iter   82186] [loss 0.03247] [av. 

In [None]:
## Uncomment this code to run the forwards-penalized validation

# Penalize driving forwards, and not turning:
#forwardsPenaltyCoef = {}
#for key in libRewards.DEFAULT_COEF.keys():
#    forwardsPenaltyCoef[key] = 0.0
#forwardsPenaltyCoef['forwards']  = -2.0
#forwardsPenaltyCoef['action']    = 1.0
#forwardsPenaltyCoef['const']     = 2.0 # keep positive

#model = runTraining(forwardsPenaltyCoef)
#model.save("dqn.forwardPenalized")