In [1]:
import math
import datetime
import os, sys
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from IPython.display import Audio
import csv

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torchsummary import summary

import cv2




# local files
sys.path.insert(0, '../')
import pyClient
import utils
import model
from model import Transition

In [2]:
# Training parameters
BATCH_SIZE = 128  # handed to the agent as batch_size (original value: 128)
GAMMA = 0.999  # handed to the agent as gamma_discount
EPS_START = 0.95  # handed to the agent as eps_start
EPS_END = 0.05  # handed to the agent as eps_end
EPS_DECAY_steps = 4000  # number of equal decrements between EPS_START and EPS_END
EPS_DECAY = (EPS_START - EPS_END) / EPS_DECAY_steps  # handed to the agent as eps_delta
REPLAY_START_SIZE = 128  #TODO PUT BACK TO 1500 #steps taken
TARGET_UPDATE = 10  #episodes
# Prefer the first CUDA device, but fall back to the CPU so the notebook
# still runs on machines without a usable GPU instead of failing on the
# first tensor transfer.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Environment parameters
IMSIZE = 128  # passed to the environment as size and to the agent as imsize
STACK_SIZE = 1  # passed to the agent as in_channels
N_ACTIONS = 3  # passed to the agent as n_actions
IP = "127.0.0.1"  # Ip address that the TCP/IP interface listens to
PORT = 13000  # Port number that the TCP/IP interface listens to


# Client-side handle to the simulator and the agent that will be trained on it.
environment = pyClient.Environment(ip=IP, port=PORT, size=IMSIZE)
agent = model.DoubleDQNAgent(imsize=IMSIZE,
                             in_channels=STACK_SIZE,
                             n_actions=N_ACTIONS,
                             memory_capacity=12000,
                             eps_start=EPS_START,
                             eps_end=EPS_END,
                             eps_delta=EPS_DECAY,
                             gamma_discount=GAMMA,
                             batch_size=BATCH_SIZE,
                             device=DEVICE)



In [3]:
# Utilities
def process_state(state_raw):
    """Turn a raw environment state into a single-frame input tensor.

    The raw state is decoded with ``environment.state2usableArray``, converted
    with ``cv2.COLOR_BGR2GRAY``, cast to float32 and divided by 255.

    Args:
        state_raw: raw state as returned by the environment's ``reset``/``step``.

    Returns:
        A ``torch.Tensor`` viewed as ``(1, 1, environment.size, environment.size)``.
    """
    side = environment.size
    decoded = environment.state2usableArray(state_raw)
    gray = cv2.cvtColor(decoded, cv2.COLOR_BGR2GRAY)
    scaled = gray.astype('float32') / 255.
    return torch.Tensor(scaled).view(1, 1, side, side)

In [None]:
MAX_EPISODES = 5000  # upper bound on the number of episodes
MAX_STEPS = 10000  # training stops once agent.step_count exceeds this
TRAINING_CONDITION = 0  # forwarded unchanged to environment.reset
LOGFILE = 'Experiments/Out/test-01.csv'

# Maps the end signal returned by environment.step to "does the episode end?".
RESET_UPON_END_SIGNAL = {0:False,  # Nothing happened
                         1:True,   # Box collision
                         2:True,   # Wall collision
                         3:False}  # Reached step target

# Create the log directory up front so a fresh checkout does not fail with
# FileNotFoundError when the log file is opened.
log_dir = os.path.dirname(LOGFILE)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# Start a fresh log. newline='' is what the csv module expects of the files it
# writes to; without it, extra blank rows appear on Windows.
with open(LOGFILE, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['episode', 'step_count', 'train_loss', 'reward'])

lr_dqn = 0.01
optimizer = optim.Adam(agent.policy_net.parameters(), lr=lr_dqn)
# Running totals over the whole run; they are never reset between episodes.
total_reward = 0
total_loss = 0
for episode in range(MAX_EPISODES):
    # Log the running totals as they stand at the start of this episode.
    # The value order matches the header: train_loss first, then reward.
    with open(LOGFILE, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([episode, agent.step_count, total_loss, total_reward])

    if agent.step_count > MAX_STEPS:
        break

    if episode % TARGET_UPDATE == 0:  #episodes
        # The method has to be called: a bare `agent.update_target_net`
        # only looks the attribute up and leaves the target net untouched.
        agent.update_target_net()
        print('target net updated')

    print('episode {}'.format(episode))
    _, _, state_raw = environment.reset(TRAINING_CONDITION)
    state = process_state(state_raw).to(DEVICE)

    for t in count():
        action = agent.select_action(state)
        end, reward, next_state_raw = environment.step(action.item())
        done = RESET_UPON_END_SIGNAL[end]
        # Episode-ending transitions are stored with next_state=None.
        next_state = None if done else process_state(next_state_raw).to(DEVICE)
        # NOTE(review): rewards above 100 are turned into the penalty
        # -(reward - 100); presumably the environment encodes penalties as
        # 100 + magnitude -- confirm against the environment.
        if reward > 100:
            reward = -(reward - 100)
        reward = torch.tensor([reward], device=DEVICE, dtype=torch.float)
        total_reward += reward.item()
        action = action.unsqueeze(0)
        agent.memory.push(state, action, next_state, reward)

        if done:
            break

        if len(agent.memory) > REPLAY_START_SIZE:
            state_action_values, expected_state_action_values = agent.forward()

            # Compute Huber loss
            loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
            total_loss += loss.item()

            # Optimize the model
            optimizer.zero_grad()
            loss.backward()

            # Clip the global gradient norm to 1 before the optimizer step.
            nn.utils.clip_grad_norm_(agent.policy_net.parameters(), 1)

            optimizer.step()
        else:
            # Keep the agent's step counter at zero until the replay memory
            # holds more than REPLAY_START_SIZE transitions.
            agent.step_count = 0

target net updated
episode 0
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
target net updated
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
target net updated
episode 20
episode 21
episode 22
episode 23
episode 24
episode 25
episode 26
episode 27
episode 28
episode 29
target net updated
episode 30
episode 31
episode 32
episode 33
episode 34
episode 35
episode 36
episode 37
episode 38
episode 39
target net updated
episode 40
episode 41
episode 42
episode 43
episode 44
episode 45
episode 46
episode 47
episode 48
episode 49
target net updated
episode 50
episode 51
episode 52
episode 53
episode 54
episode 55
episode 56
episode 57
episode 58
episode 59
target net updated
episode 60
episode 61
episode 62
episode 63
episode 64
episode 65
episode 66
episode 67
episode 68
episode 69
target net updated
episode 70
episode 71
episode 72
episode 73
episode 74
episode 75
episode 76
episode 77
