# ai.py notes

### Importing the libraries

In [1]:
import numpy as np
import random
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable

### Creating the architecture of the Neural Network

* References
    - http://pytorch.org/docs/master/nn.html#containers
    - http://pytorch.org/docs/master/nn.html#linear-layers
    - http://pytorch.org/docs/master/nn.html#torch.nn.functional.relu
    - http://pytorch.org/docs/master/nn.html#non-linear-activations

In [2]:
class Network(nn.Module):
    '''
    Fully connected network with one hidden layer that maps a state vector to one Q-value per action
    '''

    def __init__(self, input_size, nb_action):
        '''
        Builds the two linear layers of the network

        input_size = number of features in the input state vector
        nb_action = number of actions, i.e. number of Q-values produced
        '''
        # nn.Module must be initialized before sub-modules are attached,
        # see http://pytorch.org/docs/master/nn.html#containers
        super(Network, self).__init__()

        # kept on the instance so the dimensions can be inspected later
        self.input_size = input_size
        self.nb_action = nb_action

        # width of the single hidden layer
        hidden_units = 30

        # input -> hidden and hidden -> output layers (both with bias, the nn.Linear default),
        # see http://pytorch.org/docs/master/nn.html#linear-layers
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, nb_action)

    def forward(self, state):
        '''
        Runs forward propagation and returns the Q-values

        state = input state for the neural network (a torch Variable)
        '''
        # hidden activations: first linear layer followed by the rectifier (ReLU),
        # see http://pytorch.org/docs/master/nn.html#non-linear-activations
        hidden = F.relu(self.fc1(state))

        # the output layer is left linear: its raw outputs are the Q-values
        return self.fc2(hidden)

### Implementing Experience Replay

* References
    - https://www.programiz.com/python-programming/methods/built-in/zip
    - http://pytorch.org/docs/master/torch.html?indexing-slicing-joining-mutating-ops#indexing-slicing-joining-mutating-ops

In [13]:
class ReplayMemory(object):
    '''
    Bounded first-in-first-out store of events used for experience replay
    '''

    def __init__(self, capacity):
        '''
        Initializes the ReplayMemory (a.k.a. ExperienceMemory) class

        capacity = maximum number of events kept in the replay memory
        '''
        self.capacity = capacity
        self.memory = []

    def push(self, event):
        '''
        Appends event to replay memory and ensures memory does not exceed the given capacity

        event = event tuple (state, state_prime, reward, action)
        '''
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            # over capacity: discard the oldest event
            del self.memory[0]

    def sample(self, batch_size):
        '''
        Fetches a random sample from replay memory, regroups it field by field
        and returns one concatenated torch Variable per event field

        batch_size = number of random samples to fetch

        Returns a lazy map object yielding one Variable per field of the event tuple.
        random.sample raises ValueError when batch_size exceeds the number of stored events.
        '''
        # as defined here https://www.programiz.com/python-programming/methods/built-in/zip
        # if list = [(1,2,3),(4,5,6)], then zip(*list) = [(1,4),(2,5),(3,6)]
        samples = zip(*random.sample(self.memory, batch_size))

        # concatenate every group of same-field tensors along the first dimension,
        # as defined here http://pytorch.org/docs/master/torch.html?indexing-slicing-joining-mutating-ops#indexing-slicing-joining-mutating-ops
        # (arguments are passed positionally because the keyword name of the sequence differs between PyTorch releases)
        return map(lambda var: Variable(torch.cat(var, 0)), samples)

### Zip function demonstration

In [14]:
# toy replay memory: six events with three fields each
example_memory = [(1,2,3), (4,5,6), (7,8,9), (10,11,12), (13,14,15), (16,17,18)]
# draw four distinct events at random, the same call ReplayMemory.sample makes on its memory
example_batch = random.sample(example_memory, 4)
# bare expression: displayed as the cell output
example_batch

[(1, 2, 3), (7, 8, 9), (16, 17, 18), (4, 5, 6)]

In [15]:
# regroup the batch field by field: the i-th tuple collects the i-th field of every event
samples = zip(*example_batch)
# materialize the zip result as a list so the grouped fields are displayed as the cell output
list(samples)

[(1, 7, 16, 4), (2, 8, 17, 5), (3, 9, 18, 6)]

### Network parameters demonstration

In [23]:
# build a small network (5 inputs, 3 actions) to inspect what it learns
net = Network(input_size=5, nb_action=3)
# list the learnable tensors (weights and biases of fc1 and fc2) as the cell output
list(net.parameters())

[Parameter containing:
  0.2680  0.1040  0.1754  0.0519  0.4036
 -0.2909 -0.3661  0.0451  0.0694 -0.1678
 -0.1794  0.4193 -0.3143  0.4197  0.0522
  0.0097 -0.1910 -0.0391  0.3963 -0.2185
 -0.3662 -0.2149 -0.1219  0.2423 -0.0812
 -0.3110  0.2229 -0.2322  0.2683 -0.4236
 -0.3746  0.3692  0.1135  0.4237  0.3748
  0.4119 -0.4135  0.3867 -0.2454 -0.0706
  0.2212 -0.1920 -0.0525  0.4148 -0.0283
 -0.1114 -0.4027  0.0401  0.0896 -0.3335
 -0.2436 -0.2807  0.3331  0.4094 -0.2917
  0.2656  0.1724  0.0951  0.3376 -0.1951
  0.2393 -0.2010  0.0043  0.1937  0.2133
  0.3713 -0.1818 -0.1340  0.4250  0.0696
 -0.2187 -0.1999  0.2647  0.2645  0.0349
 -0.2254 -0.0123  0.1156 -0.0751  0.3148
  0.0879  0.2973 -0.0681  0.0105 -0.1998
  0.1576 -0.3449 -0.2448 -0.1216  0.3219
  0.2405  0.0922  0.0080 -0.3896  0.2327
  0.2249 -0.2841  0.4381 -0.2988  0.3350
 -0.1683  0.2320 -0.0294  0.2497  0.0134
  0.2130 -0.1540  0.3665  0.2317 -0.2806
  0.3682  0.1374 -0.3794  0.1069  0.3271
 -0.3931  0.0649  0.2663 -0.1443  

### Implementing Deep Q Learning

* References
    - http://pytorch.org/docs/master/optim.html#torch.optim.Adam
    - https://arxiv.org/abs/1412.6980
    - http://pytorch.org/docs/master/torch.html#torch.unsqueeze
    - http://pytorch.org/docs/master/nn.html?highlight=softmax#torch.nn.Softmax

In [26]:
class Dqn(object):
    
    def __init__(self, input_size, nb_action, gamma, capacity=100000, lr=0.001):
        '''
        Initializing the Dqn object

        input_size = size of input vector
        nb_action = size of output vector
        gamma = the discount factor gamma in Q - learning
        capacity = maximum number of events kept in the replay memory (default 100000)
        lr = learning rate of the Adam optimizer (default 0.001)
        '''

        self.gamma = gamma

        # the reward window to house the latest rewards and calculating rolling mean
        self.reward_window = []

        # instance of Network class
        self.model = Network(input_size, nb_action)

        # instance of ReplayMemory class
        self.memory = ReplayMemory(capacity=capacity)

        # as defined here http://pytorch.org/docs/master/optim.html#torch.optim.Adam
        self.optimizer = optim.Adam(params=self.model.parameters(), lr=lr)

        # initial state is a zero vector with a leading batch dimension of 1 (shape (1, input_size)),
        # as defined here http://pytorch.org/docs/master/torch.html#torch.unsqueeze
        # torch.zeros is used because torch.Tensor(n) only allocates memory without initializing it,
        # so the first state could otherwise contain arbitrary values (including NaN)
        self.last_state = torch.zeros(input_size).unsqueeze(0)
        self.last_action = 0
        self.last_reward = 0.0
        
    def select_action(self, state):
        # sureity of neural network on action it decides to play 
        temperature = 7
        
        # as defined here http://pytorch.org/docs/master/nn.html?highlight=softmax#torch.nn.Softmax
        probs = F.softmax(self.model.forward(Variable(state, volatile=True)) * temperature)
        action = probs.multinomial()
        return action.data[0,0]
    
    def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
        outputs = self.model(batch_state).gather(1, batch_action)