Observation: 
Cart Position: -0.00713985 (close to the center of the track).
Cart Velocity: -0.04509013 (the cart is moving slightly to the left).
Pole Angle: 0.01458127 (the pole is nearly upright).
Pole Velocity at Tip: 0.04600234 (the pole is tilting to one side at a small rate)

In [12]:
# env.step(1) returns (observation, reward, terminated, truncated, info).
# Below is the result of performing action 1 (push the cart to the right); the episode is not done yet.

(array([ 0.02323877, -0.00714208,  0.04272482,  0.04248186], dtype=float32),
 1.0,
 False,
 False,
 {})

In [3]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

In [66]:
class DeepNeural(nn.Module):
    """Fully connected network that maps a state to one value per action.

    Architecture: input_dim -> hidden_dim -> hidden_dim -> n_actions, with a
    ReLU after each of the first two linear layers and no activation on the
    output layer.

    Args:
        n_actions: number of output values (one per action).
        input_dim: number of values in a state/observation.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, n_actions, input_dim, hidden_dim):
        super().__init__()
        self.action, self.hidden_dim = n_actions, hidden_dim
        # Layers are created in input -> output order.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, n_actions)

    def forward(self, state):
        """Return the raw output-layer values for `state`.

        `state` is the observation tensor; its last dimension must be
        `input_dim`. The result has `n_actions` values along its last
        dimension (not `input_dim` values).
        """
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        return self.fc3(hidden)

In [7]:
# Smoke test: build a network with 2 actions, 4 state values and 128 hidden
# units, then run one forward pass on a dummy state.
model = DeepNeural(2, 4, 128)
state = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
# Call the module itself rather than .forward() so that nn.Module.__call__
# (and any registered hooks) runs.
model(state)

tensor([0., 0.], grad_fn=<ReluBackward0>)

In [71]:
# Create an agent that wraps a neural net: the net takes a state as input and returns one value per action
class Agent:
    """Epsilon-greedy Q-learning agent with a fixed-size circular replay memory.

    Transitions (state, action, new_state, reward, done) are written into
    pre-allocated numpy arrays; `learn` samples a random mini-batch from them
    and performs one optimizer step on `Q_net`.

    Args:
        epsilon: probability of picking a random action in `choose_action`.
        discount_factor: weight applied to the best next-state value in the
            learning target.
        lr: learning rate of the Adam optimizer.
        input_dims: number of values in a state/observation.
        hidden_dim: hidden width passed to `DeepNeural`.
        batch_size: number of transitions sampled per `learn` call.
        n_actions: number of discrete actions.
        memory_size: capacity of the replay memory; the oldest entries are
            overwritten once it is full.
    """

    def __init__(self, epsilon, discount_factor, lr, input_dims, hidden_dim, batch_size, n_actions, memory_size = 10000):
        # e.g. [0, 1]; used to draw a random action during exploration.
        self.actions_space = list(range(n_actions))
        self.batch_size = batch_size
        self.current_memory = 0  # total number of transitions stored so far
        self.memory_size = memory_size
        self.epsilon = epsilon
        self.discount_factor = discount_factor

        self.state_memory = np.zeros((memory_size, input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((memory_size, input_dims), dtype=np.float32)
        # float32 so that fractional rewards are not truncated to integers.
        self.reward_memory = np.zeros(memory_size, dtype=np.float32)
        # int64 is the index dtype torch.gather expects.
        self.action_memory = np.zeros(memory_size, dtype=np.int64)
        self.terminal_memory = np.zeros(memory_size, dtype=bool)
        self.Q_net = DeepNeural(n_actions, input_dims, hidden_dim)
        self.optimizer = optim.Adam(self.Q_net.parameters(), lr = lr)
        self.loss = nn.MSELoss()

    def store_action(self, state, action, new_state, reward, done):
        '''Store one transition in the replay memory.

        The write position wraps around, so once `memory_size` transitions
        have been stored the oldest one is overwritten.
        '''
        idx = self.current_memory % self.memory_size
        self.state_memory[idx] = state
        self.new_state_memory[idx] = new_state
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done
        self.current_memory += 1

    def choose_action(self, state):
        '''Pick an action for `state` with an epsilon-greedy rule.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest network output is returned.
        Always returns a plain Python int.
        '''
        if np.random.random() > self.epsilon:
            # Cast to float32 (the layers' dtype) and add a batch dimension;
            # observations may arrive as float64 arrays or plain lists.
            state_t = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():  # inference only, no graph needed
                q_values = self.Q_net(state_t)
            action = torch.argmax(q_values).item()
        else:
            action = int(np.random.choice(self.actions_space))
        return action

    def learn(self):
        '''Run one gradient step on a random mini-batch from the replay memory.

        Does nothing until at least `batch_size` transitions have been stored.
        The target for each sampled transition is
        reward + discount_factor * max_a Q(new_state, a), where the max term
        is forced to 0 for terminal transitions.
        '''
        if self.current_memory < self.batch_size:
            return
        max_mem = min(self.current_memory, self.memory_size)
        # batch_size distinct indices into the filled part of the memory.
        sample_idx = np.random.choice(max_mem, self.batch_size, replace = False)

        state_batch = torch.as_tensor(self.state_memory[sample_idx], dtype=torch.float32)          # (batch, input_dims)
        new_state_batch = torch.as_tensor(self.new_state_memory[sample_idx], dtype=torch.float32)  # (batch, input_dims)
        reward_batch = torch.as_tensor(self.reward_memory[sample_idx], dtype=torch.float32)        # (batch,)
        action_batch = torch.as_tensor(self.action_memory[sample_idx], dtype=torch.int64)          # (batch,)
        terminal_batch = torch.as_tensor(self.terminal_memory[sample_idx], dtype=torch.bool)       # (batch,)

        # Value of the action that was actually taken in each sampled state.
        q_val = self.Q_net(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)  # (batch,)

        # The target is treated as a constant: no gradient may flow through it.
        with torch.no_grad():
            q_next = self.Q_net(new_state_batch)  # (batch, n_actions)
            # A terminal transition has no future, so its bootstrap term is 0
            # and the target reduces to the reward alone.
            q_next[terminal_batch] = 0.0
            # torch.max(..., dim=1) returns (values, indices); [0] keeps the values.
            q_target = reward_batch + self.discount_factor * torch.max(q_next, dim=1)[0]  # (batch,)

        self.optimizer.zero_grad()
        loss = self.loss(q_val, q_target)  # nn.MSELoss takes (input, target)
        loss.backward()
        self.optimizer.step()


QUESTION: In the learn method, why do we set q_next[terminal_batch] = 0.0?

In [76]:
# Train the agent on CartPole for a fixed number of episodes and log the
# per-episode score together with the running average of the last 100 scores.
env = gym.make("CartPole-v0")
agent = Agent(epsilon=0.1, discount_factor=0.8, lr = 0.001, batch_size=64, n_actions=2, input_dims = 4, hidden_dim=128, memory_size=10000)
scores, eps_history = [], []
games = 500
for i in range(games):
    score = 0
    # reset() returns (observation, info); only the observation is needed here.
    observation, _ = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, terminated, truncated, info = env.step(action)
        score += reward
        # Store only `terminated` as the done flag: a truncated (time-limited)
        # episode did not actually fail, so its target should still bootstrap.
        agent.store_action(observation, action, next_observation, reward, terminated)
        agent.learn()
        observation = next_observation
        # The episode is over on either signal; the env must be reset afterwards.
        done = terminated or truncated
    scores.append(score)
    eps_history.append(agent.epsilon)
    average_score = np.mean(scores[-100:])
    print(f'episode: {i}\tscore: {score}\taverage score: {average_score}\t epsilon: {agent.epsilon}')

# Why do we track the average score? The score of a single episode can go up and down, but the average score tends to increase as the agent learns.

episode: 0	score: 14.0	average score: 14.0	 epsilon: 0.1
episode: 1	score: 10.0	average score: 12.0	 epsilon: 0.1
episode: 2	score: 10.0	average score: 11.333333333333334	 epsilon: 0.1
episode: 3	score: 10.0	average score: 11.0	 epsilon: 0.1
episode: 4	score: 9.0	average score: 10.6	 epsilon: 0.1
episode: 5	score: 9.0	average score: 10.333333333333334	 epsilon: 0.1
episode: 6	score: 9.0	average score: 10.142857142857142	 epsilon: 0.1
episode: 7	score: 9.0	average score: 10.0	 epsilon: 0.1
episode: 8	score: 14.0	average score: 10.444444444444445	 epsilon: 0.1
episode: 9	score: 12.0	average score: 10.6	 epsilon: 0.1
episode: 10	score: 10.0	average score: 10.545454545454545	 epsilon: 0.1
episode: 11	score: 9.0	average score: 10.416666666666666	 epsilon: 0.1
episode: 12	score: 15.0	average score: 10.76923076923077	 epsilon: 0.1
episode: 13	score: 11.0	average score: 10.785714285714286	 epsilon: 0.1
episode: 14	score: 43.0	average score: 12.933333333333334	 epsilon: 0.1
episode: 15	score: 5

KeyboardInterrupt: 