### Example of Reinforcement Learning 

Linear approximation for policy evaluation. It follows the same process as deep Q-learning, but the network is a single linear layer: it takes the flattened state as input and outputs one Q-value per action. Exploration is epsilon-greedy.

In [3]:
import torch.nn as nn
import torch
import torch.optim as optim
from q1_schedule import LinearExploration, LinearSchedule
from utils.test_env import EnvTest
import numpy as np

class Critic(nn.Module):
    """Linear action-value approximator producing one Q-value per action.

    The network is a single affine layer mapping a flattened state vector of
    length ``dim_input`` to ``action_size`` Q-values. The Adam optimizer and
    the mean-squared-error loss are stored on the module so a training loop
    can reach them through the model object.

    Args:
        dim_input: Length of the flattened state vector.
        alpha: Learning rate handed to Adam.
        action_size: Number of discrete actions (number of outputs).
    """

    def __init__(self, dim_input, alpha=1e-04, action_size=5):
        super(Critic, self).__init__()
        # One output per action: the layer predicts Q(s, a) for every a at once.
        self.linear = nn.Linear(dim_input, action_size)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha, weight_decay=1e-5)
        self.loss_fun = nn.MSELoss()

    def forward(self, state):
        """Return the Q-values of every action for ``state``.

        Args:
            state: Flattened state as a NumPy array, a tensor, or any
                array-like accepted by ``torch.as_tensor``.

        Returns:
            Tensor whose last dimension has ``action_size`` entries.
        """
        # Follow the layer's own dtype/device instead of hard-coding float64,
        # so the module works both before and after ``.double()`` / ``.to()``.
        weight = self.linear.weight
        x = torch.as_tensor(state, dtype=weight.dtype, device=weight.device)
        return self.linear(x)

## The action is always one of 0..4 (five discrete actions).

class Agent():
    """Q-learning agent with a linear Q-function and epsilon-greedy exploration.

    The environment must expose ``observation_space.shape``,
    ``action_space.sample()``, ``reset()`` returning a NumPy state, and
    ``step(action)`` returning ``(next_state, reward, done, info)``.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        env: Environment to interact with.
        iters: Number of episodes to run in ``train``.
        epsilon: Initial exploration probability; the schedule decays from
            this value down to ``eps_end``.
        nsteps: Number of environment steps over which epsilon is annealed.
    """

    def __init__(self, gamma, env, iters=50, epsilon=1, nsteps=50):
        self.gamma = gamma
        self.nsteps = nsteps
        # The schedule starts at the caller-supplied epsilon (1 by default).
        self.eps_begin = epsilon
        self.eps_end = 0.1

        # The critic consumes the flattened observation, so its input size is
        # the product of the observation dimensions.
        shape = 1
        for i in env.observation_space.shape:
            shape *= i
        model = Critic(shape)

        self.model = model.double()
        self.iters = iters
        self.env = env
        self.epsilon = epsilon
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _epsilon_at(self, t):
        """Return epsilon for global step ``t``: linear decay, then constant."""
        if t < self.nsteps:
            return self.eps_begin - (self.eps_begin - self.eps_end) * (t / self.nsteps)
        return self.eps_end

    def pickActionEgreedy(self, best):
        """Return a random action with probability epsilon, else ``best``."""
        if np.random.rand() < self.epsilon:
            # Use the agent's own environment rather than a module-level global.
            return self.env.action_space.sample()
        return best

    def qValues(self, state):
        """Return the Q-values of all actions for ``state`` as a NumPy array."""
        with torch.no_grad():
            qs = self.model(np.asarray(state).flatten())
        return qs.detach().cpu().numpy()

    def train(self):
        """Run ``iters`` episodes of online Q-learning.

        After every environment step the critic is updated towards the
        one-step TD target ``r + gamma * max_a' Q(s', a')`` (with the
        bootstrap term dropped on terminal transitions).

        Returns:
            List with the undiscounted total reward of each episode. The list
            is also shown with ``display`` when available, else printed.
        """
        rewards_list = []
        t = 0  # global step counter driving the epsilon schedule
        self.model.train()

        for _ in range(self.iters):
            total_rewards = 0
            s = self.env.reset()
            done = False

            while not done:
                t += 1
                self.epsilon = self._epsilon_at(t)

                # A single forward pass serves both action selection and the
                # prediction that is regressed onto the TD target.
                q_values = self.model(s.flatten())
                best_action = torch.argmax(q_values.detach()).item()
                action = self.pickActionEgreedy(best_action)

                s_next, reward, done, _ = self.env.step(action)
                total_rewards += reward

                # The target is treated as a constant (semi-gradient update):
                # no gradient may flow through the bootstrapped value.
                with torch.no_grad():
                    next_q = torch.max(self.model(s_next.flatten()))
                    reward_t = torch.as_tensor(
                        reward, dtype=next_q.dtype, device=next_q.device
                    )
                    target = reward_t + next_q * (self.gamma * (1 - int(done)))

                pred = q_values[action]
                loss = self.model.loss_fun(pred, target)

                self.model.optimizer.zero_grad()
                loss.backward()
                self.model.optimizer.step()
                s = s_next

            rewards_list.append(total_rewards)

        # ``display`` only exists inside IPython/Jupyter; fall back to print.
        try:
            display(rewards_list)
        except NameError:
            print(rewards_list)
        return rewards_list

if __name__ == '__main__':
    # Build the test environment; the meaning of the (10, 20) argument is
    # defined by EnvTest -- TODO confirm against utils.test_env.
    env = EnvTest((10, 20))

    # Discount factor 0.9; every other Agent setting keeps its default.
    agent = Agent(gamma=0.9, env=env)
    agent.train()
    
    
    # train model
    #model = Linear(env, config)
    #model.run(exp_schedule, lr_schedule)


[0.7,
 0.1,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.2,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.30000000000000004,
 0.5,
 0.5,
 0.5,
 0.5,
 0.2,
 0.30000000000000004,
 0.20000000000000004,
 0.5,
 0.2,
 0.5,
 0.5,
 0.5,
 0.30000000000000004,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5,
 0.30000000000000004,
 0.5,
 0.5,
 0.2,
 0.5,
 0.5,
 0.5,
 0.5,
 0.5]

In [71]:
# Scratch check: np.amax over a list wrapping one array reduces over every element.
np.amax([np.array([1,2,3,4])])

4

In [73]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [364]:
# Scratch check: show a 3x3 array next to its row-major flattened form.
ini_array1 = np.array([
    [1, 2, 3],
    [2, 4, 5],
    [1, 2, 3],
])
print(ini_array1, ini_array1.flatten(), sep='\n')

[[1 2 3]
 [2 4 5]
 [1 2 3]]
[1 2 3 2 4 5 1 2 3]


In [347]:
# Scratch check: iterating a one-element tuple runs the loop body exactly once.
for i in (5,):
    print (i)

5
