In [5]:
import gym
import torch
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import time


def clamp(value, min_value, max_value):
    return max(min(value, max_value), min_value)

class MountainCar:
    def __init__(self,  maxStep=500):
        self.maxStep = maxStep
        self.curStep = 0
        self.pxbound = (-1.2, 0.5)
        self.vxbound = (-0.07, 0.07)

    def state_space(self):
        return (self.pxbound, self.vxbound)
    
    def action_shape(self):
        return 3
    
    def reset(self):
        self.curStep = 0
        self.px = random.random()*0.2 - 0.6        
        self.vx = 0
        return (self.px, self.vx), ""
    
    def step(self, action):
        #assert(0 <= action and action <=2)
        acc = (action - 1.0)*0.001
        self.vx = clamp(self.vx + acc - 0.0025*math.cos(3 * self.px), *self.vxbound)
        self.px = clamp(self.px + self.vx, *self.pxbound)
        if self.px == self.pxbound[0]:
            self.vx = 0
        terminated = self.px == self.pxbound[1]
        self.curStep += 1
        truncated = self.curStep >= self.maxStep
        return (self.px, self.vx), -1, terminated, truncated, ""

class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super().__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = torch.nn.functional.relu(self.fc1(x))
        #x = torch.nn.functional.log_softmax(self.fc2(x), 1)
        x = self.fc2(x)
        return x
    
class REINFORCE:
    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma, device):
        self.policy_net = PolicyNet(state_dim, hidden_dim, action_dim)
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr = learning_rate)
        self.gamma = gamma
        self.device = device

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = torch.nn.functional.softmax(self.policy_net(state),dim=1)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, states, actions, rewards):
        self.optimizer.zero_grad()
        g = 0
        for i in reversed(range(len(rewards))):
            state = torch.tensor(states[i],dtype=torch.float).view(1,-1).to(self.device)
            action = torch.tensor(actions[i]).view(1,-1).to(self.device)
            reward = rewards[i]
            #output = self.policy_net(state)
            output = torch.nn.functional.log_softmax(self.policy_net(state),dim=1)            
            log_prob = output.gather(1, action)
            g = g * self.gamma + reward
            loss = log_prob * (-g * math.pow(self.gamma, i))
            loss.backward()
        self.optimizer.step()
    
    '''def update(self, states, actions, rewards):
        count = len(rewards)
        ngs = torch.zeros(count)
        g = 0
        for i in reversed(range(count)):
            g = g * self.gamma + rewards[i]
            ngs[i] = -g 

        states = torch.tensor(states, dtype=torch.float).to(self.device)
        actions = torch.tensor(actions).view(count, -1).to(self.device)
        ngs = ngs.view(count, -1).to(device)
        
        outputs = torch.nn.functional.log_softmax(self.policy_net(states),dim=1)
        #print(outputs.shape, actions.shape, ngs.shape)
        #loss = torch.sum(torch.log(outputs.gather(1, actions)) * ngs)
        loss = torch.sum(outputs.gather(1, actions) * ngs)
        #loss = (torch.nn.functional.cross_entropy(outputs.gather(1, actions), actions.to(torch.float))*ngs).sum()
        #loss = torch.nn.functional.cross_entropy(outputs, actions) * ngs
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()'''

hidden_dim = 128

lr = 1e-3
gamma = 0.98
device = torch.device("cpu") #torch.device("cuda") if torch.cuda.is_available () else torch.device("cpu")
state_dim = 2
action_dim = 3


def train(agent, num_episodes):    
    reward_stat = np.zeros(num_episodes)
    env = MountainCar()
    start_time = time.time()
    total_steps = 0
    max_reward = -100000 
    for episode in range(num_episodes):
        state, info = env.reset()
        total_reward = 0
        states=[]
        actions=[]
        rewards=[]
        while True:
            total_steps += 1
            action = agent.take_action(state)
            next_state, reward, terminated, truncated, info = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            state = next_state
            total_reward += reward
            if terminated or truncated:
                break
        agent.update(states,actions,rewards)
        new_max_reard = max_reward < total_reward
        if new_max_reard:
            max_reward = total_reward
        if new_max_reard or episode*10 % num_episodes == 0:
            duration = time.time() - start_time
            steps_per_second = total_steps/duration
            print("episode:", episode, "total_reward:",total_reward, "step/second:", steps_per_second)
        reward_stat[episode] = total_reward
    return total_steps,reward_stat


In [6]:
agent = REINFORCE(state_dim, hidden_dim, action_dim, lr, gamma, device)
total_steps, reward_stat = train(agent, 1000)
print(total_steps)


episode: 0 total_reward: -500 step/second: 1744.4169394979904
episode: 100 total_reward: -500 step/second: 1640.192907960816
episode: 200 total_reward: -500 step/second: 1629.417752308192
episode: 300 total_reward: -500 step/second: 1617.3999043185195
episode: 400 total_reward: -500 step/second: 1611.597480479977
episode: 500 total_reward: -500 step/second: 1609.6150735223455
episode: 600 total_reward: -500 step/second: 1611.693696183803


KeyboardInterrupt: 