# GYM Mountain Car v0

In [1]:
import gym

In [2]:
# Smoke test: create the environment and take a few random actions.
env = gym.make('MountainCar-v0')
try:
    state = env.reset()
    for _ in range(3):
        # env.render()  # uncomment to watch the rollout
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
finally:
    # Release the environment even if reset()/step() raises.
    env.close()

## Q-learning

Initialize $Q(s, a)$ arbitrarily for all states $s$ and actions $a$, with $Q(\text{terminal}, \cdot) = 0$
 
repeat (for each `episode`):
         
> Initialize $S$
>
> repeat (for each `step` in `episode`):
>
> > choose $A$ from $S$ using $\epsilon$-greedy policy derived from $Q$
> >
> > take $A$, observe $R$, $S'$
> >
> > 
> > $Q(S, A) \leftarrow Q(S, A) + \alpha \left[ R + \gamma \max_a Q(S', a) - Q(S, A) \right]$
> >
> > $S \leftarrow S'$ 
> >
> until $S$ is terminal

### Function Approximation

In [3]:
import torch

In [13]:
class DQN(torch.nn.Module):
    """Small fully connected network mapping a state to one Q-value per action.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear, all in float64.

    Args:
        state_dim: Number of input features (size of the state vector).
        hidden_dim: Width of the two hidden layers.
        n_actions: Number of outputs, one Q-value per action.
    """

    def __init__(self, state_dim=2, hidden_dim=32, n_actions=3):
        super().__init__()
        self.layer1 = torch.nn.Linear(state_dim, hidden_dim, dtype=torch.float64)
        self.layer2 = torch.nn.Linear(hidden_dim, hidden_dim, dtype=torch.float64)
        self.layer3 = torch.nn.Linear(hidden_dim, n_actions, dtype=torch.float64)

    def forward(self, state):
        """Return the Q-values for ``state``.

        Args:
            state: Array-like, NumPy array or tensor whose last dimension has
                ``state_dim`` entries; converted to a float64 tensor.

        Returns:
            A float64 tensor whose last dimension has ``n_actions`` entries.
        """
        # as_tensor (unlike torch.tensor) does not copy or detach an input that
        # is already a tensor, and does not emit the copy-construct warning.
        x = torch.as_tensor(state, dtype=torch.float64)
        x = torch.nn.functional.relu(self.layer1(x))
        x = torch.nn.functional.relu(self.layer2(x))
        return self.layer3(x)


dqn = DQN()

In [14]:
import numpy as np

In [16]:
# --- Deep-learning setup -----------------------------------------------------
# With a single scalar prediction, the summed squared error is simply the
# squared TD error of the transition.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(dqn.parameters())
# Every scheduler.step() multiplies the learning rate by 0.95.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)


# --- Reinforcement-learning setup --------------------------------------------
epsilon = 0.5            # probability of taking a random (exploratory) action
gamma = 0.9              # discount factor
GOAL_X = 0.5             # state[0] at or beyond this value counts as a win
LOG_EVERY = 100          # episodes per progress report
LR_DECAY_EVERY = 2000    # episodes between learning-rate decay checks
env = gym.make('MountainCar-v0')

episodes = []
state = env.reset()
loss_sum = 0
wins = 0
max_steps = 100 * 1000   # number of training episodes (name kept as-is)

try:
    for episode in range(max_steps):
        max_x = -2
        while True:
            # Epsilon-greedy action selection.
            if np.random.sample() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    action = dqn(state).argmax().item()

            new_state, reward, done, info = env.step(action)

            # Reward shaping: bonus for changing state[1] (presumably the
            # velocity -- TODO confirm), plus an extra +1 on reaching the goal.
            reached_goal = new_state[0] >= GOAL_X
            reward = 300 * abs(new_state[1] - state[1]) + reward
            if reached_goal:
                reward += 1
            max_x = max(max_x, new_state[0])

            # Q-learning update. It runs for *every* transition, including the
            # last one of an episode, so the goal bonus is actually learned.
            if done and reached_goal:
                # True terminal state: nothing to bootstrap from.
                target = reward
            else:
                # Also used when `done` is set without reaching the goal --
                # presumably a time-limit cut-off rather than a real terminal
                # state (verify against the installed gym version).
                with torch.no_grad():
                    target = reward + gamma * dqn(new_state).max().item()
            prediction = dqn(state)[action]

            loss = criterion(prediction, torch.tensor(target, dtype=torch.float64))
            loss_sum += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            state = new_state
            if not done:
                continue

            # ---- end-of-episode bookkeeping ----
            if reached_goal:
                epsilon *= 0.99   # explore less after each success
                wins += 1
            state = env.reset()
            episodes.append(episode)

            # Decay the learning rate at most once per LR_DECAY_EVERY episodes,
            # and only if the current reporting window contained a win. This
            # must be evaluated before `wins` is reset below.
            # NOTE(review): intended schedule inferred -- confirm.
            if (episode + 1) % LR_DECAY_EVERY == 0 and wins > 0:
                scheduler.step()

            if (episode + 1) % LOG_EVERY == 0:
                print(f"{episode + 1}, wins={wins/LOG_EVERY:.0%}, x={max_x:.2f}, loss={loss_sum/LOG_EVERY:.4f}")
                wins = 0
                loss_sum = 0
            break
finally:
    # Runs on KeyboardInterrupt too, so an interrupted run still closes the env.
    env.close()

100, wins=0%, x=-0.41, loss=125.4814
200, wins=0%, x=-0.49, loss=2.3552
300, wins=0%, x=-0.48, loss=1.9147
400, wins=0%, x=-0.50, loss=1.1063
500, wins=0%, x=-0.27, loss=2.7254
600, wins=0%, x=-0.14, loss=2.4328
700, wins=0%, x=-0.36, loss=1.4183
800, wins=0%, x=-0.33, loss=1.2137
900, wins=0%, x=-0.31, loss=1.1172
1000, wins=0%, x=-0.30, loss=1.4393


KeyboardInterrupt: 

In [None]:
# env = gym.make('MountainCar-v0')
# state = env.reset()
# done = False
# while not done:
#     env.render()
#     action = dqn(state).argmax().item()
#     state, reward, done, info = env.step(action)
# env.close()