In [1]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
from torch import optim
import gym
from tqdm import trange
from random import choice, sample
from collections import deque
import matplotlib.pyplot as plt
from copy import deepcopy
from torch.nn.functional import mse_loss
import cv2
import tqdm.notebook as tq

In [2]:
def mk_env():
    env = gym.make('MountainCar-v0')
    env.reset()
    return env

In [3]:
def mk_network():
    model = nn.Sequential(
      nn.Linear(2, 32),
      nn.ReLU(),
      nn.Linear(32, 32),
      nn.ReLU(),
      nn.Linear(32, 3)
    )
    return model

In [4]:
def play(model, env, device):
    observation = env.reset()
    score = 0
    while True:
        action =  model(torch.tensor(observation).to(device).float().unsqueeze(0)).argmax().item()
        observation, reward, done, _ = env.step(action)
        score += reward
        if done:
            return score

In [5]:
def q_learning(env, model, modify_reward, max_epsilon=1.0, min_epsilon=0.05, epsilon_step=1e-2, 
               gamma=0.99, batch_size=128, avaliable_actions=[0, 1, 2], amount_of_games = 200,
               game_max_length = 200, lr=0.0003, buff_len=10000, play_game=play, log_info=True,
               transform=lambda x: x, change_state=lambda old, new: new,
               batch_format=lambda x: x):
    if torch.cuda.is_available():  
      dev = "cuda" 
    else:  
      dev = "cpu"
    device = torch.device(dev)   

    observation = env.reset()
    epsilon = max_epsilon
    
    target_model = deepcopy(model)

    model.to(device)
    target_model.to(device)
  
    memory = deque(maxlen=buff_len)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    games_stat = []
    i = 0
    test_games_played = 0
    bar = tq.trange(amount_of_games * game_max_length)
    
    observation = transform(observation)
    observation = change_state(np.array([np.zeros((84, 84)) for i in range(4)]), observation)
    for i in bar:
        epsilon = max(min_epsilon, epsilon - epsilon_step)

        if np.random.rand() < epsilon:
            action = choice(avaliable_actions)
        else:
            with torch.no_grad():
                action = model(torch.tensor(batch_format(observation)).to(device).float()).argmax().item()
                
        new_observation, reward, done, _ = env.step(action)
        
        new_observation = transform(new_observation)
        new_observation = change_state(observation, new_observation)

        memory.appendleft((observation, action, modify_reward(reward, observation, new_observation), new_observation, done))
        
        if done:
            observation = env.reset()
            observation = transform(observation)
            observation = change_state(np.array([np.zeros((84, 84)) for i in range(4)]), observation)
            
            target_model = deepcopy(model)
            target_model.to(device)
            if test_games_played % 10 == 0:
              res = play_game(target_model, deepcopy(env), device)
              bar.set_description(str(res))
              games_stat.append(res)
            test_games_played += 1
            done = False
        else:
            observation = new_observation

        if len(memory) > batch_size:
            states, actions, rewards, next_states, dones = tuple(map(lambda x: torch.tensor(np.array(x)), zip(*sample(list(memory), batch_size))))
            
            actions = actions.long().to(device)
            rewards = rewards.int().to(device)
            states = states.float().to(device)
            dones = dones.int().to(device)
            next_states = next_states.float().to(device)
            
            with torch.no_grad():
                q_next = target_model((next_states).float()).max(dim=1)[0].view(-1)
                
            y_true = rewards + gamma  * q_next * dones
            y_pred = model((states).float()).gather(dim=1, index=actions.unsqueeze(1))

            output = mse_loss(y_pred, y_true.unsqueeze(1))
            
            optimizer.zero_grad() 
            output.backward()
            optimizer.step()
            
    return target_model, games_stat


        

          




      


     

In [None]:
all_models = []
all_rewards = []
for agent in range(9):
    model, rewards = q_learning(mk_env(), mk_network(), lambda reward, observation, new_observation: 
                                float(reward + 1000 * (0.9 * abs(new_observation[1]) - abs(observation[1]))))
    
    all_models.append(model)
    all_rewards.append(np.array(rewards))

In [None]:
all_rewards = np.mean(np.array(list(map(lambda x: x[:22], all_rewards))), axis=0)
plt.plot(all_rewards)
plt.xlabel('Game')
plt.ylabel('Mean reward')
plt.title('Game / Reward')
plt.show()

In [6]:
def mk_breakout_env():
    env = gym.make('Breakout-v0')
    env.reset()
    return env

In [7]:
def mk_breakout_network():
    cnn = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4)),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2)),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1)),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(in_features=64 * 7 * 7, out_features=512),
            nn.ReLU(),
            nn.Linear(512, 4)
    )
    return cnn

In [8]:
def transofrm_breakout_img(img):
    img = img[25:200]
    img = cv2.resize(img, (84, 84))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return np.expand_dims(img, axis=0)

In [11]:
def play_breakout(model, env, device):
    observation = env.reset()
    buffer = deque(maxlen=4)
    buffer.append(np.zeros((1, 84, 84)))
    buffer.append(np.zeros((1, 84, 84)))
    buffer.append(np.zeros((1, 84, 84)))
    buffer.append(np.zeros((1, 84, 84)))
    score = 0
    while True:
        env.render()
        observation = transofrm_breakout_img(observation)
        buffer.append(observation)
        action =  model(torch.transpose(torch.tensor(buffer).to(device), 0, 1).float()).argmax().item()
        observation, reward, done, _ = env.step(action)
        score += reward
        print(done)
        if done:
            env.close()
            return score

In [None]:
all_break_models = []
all_break_rewards = []
for agent in range(1):
    break_model, break_rewards = q_learning(mk_breakout_env(), mk_breakout_network(), modify_reward=lambda x, y, z: x, max_epsilon=1.0, min_epsilon=0.05, epsilon_step=1e-3, 
               gamma=0.99, batch_size=32, avaliable_actions=[0, 1, 2, 3], amount_of_games = 600,
               game_max_length = 500, lr=0.0003, buff_len=10000, play_game=play_breakout, transform=transofrm_breakout_img, 
               change_state=lambda old, new: np.append(old[1:], np.array(new), axis=0),
               batch_format=lambda x: np.array([x]))
    
    torch.save(break_model.state_dict(), './breakout_models/model_' + str(agent))
    
    all_break_models.append(break_model)
    all_break_rewards.append(np.array(break_rewards))

In [None]:
plt.plot(all_break_rewards[0])
plt.xlabel('Game')
plt.ylabel('Score')
plt.title('Game / Score')
plt.show()

In [10]:
model = mk_breakout_network()
model.load_state_dict(torch.load('breakout_0', map_location=torch.device('cpu')))
model.eval()

Sequential(
  (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
  (1): ReLU()
  (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (3): ReLU()
  (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (5): ReLU()
  (6): Flatten(start_dim=1, end_dim=-1)
  (7): Linear(in_features=3136, out_features=512, bias=True)
  (8): ReLU()
  (9): Linear(in_features=512, out_features=4, bias=True)
)

In [None]:
print(play_breakout(model, mk_breakout_env(), torch.device('cpu')))

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [None]:
torch.cuda.is_available()