In [1]:
%load_ext tensorboard

import math
import gym 
import plotly.express as px
import numpy as np
import warnings
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import random
from collections import Counter, deque
import pandas as pd
from sklearn.model_selection import train_test_split
import datetime
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff

import os
import glob
from IPython import display as ipythondisplay
from tqdm.notebook import tqdm
from gym.wrappers import Monitor
from IPython.display import HTML
import base64
import io
import pickle
import torch
from torch import nn
import kornia.augmentation as aug
warnings.filterwarnings("ignore")

In [2]:
# Compute device shared by every model and tensor below: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Start an invisible 1400x900 virtual display.
# NOTE(review): presumably required so env.render() works on a headless machine — confirm.
from pyvirtualdisplay import Display
display = Display(visible=False, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fa730025f70>

In [4]:
import os, sys, copy, argparse, shutil
def parse_arguments(argv=None):
    """Build the DQN argument parser and parse ``argv``.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) keeps the
            original behaviour of parsing ``sys.argv[1:]``. From a notebook pass an
            explicit list (e.g. ``[]``): the kernel process is typically launched
            with its own command-line flags, which this parser would reject.

    Returns:
        argparse.Namespace holding one attribute per option declared below.
    """
    parser = argparse.ArgumentParser(description='Deep Q Network Argument Parser')

    # Environment / bookkeeping
    parser.add_argument('--seed', dest='seed', type=int, default=1)
    parser.add_argument('--env', dest='env', type=str, default='CartPole-v0')
    parser.add_argument('--save_interval', type=int, default=50, help='save model every n episodes')
    parser.add_argument('--log_interval', type=int, default=10, help='logging every n episodes')
    parser.add_argument('--render', help='render', type=int, default=1)

    # Optimisation
    parser.add_argument('--batch_size', help='batch_size', type=int, default=32)
    parser.add_argument('--train_freq', help='train_frequency', type=int, default=1)
    parser.add_argument('--max_episode', help='maximum episode', type=int, default=None)
    parser.add_argument('--max_timesteps', help='maximum timestep', type=int, default=100000000)
    parser.add_argument('--lr', dest='lr', type=float, default=0.00025)
    parser.add_argument('--lr_decay', action='store_true', help='decay learning rate')
    parser.add_argument('--gamma', help='discount_factor', type=float, default=0.99)

    # Replay memory / observation preprocessing
    parser.add_argument('--warmup_mem', type=int, help='warmup memory size', default=1000)
    parser.add_argument('--frame_skip', type=int, help='number of frames to skip for each action', default=3)
    parser.add_argument('--frame_stack', type=int, help='number of frames to stack', default=4)
    parser.add_argument('--memory', help='memory size', type=int, default=1000000)

    # Exploration schedule
    parser.add_argument('--initial_epsilon', '-ie', help='initial_epsilon', type=float, default=0.5)
    parser.add_argument('--final_epsilon', '-fe', help='final_epsilon', type=float, default=0.05)
    parser.add_argument('--max_epsilon_decay_steps', '-eds', help='maximum steps to decay epsilon', type=int, default=100000)

    # DQN variants and target-network updates
    parser.add_argument('--max_grad_norm', type=float, default=None, help='maximum gradient norm')
    parser.add_argument('--soft_update', '-su', action='store_true', help='soft update target network')
    parser.add_argument('--double_q', '-dq', action='store_true', help='enabling double DQN')
    parser.add_argument('--dueling_net', '-dn', action='store_true', help='enabling dueling network')
    parser.add_argument('--test', action='store_true', help='test the trained model')
    parser.add_argument('--tau', type=float, default=0.01, help='tau for soft target network update')
    parser.add_argument('--hard_update_freq', '-huf', type=int, default=2000, help='hard target network update frequency')

    # Checkpointing
    parser.add_argument('--save_dir', type=str, default='./data')
    parser.add_argument('--resume_step', '-rs', type=int, default=None)

    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    return parser.parse_args(argv)

In [5]:
#@title Set up constants for env and training
# Module-level run configuration read by wrap_env() and old_main().
test = False  # False -> training mode; True -> evaluation mode
save_dir = './data'  # root directory for the Monitor output folders
render = False  # when True, wrap_env() records training videos
max_episode = None  # optional episode budget; wrap_env() derives the video interval from it
max_timesteps = 100000000  # timestep budget used for the video interval when max_episode is None


In [6]:
#@title Augmentations
'''
color_jitter
random_elastic_transform
random_fisheye
random_color_equalize
random_gaussian_blur
random_gaussian_noise
random_horizontal_flip
random_color_invert
random_perspective_shift
random_shift
'''
# One augmentation object per name listed in the string above; each can be passed as `aug`
# to main()/Agent.
# NOTE(review): the jitter strengths below are drawn once, when this cell runs, from the
# unseeded NumPy RNG, so they differ between kernel sessions. `hue` is drawn from [0, 1) —
# confirm that this lies within the range kornia's ColorJitter accepts.
color_jitter = aug.ColorJitter(
        brightness=np.random.random(),
        contrast=np.random.random(),
        saturation=np.random.random(),
        hue=np.random.random(),
        p=0.5
        )
random_elastic_transform = aug.RandomElasticTransform()
# Fisheye parameter tensors are created on `device`.
random_fisheye = aug.RandomFisheye(
        center_x=torch.tensor([-.3, .3]).to(device),
        center_y=torch.tensor([-.3, .3]).to(device),
        gamma=torch.tensor([.9, 1.]).to(device),
        )
# RandomEqualize: input presumably has to be scaled to [0, 1] (divide by 255.0) — TODO confirm
random_color_equalize = aug.RandomEqualize()
random_gaussian_blur = aug.RandomGaussianBlur(
        kernel_size=(9, 9),
        sigma = (5., 5.)
        )
random_gaussian_noise = aug.RandomGaussianNoise()
random_horizontal_flip = aug.RandomHorizontalFlip()
random_color_invert = aug.RandomInvert()
random_perspective_shift = aug.RandomPerspective()
# Random shift: crop to 190x140, replicate-pad 20 px on every side (-> 230x180),
# then crop back to the original 210x160 frame size.
random_shift = nn.Sequential(aug.RandomCrop((190, 140)), nn.ReplicationPad2d(20), aug.RandomCrop((210, 160)))

'\ncolor_jitter\nrandom_elastic_transform\nrandom_fisheye\nrandom_color_equalize\nrandom_gaussian_blur\nrandom_gaussian_noise\nrandom_horizontal_flip\nrandom_color_invert\nrandom_perspective_shift\nrandom_shift\n'

In [7]:
def tie_weights(src, trg):
    """Make ``trg`` share the ``weight`` and ``bias`` objects of ``src``.

    Both layers must be of exactly the same type (checked with an assert).
    After the call ``trg.weight is src.weight`` and ``trg.bias is src.bias``.
    """
    assert type(src) == type(trg)
    for attr_name in ("weight", "bias"):
        setattr(trg, attr_name, getattr(src, attr_name))


# Lookup tables keyed by number of conv layers (see the commented-out
# `OUT_DIM_64[num_layers]` lookup in PixelEncoder.__init__).
# NOTE(review): values are presumably the spatial size of the last conv feature map — confirm.
# for 84 x 84 inputs
OUT_DIM = {2: 39, 4: 35, 6: 31}
# for 64 x 64 inputs
OUT_DIM_64 = {2: 29, 4: 25, 6: 21}

''' TODO change the layer parameters ''' 
class PixelEncoder(nn.Module):
    """Convolutional encoder of pixel observations.

    Three conv layers followed by a linear projection and LayerNorm. The linear
    layer is sized for channel-first ``(B, 3, 210, 160)`` inputs, which leave the
    conv stack as ``(B, 64, 17, 12)`` feature maps.

    Args:
        obs_shape: 3-element observation shape (only stored; the layer sizes
            below are fixed for 210x160 RGB frames).
        feature_dim: Size of the output embedding.
        num_layers: Stored for compatibility; the stack always has three conv layers.
        num_filters: Kept for interface compatibility; the conv widths are fixed
            (32, 64, 64), so it no longer influences any layer size.
        output_logits: If True return the LayerNorm output, otherwise apply ``tanh``.
    """
    def __init__(self, obs_shape, feature_dim=50, num_layers=3, num_filters=64, output_logits=False):
        super().__init__()

        assert len(obs_shape) == 3
        self.obs_shape = obs_shape
        self.feature_dim = feature_dim
        self.num_layers = num_layers

        # Input: (3, 210, 160)
        self.conv1 = nn.Conv2d(3, 32, kernel_size=5, stride=5)
        # Input to conv2: (32, 42, 32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        # Input to conv3: (64, 20, 15)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=4, stride=1)
        # Output from conv3: (64, 17, 12)

        # out_dim = OUT_DIM_64[num_layers] if obs_shape[-1] == 64 else OUT_DIM[num_layers]
        out_dims = (12, 17)
        # Size fc from the real conv3 width: sizing it from `num_filters` made forward()
        # fail with a shape mismatch whenever num_filters != 64.
        self.fc = nn.Linear(self.conv3.out_channels * out_dims[0] * out_dims[1], self.feature_dim)
        self.ln = nn.LayerNorm(self.feature_dim)

        # Most recent activations, kept for log().
        self.outputs = dict()
        self.output_logits = output_logits

    @property
    def convs(self):
        """Conv layers in forward order.

        A plain list built on access, so it adds no entries to ``state_dict()``.
        """
        return [self.conv1, self.conv2, self.conv3]

    def reparameterize(self, mu, logstd):
        """Sample ``mu + eps * exp(logstd)`` with ``eps`` drawn from a standard normal."""
        std = torch.exp(logstd)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward_conv(self, obs):
        """Run the conv stack (ReLU after each layer) and flatten to ``(B, -1)``."""
        self.outputs['obs'] = obs
        conv1 = torch.relu(self.conv1(obs))
        self.outputs['conv1'] = conv1
        conv2 = torch.relu(self.conv2(conv1))
        self.outputs['conv2'] = conv2
        conv3 = torch.relu(self.conv3(conv2))
        self.outputs['conv3'] = conv3

        h = conv3.reshape(conv3.size(0), -1)
        return h

    def forward(self, obs, detach=False):
        """Encode ``obs``; with ``detach=True`` no gradient reaches the conv layers."""
        h = self.forward_conv(obs)

        if detach:
            h = h.detach()

        h_fc = self.fc(h)
        self.outputs['fc'] = h_fc

        h_norm = self.ln(h_fc)
        self.outputs['ln'] = h_norm

        if self.output_logits:
            out = h_norm
        else:
            out = torch.tanh(h_norm)
            self.outputs['tanh'] = out

        return out

    def copy_conv_weights_from(self, source):
        """Tie every conv layer of this encoder to the matching layer of ``source``."""
        # only tie conv layers; all of them, independent of the stored num_layers
        for src_layer, trg_layer in zip(source.convs, self.convs):
            tie_weights(src=src_layer, trg=trg_layer)

    def log(self, L, step, log_freq):
        """Log cached activations and layer parameters to ``L`` every ``log_freq`` steps."""
        if step % log_freq != 0:
            return

        for k, v in self.outputs.items():
            L.log_histogram('train_encoder/%s_hist' % k, v, step)
            if len(v.shape) > 2:
                L.log_image('train_encoder/%s_img' % k, v[0], step)

        for i, conv in enumerate(self.convs):
            L.log_param('train_encoder/conv%s' % (i + 1), conv, step)
        L.log_param('train_encoder/fc', self.fc, step)
        L.log_param('train_encoder/ln', self.ln, step)

In [8]:
def wrap_env(env, train=True):
    """Wrap ``env`` in a gym ``Monitor`` writing to ``<save_dir>/monitor_<train|test>``.

    Reads the module-level ``save_dir``, ``render``, ``max_episode`` and
    ``max_timesteps`` settings.

    Args:
        env: Environment to wrap. ``env._max_episode_steps`` is read only when
            training with ``render`` enabled and ``max_episode`` unset.
        train: True for the training monitor, False for the evaluation monitor.

    Returns:
        The Monitor-wrapped environment. Evaluation records every 10th episode.
        Training records about three videos over the run when ``render`` is
        truthy and none otherwise.
    """
    suffix = 'train' if train else 'test'
    monitor_dir = os.path.join(save_dir, 'monitor_%s' % suffix)
    os.makedirs(monitor_dir, exist_ok=True)

    if train and not render:
        return Monitor(env, directory=monitor_dir, video_callable=False, force=True)

    if not train:
        video_save_interval = 10
    elif max_episode is not None:
        video_save_interval = int(max_episode / 3)
    else:
        video_save_interval = int(max_timesteps / float(env._max_episode_steps) / 3)
    # Small budgets round the interval down to 0, which made the modulo in the
    # callable below raise ZeroDivisionError; record every episode in that case.
    video_save_interval = max(1, video_save_interval)

    return Monitor(env, directory=monitor_dir,
                   video_callable=lambda episode_id: episode_id % video_save_interval == 0,
                   force=True)

In [9]:
class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``(state, next_state, action, reward)`` transitions.

    Args:
        max_epi_num: Stored for compatibility; not used to size the buffer.
        max_epi_len: Stored for compatibility; not used to size the buffer.
        capacity: Maximum number of transitions kept; the oldest are overwritten.
        obs_shape: Shape of a single observation.
        obs_dtype: Storage dtype of the observation arrays. The float64 default
            preserves the original behaviour; ``np.uint8`` needs 8x less memory
            for raw frames.
    """
    def __init__(self, max_epi_num=2000, max_epi_len=200, capacity=1000,
                 obs_shape=(210, 160, 3), obs_dtype=np.float64):
        if capacity < 1:
            raise ValueError("capacity must be a positive integer, got %r" % (capacity,))
        self.max_epi_num = max_epi_num
        self.max_epi_len = max_epi_len
        self.capacity = capacity
        self.idx = 0    # next slot to write
        self.count = 0  # number of valid transitions (<= capacity)
        frame_store_shape = (self.capacity,) + tuple(obs_shape)
        self.obs_memory = np.zeros(frame_store_shape, dtype=obs_dtype)
        self.next_memory = np.zeros(frame_store_shape, dtype=obs_dtype)
        self.act_memory = np.zeros((self.capacity, 1))
        self.reward_memory = np.zeros((self.capacity, 1))
        self.is_av = False
        self.current_epi = 0

    def reset(self):
        """Forget all stored transitions (slots are reused, not re-allocated)."""
        self.current_epi = 0
        self.idx = 0
        self.count = 0
        self.is_av = False

    def create_new_epi(self):
        """No-op kept for callers written against the older episode-based buffer."""
        pass

    def remember(self, state, next_state, action, reward):
        """Store one transition, overwriting the oldest entry once the buffer is full."""
        # Assigning into the pre-allocated arrays copies the data.
        self.obs_memory[self.idx] = np.asarray(state)
        self.next_memory[self.idx] = np.asarray(next_state)
        self.act_memory[self.idx] = action
        self.reward_memory[self.idx] = reward
        self.idx = (self.idx + 1) % self.capacity
        self.count = min(self.count + 1, self.capacity)

    def sample(self, batch_size):
        """Return ``(obs, next_obs, actions, rewards)`` arrays.

        When more than ``batch_size`` transitions are stored, ``batch_size`` of
        them are drawn uniformly with replacement; otherwise every stored
        transition is returned (never the unwritten zero rows).
        """
        stored = self.count
        if batch_size < stored:
            # randint's upper bound is exclusive, so `stored` includes the newest entry.
            picks = np.random.randint(0, stored, batch_size)
            return (self.obs_memory[picks], self.next_memory[picks],
                    self.act_memory[picks], self.reward_memory[picks])
        return (self.obs_memory[:stored], self.next_memory[:stored],
                self.act_memory[:stored], self.reward_memory[:stored])

    def size(self):
        """Number of transitions currently stored."""
        return self.count

    def is_available(self):
        """True once at least two transitions are stored."""
        self.is_av = self.count > 1
        return self.is_av

    def print_info(self):
        """Placeholder; prints nothing."""
        pass

In [10]:
#@title Create a training conv agent
import torch.nn.functional as F

class DQNetworkConv(nn.Module):
    """Convolutional Q-network for channel-first ``(B, in_channels, 210, 160)`` frames.

    Args:
        in_channels: Number of input channels.
        act_dim: Number of discrete actions (size of the output).
        dueling: If True use separate value and advantage heads and combine them
            as ``V + A - mean(A)``.
    """
    def __init__(self, in_channels, act_dim, dueling=False):
        super(DQNetworkConv, self).__init__()
        self.act_dim = act_dim
        self.dueling = dueling
        # Input: (in_channels, 210, 160)
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=5, stride=5)
        # Input to conv2: (32, 42, 32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        # Input to conv3: (64, 20, 15)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=4, stride=1)
        # Output from conv3: (64, 17, 12)
        if self.dueling:
            self.v_fc4 = nn.Linear(12 * 17 * 64, 512)
            self.adv_fc4 = nn.Linear(12 * 17 * 64, 512)
            self.v_fc5 = nn.Linear(512, 1)
            self.adv_fc5 = nn.Linear(512, self.act_dim)
        else:
            self.fc4 = nn.Linear(12 * 17 * 64, 512)
            self.fc5 = nn.Linear(512, self.act_dim)
        # Do not assign `self.parameters`: that replaced nn.Module.parameters() with a
        # list (so `net.parameters()` raised TypeError) and crashed for dueling=True,
        # where fc4/fc5 do not exist. nn.Module already tracks every layer above.

    def forward(self, st):
        """Return Q-values of shape ``(B, act_dim)`` for a batch of frames ``st``."""
        out = F.relu(self.conv1(st))
        out = F.relu(self.conv2(out))
        out = F.relu(self.conv3(out))
        out = out.view(out.size(0), -1)
        if self.dueling:
            val = F.relu(self.v_fc4(out))
            adv = F.relu(self.adv_fc4(out))
            val = self.v_fc5(val)
            adv = self.adv_fc5(adv)
            out = val.expand_as(adv) + adv - adv.mean(-1, keepdim=True).expand_as(adv)
        else:
            out = F.relu(self.fc4(out))
            out = self.fc5(out)
        return out

In [11]:
#@title Create a training FC agent
import torch.nn.functional as F

class DQNetworkFC(nn.Module):
    """Fully connected Q-network operating on encoder features.

    Args:
        z_dim: Size of the input feature vector.
        act_dim: Number of discrete actions (size of the output).
        dueling: If True use separate value and advantage streams and combine
            them as ``V + A - mean(A)``.
    """
    def __init__(self, z_dim, act_dim, dueling=False):
        super(DQNetworkFC, self).__init__()
        self.act_dim = act_dim
        self.input_dim = z_dim
        self.dueling = dueling
        if self.dueling:
            self.v_fc1 = nn.Linear(z_dim, 512)
            self.adv_fc1 = nn.Linear(z_dim, 512)
            # 512 -> 256 so that v_fc3 (256 -> 1) receives the width it expects.
            self.v_fc2 = nn.Linear(512, 256)
            self.adv_fc2 = nn.Linear(512, 256)
            self.v_fc3 = nn.Linear(256, 1)
            self.adv_fc3 = nn.Linear(256, self.act_dim)
        else:
            self.fc1 = nn.Linear(z_dim, 512)
            self.fc2 = nn.Linear(512, 256)
            self.fc3 = nn.Linear(256, self.act_dim)

    def forward(self, st):
        """Return Q-values of shape ``(B, act_dim)`` for features ``st``."""
        if self.dueling:
            val = F.relu(self.v_fc1(st))
            val = F.relu(self.v_fc2(val))
            val = self.v_fc3(val)
            adv = F.relu(self.adv_fc1(st))
            adv = F.relu(self.adv_fc2(adv))
            adv = self.adv_fc3(adv)
            return val + adv - adv.mean(-1, keepdim=True)
        out = F.relu(self.fc1(st))
        out = F.relu(self.fc2(out))
        # No activation on the output layer: the outputs are Q-value estimates, not
        # probabilities, and a ReLU would clamp them to be non-negative.
        return self.fc3(out)

In [12]:
def process_obs(obs):
    """Turn raw observation(s) into a float tensor on ``device``.

    Values are divided by 255, a leading batch axis is added when the input has
    fewer than four dimensions, and the last axis is moved to position 1
    (channels-last -> channels-first for image batches).
    """
    scaled = torch.Tensor(obs / 255.)
    if scaled.dim() < 4:
        scaled = scaled[None]
    return scaled.permute(0, 3, 1, 2).to(device)

In [13]:
def take_action(env, action):
    """Step ``env`` with ``action`` and return ``(rendered_frame, reward, done)``.

    The state returned by ``env.step`` is discarded; the observation handed back
    is the RGB render of the environment after the step.
    """
    _state, reward, done, _info = env.step(action)
    frame = env.render(mode='rgb_array')
    return frame, reward, done

In [14]:
# NOTE(review): not referenced by any other cell in the visible notebook — presumably a leftover step cap.
MAX_STEPS = 200

In [15]:
class CURL(nn.Module):
    """Contrastive head (CURL): an observation encoder plus a bilinear similarity ``W``.

    Args:
        obs_shape: Accepted for interface compatibility; not used.
        z_dim: Embedding size; ``W`` has shape ``(z_dim, z_dim)``.
        batch_size: Stored as ``self.batch_size``.
        encoder: Module mapping observations to ``(B, z_dim)`` embeddings.
        output_type: Stored as ``self.output_type``.
        critic: Accepted for interface compatibility; not used.
        critic_target: Optional object exposing ``.encoder``; when given, that
            encoder is used as the key encoder by ``encode(ema=True)``.
    """

    def __init__(self, obs_shape, z_dim, batch_size, encoder, output_type="continuous", critic=None, critic_target=None):
        super(CURL, self).__init__()
        self.batch_size = batch_size

        self.encoder = encoder

        # encode(ema=True) read this attribute although it was never assigned.
        self.encoder_target = critic_target.encoder if critic_target is not None else None

        self.W = nn.Parameter(torch.rand(z_dim, z_dim))
        self.output_type = output_type

    def encode(self, x, detach=False, ema=False):
        """Encode observations ``x`` into embeddings.

        :param x: batch of observations accepted by the encoder
        :param detach: if True, detach the result from the autograd graph
        :param ema: if True, encode without gradients using the target encoder;
            when no target encoder was supplied, the online encoder is used
            (still without gradients)
        :return: embeddings produced by the selected encoder
        """
        if ema:
            key_encoder = self.encoder_target if self.encoder_target is not None else self.encoder
            with torch.no_grad():
                z_out = key_encoder(x)
        else:
            z_out = self.encoder(x)

        if detach:
            z_out = z_out.detach()
        return z_out

    def compute_logits(self, z_a, z_mod):
        """
        Uses logits trick for CURL:
        - compute (B,B) matrix z_a (W z_pos.T)
        - positives are all diagonal elements
        - negatives are all other elements
        - to compute loss use multiclass cross entropy with identity matrix for labels
        """
        Wz = torch.matmul(self.W, z_mod.T)  # (z_dim,B)
        logits = torch.matmul(z_a, Wz)  # (B,B)
        # Subtract the row-wise maximum so the largest logit in every row is 0.
        logits = logits - torch.max(logits, 1)[0][:, None]
        return logits

In [16]:
#@title Generate a batch of negatively labelled examples given observations

def generate_negatives(obs):
    """Return a copy of ``obs`` reordered so that no row stays paired with itself.

    For every position ``i`` a different index ``j != i`` is drawn uniformly from
    the remaining rows, and row ``j`` is placed at position ``i``.

    Args:
        obs: NumPy array whose first axis indexes observations.

    Returns:
        A new array with the same shape as ``obs``.

    Raises:
        ValueError: If ``obs`` holds fewer than two observations, in which case
            no negative exists.
    """
    n = len(obs)
    if n < 2:
        raise ValueError("need at least 2 observations to draw negatives, got %d" % n)
    pos_idx = np.arange(n)
    # A non-zero offset modulo n can never map an index onto itself. The old loop had
    # its branches swapped: it kept the colliding draws and redrew the valid ones.
    offsets = np.random.randint(1, n, size=n)
    neg_idx = (pos_idx + offsets) % n
    return (obs[neg_idx]).copy()


In [17]:
#@title Create a training agent (wrapper for conv agent)

# Module-level discount factor applied to future rewards.
GAMMA = 0.99

class Agent(object):
    """DQN agent that can train a CURL encoder jointly with its Q-network.

    Args:
        act_dim: Number of discrete actions.
        in_channels: Input channels of the convolutional Q-network (``conv_net=True``).
        max_epi_num: Forwarded to ``ReplayMemory``.
        max_epi_len: Forwarded to ``ReplayMemory``.
        CURL: Module exposing ``encoder``, ``encode`` and ``compute_logits``.
            Required when ``conv_net`` is False.
        aug: Callable applied to a processed batch to build the positive views of
            the contrastive loss; ``None`` uses the un-augmented batch.
        conv_net: If True the Q-network is ``DQNetworkConv`` on raw frames,
            otherwise ``DQNetworkFC`` on the CURL encoder features.
    """
    def __init__(self, act_dim, in_channels=3, max_epi_num=50, max_epi_len=300, CURL=None, aug=random_shift, conv_net=False):
        self.N_action = act_dim
        self.max_epi_num = max_epi_num
        self.max_epi_len = max_epi_len
        # Counts optimiser steps; decides when to copy weights to the target network.
        self.num_param_updates = 0
        self.CURL = CURL
        self.aug = aug
        if conv_net:
            self.conv_net = DQNetworkConv(in_channels, act_dim).to(device)
            self.target = DQNetworkConv(in_channels, act_dim).to(device)
        else:
            # The Q-network consumes the encoder features used for the contrastive loss.
            if self.CURL is None:
                raise ValueError("Agent(conv_net=False) requires a CURL module providing the encoder")
            feature_dim = self.CURL.encoder.feature_dim
            self.conv_net = DQNetworkFC(feature_dim, act_dim).to(device)
            self.target = DQNetworkFC(feature_dim, act_dim).to(device)
        # Start the target network from the online weights instead of an independent
        # random initialisation.
        self.target.load_state_dict(self.conv_net.state_dict())
        self.buffer = ReplayMemory(max_epi_num=self.max_epi_num, max_epi_len=self.max_epi_len)
        self.gamma = GAMMA
        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self._trainable_parameters(), lr=1e-3)

    def _trainable_parameters(self):
        """Q-network plus CURL/encoder parameters, each listed exactly once."""
        candidates = list(self.conv_net.parameters())
        if self.CURL is not None:
            candidates += list(self.CURL.parameters())
            candidates += list(self.CURL.encoder.parameters())
        unique, seen = [], set()
        for param in candidates:
            if id(param) not in seen:
                seen.add(id(param))
                unique.append(param)
        return unique

    def remember(self, state, action, reward, next_state):
        """Store one transition in the replay buffer."""
        self.buffer.remember(state, next_state, action, reward)

    def train(self, batch_size=32, target_update_freq=100, use_encoder=True):
        """Run one optimisation step on a batch sampled from the replay buffer.

        Does nothing while the buffer reports that it is not yet available.

        Args:
            batch_size: Number of transitions requested from the buffer.
            target_update_freq: The online weights are copied to the target
                network every this many calls that performed an update.
            use_encoder: If True, encode frames with CURL, add the contrastive
                loss and feed the encodings to the Q-network; otherwise pass the
                processed frames directly to the Q-network.

        Raises:
            ValueError: If ``use_encoder`` is True but no CURL module was given.
        """
        if not self.buffer.is_available():
            return
        if use_encoder and self.CURL is None:
            raise ValueError("train(use_encoder=True) requires a CURL module")

        obs, next_obs, action_list, reward_list = self.buffer.sample(batch_size)
        obs_batch = process_obs(obs.copy())
        next_obs_batch = process_obs(next_obs.copy())

        losses = []

        if use_encoder:
            # Contrastive part: the anchor is the raw batch, the positive an augmented
            # view of the same batch; all other rows of the batch act as negatives.
            obs_pos = obs_batch.clone()
            if self.aug is not None:
                obs_pos = self.aug(obs_pos)
            z_a = self.CURL.encode(obs_batch)
            z_pos = self.CURL.encode(obs_pos)

            logits = self.CURL.compute_logits(z_a, z_pos)
            labels = torch.arange(logits.shape[0]).long().to(device)
            losses.append(nn.CrossEntropyLoss()(logits, labels))

            # Q-values are estimated from the anchor encodings.
            Qs = self.conv_net(z_a)
            with torch.no_grad():
                next_Qs = self.target(self.CURL.encode(next_obs_batch)).max(1)[0]
        else:
            # Without the encoder the processed frames go straight to the Q-network.
            Qs = self.conv_net(obs_batch)
            with torch.no_grad():
                next_Qs = self.target(next_obs_batch).max(1)[0]

        actions = torch.tensor(action_list, dtype=torch.int64).reshape(-1, 1).to(device)
        rewards = torch.tensor(reward_list, dtype=torch.float32).reshape(-1).to(device)

        # Keep everything floating point and shaped (B,): casting the targets or the
        # loss to integers truncates them and detaches the Q loss from autograd, and
        # a (B, 1) vs (B,) pair would broadcast to (B, B) inside the loss.
        # NOTE(review): the replay buffer stores no `done` flag, so terminal
        # transitions are bootstrapped like any other — confirm this is acceptable.
        chosen_Qs = torch.gather(Qs, dim=1, index=actions).squeeze(1)
        target_Qs = rewards + self.gamma * next_Qs
        losses.append(self.loss_fn(chosen_Qs, target_Qs))

        # Single update for the Q-network and the encoder head.
        total_loss = torch.stack(losses).sum()
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()

        self.num_param_updates += 1
        if self.num_param_updates % target_update_freq == 0:
            self.target.load_state_dict(self.conv_net.state_dict())

    def get_action(self, obs, epsilon, use_encoding=True):
        """Choose an action epsilon-greedily.

        Args:
            obs: Raw frame (array). With ``use_encoding=False`` an already
                prepared tensor is passed to the Q-network unchanged.
            epsilon: Probability of choosing a uniformly random action.
            use_encoding: If True the frame is encoded with CURL first.

        Returns:
            The chosen action index as a Python int.
        """
        # Explore: no forward pass is needed for a random action.
        if random.random() <= epsilon:
            return random.randint(0, self.N_action - 1)

        if use_encoding or not torch.is_tensor(obs):
            obs = process_obs(obs)
        with torch.no_grad():
            if use_encoding:
                obs = self.CURL.encode(obs)
            if len(obs.shape) == 1:
                obs = obs.unsqueeze(0)
            qs = self.conv_net(obs)
        return qs[0].argmax().item()

def get_decay(epi_iter):
    """Exploration rate for episode ``epi_iter``: ``0.999 ** epi_iter``, floored at 0.05."""
    return max(0.05, math.pow(0.999, epi_iter))

In [24]:
def main(aug=random_shift, train_curve_filename="default_curl_4000MC_400E", max_epi_iter=4000, max_MC_iter=400):
    """Train a CURL + DQN agent on Space Invaders and record the training curve.

    Args:
        aug: Augmentation passed to ``Agent`` for the contrastive positives.
        train_curve_filename: Path handed to ``np.save`` for the per-episode
            discounted returns.
        max_epi_iter: Number of training episodes.
        max_MC_iter: Maximum number of environment steps per episode.

    The curve is written every 100 episodes and once more when the function
    exits for any reason (including ``KeyboardInterrupt``); the environment is
    always closed.
    """
    env = gym.make('ALE/SpaceInvaders-v5')
    train_curve = []
    try:
        # Reset before the first render so the probe frame comes from a started episode.
        env.reset()
        obs = env.render(mode='rgb_array')
        encoder = PixelEncoder(obs_shape=obs.shape, feature_dim=50, num_layers=2, num_filters=64, output_logits=False).to(device)
        CURL_encoder = CURL(obs_shape=obs.shape, z_dim=50, batch_size=1, encoder=encoder, output_type="continuous").to(device)
        agent = Agent(act_dim=env.action_space.n, max_epi_num=10000, max_epi_len=max_MC_iter, CURL=CURL_encoder, aug=aug)

        # The RNG is no longer re-seeded from OS entropy each episode; that made any
        # seed set by the caller ineffective.
        for epi_iter in range(max_epi_iter):
            env.reset()
            obs = env.render(mode='rgb_array')
            returns = 0.0
            epsilon = get_decay(epi_iter)
            for MC_iter in range(max_MC_iter):
                action = agent.get_action(obs, epsilon)
                next_obs, reward, done = take_action(env, action)
                returns += reward * agent.gamma ** (MC_iter)
                agent.remember(obs, action, reward, next_obs)
                obs = next_obs.copy()
                if done:
                    break
            agent.buffer.create_new_epi()

            print('Episode', epi_iter, 'returns', returns)
            train_curve.append(returns)
            if epi_iter % 100 == 0:
                np.save(train_curve_filename, np.array(train_curve))
            # One gradient step per episode.
            if agent.buffer.is_available():
                agent.train()
    finally:
        # Persist whatever was collected, then release the emulator.
        try:
            if train_curve:
                np.save(train_curve_filename, np.array(train_curve))
        finally:
            env.close()
    print(train_curve)

In [25]:
# Default experiment: random_shift augmentation, curve saved under the default filename.
main()

Episode 0 returns 47.62933558298766
Episode 1 returns 19.986401756224492
Episode 2 returns 12.773375104242554
Episode 3 returns 11.0215421422098
Episode 4 returns 37.43733713797183
Episode 5 returns 20.56708221740586
Episode 6 returns 21.13421759912679
Episode 7 returns 18.89312287106762
Episode 8 returns 24.08168057818884
Episode 9 returns 31.207427442596916
Episode 10 returns 31.829641905082656
Episode 11 returns 13.718983061147078
Episode 12 returns 28.556222509527128
Episode 13 returns 23.240292634052818
Episode 14 returns 29.293520541116557
Episode 15 returns 29.63166835913544
Episode 16 returns 30.20027038243237
Episode 17 returns 51.26266977187717
Episode 18 returns 36.56622301374786
Episode 19 returns 44.84584305948491
Episode 20 returns 36.749405038238585
Episode 21 returns 17.026226217450255
Episode 22 returns 13.17765949774962
Episode 23 returns 36.264631406580904
Episode 24 returns 25.422499725540515
Episode 25 returns 10.538857629878834
Episode 26 returns 27.19220021342798

KeyboardInterrupt: 

In [None]:
# One full training run per augmentation; every run saves its curve under its own name.
for _augmentation, _curve_name in (
    (color_jitter, "color_jitter_4000MC_400E"),
    (random_elastic_transform, "random_elastic_transform_4000MC_400E"),
    (random_fisheye, "random_fisheye_4000MC_400E"),
    (random_color_equalize, "random_color_equalize_4000MC_400E"),
    (random_gaussian_blur, "random_gaussian_blur_4000MC_400E"),
    (random_gaussian_noise, "random_gaussian_noise_4000MC_400E"),
    (random_horizontal_flip, "random_horizontal_flip_4000MC_400E"),
    (random_color_invert, "random_color_invert_4000MC_400E"),
    (random_perspective_shift, "random_perspective_shift_4000MC_400E"),
    (random_shift, "random_shift_4000MC_400E"),
):
    main(_augmentation, _curve_name)

In [None]:
def old_main():
    """Legacy DQN entry point, kept for reference.

    Builds an environment, optionally wipes ``save_dir`` after asking on stdin,
    wraps the environment with ``wrap_env`` and either trains or evaluates a
    ``DQNAgent`` depending on the module-level ``test`` flag.

    Raises:
        NotImplementedError: If the environment does not provide image
            observations (only the convolutional Q-network is supported).
    """
    # NOTE(review): WrapAtariEnv and DQNAgent are not defined in the visible notebook —
    # presumably they come from the referenced dqn-pytorch code; confirm before running.
    # 'MountainCars-v0' is not a registered gym id; the classic-control id is 'MountainCar-v0'.
    env = gym.make('MountainCar-v0')
    if len(env.observation_space.shape) >= 3:
        env = WrapAtariEnv(env=env, noop_max=30, frameskip=3, framestack=4, test=test)
    if not test:
        dele = input("Do you wanna recreate ckpt and log folders? (y/n)")
        if dele == 'y':
            if os.path.exists(save_dir):
                shutil.rmtree(save_dir)

    env = wrap_env(env, train=not test)
    print(env.observation_space.shape)
    if len(env.observation_space.shape) < 3:
        # An explicit exception instead of `assert(False)`, which is stripped under -O.
        raise NotImplementedError("old_main only supports image observations")
    q_net = DQNetworkConv
    agent = DQNAgent(env=env, qnet=q_net)
    # `args` was never defined in this notebook; the module-level `test` flag is what
    # the rest of this function already uses.
    if test:
        agent.rollout(episodes=100, render=render)
    else:
        agent.train()
    agent.env.close()

In [None]:
def main_mountaincar():
    """Smoke test: take five random steps in Space Invaders and print each rendered frame.

    Despite its name, the function creates the 'ALE/SpaceInvaders-v5' environment.
    The environment is closed even if stepping or rendering raises.
    """
    env = gym.make('ALE/SpaceInvaders-v5')
    try:
        env.reset()

        for i in range(5):
            env.step(env.action_space.sample())
            obs = env.render(mode='rgb_array')
            print("Step ", i, obs)
    finally:
        env.close()

In [None]:
# Smoke test: step the environment a few times and print the rendered frames.
main_mountaincar()

Code references for DQN:

https://github.com/taochenshh/dqn-pytorch

https://github.com/transedward/pytorch-dqn (for sampling from replay buffer)

CURL code: https://github.com/MishaLaskin/curl