<a href="https://colab.research.google.com/github/mirklys/little-projects/blob/main/thesis/training_full.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Box2D bindings and the gym extras (presumably for the Box2D-based
# LunarLander environment trained in this notebook).
# NOTE(review): `Box2D` and `box2d-py` look like two distributions of the same
# bindings - confirm that both are really needed.
# NOTE(review): `gym[Box_2D]` - the gym extra is presumably spelled `box2d`;
# verify that this line has any effect.
!pip3 install Box2D
!pip3 install box2d-py
!pip3 install gym[all]
!pip3 install gym[Box_2D]

In [None]:
# Install both Stable-Baselines generations; the import cell of this notebook
# pulls names from `stable_baselines` as well as from `stable_baselines3`.
# NOTE(review): the two packages are separate projects - confirm that the
# older `stable_baselines` is actually required.
!pip install stable_baselines
!pip install stable_baselines3

In [None]:
# NOTE(review): presumably installed as the backend of the `stable_baselines`
# package - confirm which TensorFlow version that package supports.
!pip install tensorflow

In [None]:
import gym
import os
import numpy as np
import torch as th
from torch import nn
from torch.distributions.bernoulli import Bernoulli
import matplotlib.pyplot as plt
from collections import defaultdict
import time

from stable_baselines3 import PPO, A2C, SAC
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.utils import get_device
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines3.common.monitor import Monitor

In [None]:
# Mount Google Drive at /content/gdrive; the project paths defined in this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Root folder of the thesis project on the mounted Google Drive.
PATH_BASE = '/content/gdrive/MyDrive/Thesis Project'

# One sub-folder of the project root per artefact type.
PATH_DATA, PATH_NETWORKS, PATH_PLOTS, PATH_RESULTS, PATH_LOGS = (
    os.path.join(PATH_BASE, folder)
    for folder in ('data/', 'networks/', 'plots/', 'results/', 'logs/')
)

# Make the project root the working directory for the rest of the notebook.
os.chdir(PATH_BASE)

In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, else CPU.
# (Training was done on a GPU because it is faster.)
if th.cuda.is_available():
    dev = th.device('cuda')
else:
    dev = th.device('cpu')
dev  # last expression of the cell: displays the selected device

device(type='cuda')

In [None]:
class MaskedMLP(BaseFeaturesExtractor):
    """Two-layer ELU MLP feature extractor with optional dropout and unit masking.

    Observations pass through two linear layers of equal width. A binary mask
    over the second layer's units is sampled when the extractor is built (and
    again whenever :meth:`mask_units` is called); multiplying by it zeroes
    ("damages") the masked units.

    Behaviour depends on ``job``:

    * ``'train'`` - regular training: dropout is applied to the second layer's
      pre-activations, and the mask is applied only while the module is not in
      training mode (``self.training`` is False).
    * ``'train_masked'`` - masked training: dropout is skipped and the mask is
      applied on every forward pass.

    Args:
        observation_space: Observation space of the environment;
            ``observation_space.shape[0]`` is used as the input width.
        dropout_param: Dropout probability handed to ``nn.Dropout``.
        percent_to_mask: Probability (between 0 and 1) with which each
            second-layer unit is masked out.
        size: Width of both layers; also passed to the base class as the
            feature dimension.
        job: Type of training, ``'train'`` or ``'train_masked'``.

    Raises:
        ValueError: If ``job`` is not one of the supported values.
    """

    _VALID_JOBS = ('train', 'train_masked')

    def __init__(self, observation_space, dropout_param=0.0, percent_to_mask=0.0, size=32, job='train'):
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if job not in self._VALID_JOBS:
            raise ValueError(
                "This class can only be initialized for jobs: train, train_masked"
            )

        # Set up the nn.Module machinery before assigning any attribute.
        super().__init__(observation_space, size)

        self.l1_size = size
        self.l2_size = size
        self.job = job
        self.dropout_param = dropout_param
        self.percent_to_mask = percent_to_mask

        input_size = observation_space.shape[0]

        self.linear1 = nn.Linear(input_size, self.l1_size)
        self.linear2 = nn.Linear(self.l1_size, self.l2_size)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(p=self.dropout_param)

        self.mask_units(self.percent_to_mask)

    def mask_units(self, percent_to_mask):
        """Sample a fresh binary mask over the second layer's units.

        Each of the ``l2_size`` units is kept (mask value 1) independently
        with probability ``1 - percent_to_mask`` and zeroed otherwise. The
        distribution is stored in ``self.mask_distribution`` and the sample in
        ``self.mask``; ``self.percent_to_mask`` is left untouched.

        Args:
            percent_to_mask: Probability (between 0 and 1) of masking a unit.
        """
        # A plain Python float gives the mask torch's default float dtype even
        # when the caller passes a NumPy scalar (e.g. a value from np.arange).
        keep_probability = float(1.0 - percent_to_mask)
        self.mask_distribution = Bernoulli(th.full((self.l2_size,), keep_probability))
        self.mask = self.mask_distribution.sample()

    def forward(self, observations):
        """Compute the (possibly masked) second-layer activations.

        Args:
            observations: Batch of observations fed to the first linear layer.

        Returns:
            The ELU activations of the second layer, multiplied by the unit
            mask when the module is not in training mode or the job is
            ``'train_masked'``.
        """
        hidden = self.elu(self.linear1(observations))
        pre_activation = self.linear2(hidden)
        if self.job == 'train':
            # Dropout is only part of regular training, never of masked training.
            pre_activation = self.dropout(pre_activation)
        l2 = self.elu(pre_activation)

        # Second layer is masked during testing or masked training.
        if not self.training or self.job == 'train_masked':
            # The mask is a plain attribute, so it does not follow the module
            # across devices; move it to wherever the activations live.
            self.mask = self.mask.to(l2.device)
            l2 = l2 * self.mask

        return l2


In [None]:
"""
Training environments
"""
# Train one PPO agent per (layer width, dropout rate) pair, using MaskedMLP as
# the feature extractor, until its mean evaluation reward reaches `max_rew`.
# Progress is logged to <PATH_LOGS>/<game>.txt and every model is saved under
# <PATH_NETWORKS>/<game>/.

# The notebook's import cell binds `make_vec_env` twice; the later import (from
# `stable_baselines`) wins. The model, the vec-env class and the evaluation
# helper all come from stable_baselines3, so take the env factory from there
# too, under an explicit name.
from stable_baselines3.common.env_util import make_vec_env as make_sb3_vec_env

game = 'LunarLander-v2'
#game = 'CartPole-v1'
steps = int(3e5)        # length of the first training round of every model
extra_steps = int(1e5)  # length of each follow-up round
# Train until this mean reward is reached. LunarLander rarely scores much more
# than 200, and CartPole does not exceed a reward of 500.
max_rew = 200
# Optional cap on the number of training rounds per model. None keeps the
# original behaviour: keep training until `max_rew` is reached.
# NOTE(review): the dropout grid below runs up to (about) 1.0; a model that
# never reaches `max_rew` keeps this loop running forever unless a cap is set.
max_rounds = None

# The log file is opened for writing, so its folder has to exist.
os.makedirs(PATH_LOGS, exist_ok=True)

with open(os.path.join(PATH_LOGS, "{}.txt".format(game)), "w") as f:
    f.write("Started training {} models for different number of steps \n".format(game))
    for size in [128, 256, 512, 1024]:
        for dropout in np.arange(0, 1.05, 0.05):
            num_training_steps = steps
            rew = 0
            rounds = 0
            policy_kwargs = dict(
                features_extractor_class=MaskedMLP,
                features_extractor_kwargs=dict(dropout_param=dropout, size=size, job='train')
            )
            model_save_title = "{}.{}x{}.dropout_{}".format(game, size, size, dropout)
            model_path = os.path.join(PATH_NETWORKS, game, model_save_title)

            env = make_sb3_vec_env(game, n_envs=10, seed=0, vec_env_cls=DummyVecEnv)  # creating environment
            try:
                model = PPO('MlpPolicy', env, verbose=0,
                            policy_kwargs=policy_kwargs, device=dev,
                            batch_size=128, n_epochs=32, learning_rate=2e-5)  # initializing network
                f.write("Training {} {} model for {} steps \n".format(dropout, size, num_training_steps))

                # Wall-clock timer: the log reports how long training took,
                # which CPU time (time.process_time) does not measure.
                t = time.perf_counter()
                while rew < max_rew and (max_rounds is None or rounds < max_rounds):
                    model.learn(num_training_steps)
                    rounds += 1
                    # Save after every round so an interrupted run keeps its progress.
                    model.save(model_path)
                    f.write("we saved it nevertheless\n")
                    rew, std = evaluate_policy(model, env, n_eval_episodes=int(1e2))
                    f.write("we reached {} +-{} reward score\n".format(rew, std))
                    if rew < max_rew:
                        num_training_steps = extra_steps
                        f.write("we need additional {} steps to try to reach around {} cumulative reward score\n".format(num_training_steps, max_rew))
                    # Push the log to disk so progress can be followed during the run.
                    f.flush()

                f.write("it took {} min to train the {}x{} model with {}% dropout\n".format(round((time.perf_counter() - t)/60, 2), size, size, dropout*100))
                # Final save; also covers the case where the loop body never ran.
                model.save(model_path)
            finally:
                # Release the ten environments before the next configuration.
                env.close()