<a href="https://colab.research.google.com/github/mirklys/little-projects/blob/main/thesis/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon Apr 18 06:56:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install the PyTorch/XLA stack with pinned versions: cloud-tpu-client 0.10,
# torch 1.11.0 and the torch_xla 1.11 wheel built for CPython 3.7 (cp37) on Colab.
!pip install cloud-tpu-client==0.10 torch==1.11.0 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-1.11-cp37-cp37m-linux_x86_64.whl

In [1]:
import torch_xla
import torch_xla.core.xla_model as xm



In [2]:
# Ask torch_xla for an XLA device.
# NOTE(review): `dev` is rebound to a CUDA/CPU torch device in a later cell, and the
# training cells pass that later value to PPO — this XLA device looks unused there.
dev = xm.xla_device()


In [3]:
# Display the device object held in `dev`.
dev

device(type='xla', index=1)

In [2]:
# Box2D physics bindings plus the gym extras needed by the Box2D environments
# (LunarLander is trained further down). Requirement specifiers that carry extras
# are quoted so the shell can never expand the square brackets as a glob pattern.
!pip3 install Box2D
!pip3 install box2d-py
!pip3 install "gym[all]"
# gym names its Box2D extra `box2d`; `gym[Box_2D]` is not a declared extra, so pip
# only printed a warning and installed plain gym.
!pip3 install "gym[box2d]"

In [None]:
# Install both `stable_baselines` and `stable_baselines3`; the import cell pulls
# names from each package (PPO and the env/eval helpers from stable_baselines3,
# `set_global_seeds` and `make_vec_env` from stable_baselines).
# NOTE(review): neither package is version-pinned, so a re-run may resolve to
# different releases than the ones these results were produced with.
!pip install stable_baselines
!pip install stable_baselines3

In [None]:
# NOTE(review): nothing in this notebook imports tensorflow directly; presumably it
# is needed by `stable_baselines` — confirm, and pin the version that package expects.
!pip install tensorflow

In [6]:
import gym
import os
import numpy as np
import torch as th
from torch import nn
from torch.distributions.bernoulli import Bernoulli
import matplotlib.pyplot as plt
from collections import defaultdict
import time

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.utils import get_device
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines3.common.monitor import Monitor

In [7]:
# Mount Google Drive at /content/gdrive; the project root used by the path
# constants (PATH_BASE) lives inside this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
# Root of the thesis project inside the mounted Google Drive.
PATH_BASE = '/content/gdrive/MyDrive/Thesis Project'

# One sub-directory per kind of artifact (trailing separators kept as-is).
PATH_DATA = os.path.join(PATH_BASE, 'data/')
PATH_NETWORKS = os.path.join(PATH_BASE, 'networks/')
PATH_PLOTS = os.path.join(PATH_BASE, 'plots/')
PATH_RESULTS = os.path.join(PATH_BASE, 'results/')
PATH_LOGS = os.path.join(PATH_BASE, 'logs/')

# chdir first: it raises if the project root is missing, so the makedirs below
# can never create a stray project tree when the root path is wrong.
os.chdir(PATH_BASE)

# Create the artifact folders up front. The training cells open their log files
# with open(..., "w"), which raises FileNotFoundError if the folder is absent.
for _artifact_dir in (PATH_DATA, PATH_NETWORKS, PATH_PLOTS, PATH_RESULTS, PATH_LOGS):
    os.makedirs(_artifact_dir, exist_ok=True)

In [9]:
# Pick the torch device for training: the GPU when CUDA is usable, otherwise the CPU.
if th.cuda.is_available():
    dev = th.device('cuda')
else:
    dev = th.device('cpu')
dev

device(type='cuda')

In [12]:
class MaskedMLP(BaseFeaturesExtractor):
    """Two-layer ELU MLP features extractor with dropout and a binary unit mask.

    The forward pass is ``linear1 -> ELU -> linear2 -> (dropout) -> ELU`` and
    yields ``size`` features. A 0/1 mask over the output units is sampled once
    (see ``mask_units``) and multiplied into the output whenever the module is
    in eval mode or ``job == 'train_masked'``.

    :param observation_space: observation space; only ``shape[0]`` is read as the
        input width (presumably a flat 1-D observation — confirm for new envs).
    :param dropout_param: probability handed to ``nn.Dropout``; the dropout layer
        is only applied in ``forward`` when ``job == 'train'``.
    :param percent_to_mask: fraction in ``[0, 1]``; every output unit is zeroed
        independently with this probability when the mask is sampled.
    :param size: width of both linear layers and of the returned feature vector.
    :param job: ``'train'`` or ``'train_masked'``.
    :raises AssertionError: if ``job`` is not one of the two supported values.
    :raises ValueError: if ``percent_to_mask`` lies outside ``[0, 1]``.
    """

    def __init__(self, observation_space, dropout_param=0.0, percent_to_mask=0.0, size=32, job='train'):
        assert job == 'train' or job == 'train_masked',\
            "This class can only be initialized for jobs: train, train_masked"

        # These plain attributes are set before the base-class constructor because
        # the feature dimension (l2_size) has to be passed to it.
        self.l1_size = size
        self.l2_size = size
        self.job = job
        super(MaskedMLP, self).__init__(observation_space, self.l2_size)

        self.dropout_param = dropout_param
        self.percent_to_mask = percent_to_mask

        input_size = observation_space.shape[0]

        # Layers actually used by forward().
        self.linear1 = nn.Linear(input_size, self.l1_size)
        self.linear2 = nn.Linear(self.l1_size, self.l2_size)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(p=self.dropout_param)

        if self.job == 'train':
            # NOTE(review): layer1/layer2 are never referenced by forward(). They
            # are kept (and created in the original order) so that the module's
            # parameter names and weight-initialisation order stay exactly as
            # before; dropping them would change the state_dict layout.
            self.layer1 = nn.Sequential(
                nn.Linear(input_size, self.l1_size),
                nn.ELU()
            )
            self.layer2 = nn.Sequential(
                nn.Linear(self.l1_size, self.l2_size),
                nn.Dropout(p=self.dropout_param),
                nn.ELU(),
            )

        self.mask_units(self.percent_to_mask)

    def mask_units(self, percent_to_mask):
        """Sample a fresh 0/1 mask over the ``l2_size`` output units.

        Each unit is kept (mask value 1) with probability ``1 - percent_to_mask``.
        The sampled mask replaces ``self.mask`` and ``self.percent_to_mask`` is
        updated to the fraction that was used.

        :param percent_to_mask: fraction of units to zero, in ``[0, 1]``.
        :raises ValueError: if ``percent_to_mask`` lies outside ``[0, 1]``.
        """
        if not 0.0 <= percent_to_mask <= 1.0:
            raise ValueError(
                "percent_to_mask must be within [0, 1], got {}".format(percent_to_mask)
            )

        # Keep the attribute in sync with the mask that is actually in effect.
        self.percent_to_mask = percent_to_mask

        keep_probability = th.tensor([1.0 - percent_to_mask] * self.l2_size)
        self.mask_distribution = Bernoulli(keep_probability)
        self.mask = self.mask_distribution.sample()

    def forward(self, observations):
        """Compute the feature vector for a batch of observations.

        :param observations: input tensor fed to ``linear1``.
        :return: activations of the second layer, multiplied by ``self.mask`` when
            the module is in eval mode or ``job == 'train_masked'``.
        """
        x = self.linear1(observations)
        x = self.elu(x)
        x = self.linear2(x)
        # Dropout sits between linear2 and its ELU and is skipped for 'train_masked'.
        if self.job == 'train':
            x = self.dropout(x)
        l2 = self.elu(x)

        if not self.training or self.job == 'train_masked':
            # The mask is a plain tensor attribute (not a registered buffer), so it
            # does not follow module.to(); move it next to the activations here.
            self.mask = self.mask.to(l2.device)
            l2 = l2 * self.mask

        return l2


In [None]:
#@title
"""
Training cartpole
"""
# Train one PPO agent per (layer size, dropout) combination on CartPole-v1,
# log the evaluation reward and the training time, and save every model.
num_training_steps = 300000
with open(os.path.join(PATH_LOGS, "training_cartpole.txt"), "w") as f:
    # The step count in the header is derived from the variable so the two
    # cannot drift apart.
    f.write("training all cartpole models for {:,} steps \n".format(num_training_steps))
    for size in [128, 256, 512, 1024]:
        for dropout in [0.0, 0.2, 0.4, 0.6, 0.8]:
            policy_kwargs = dict(
                features_extractor_class=MaskedMLP,
                features_extractor_kwargs=dict(dropout_param=dropout, size=size, job='train')
            )
            env = make_vec_env('CartPole-v1', n_envs=10, seed=0, vec_env_cls=DummyVecEnv)
            try:
                model = PPO('MlpPolicy', env, verbose=0,
                            policy_kwargs=policy_kwargs, device=dev)
                # NOTE(review): time.process_time() reports CPU time of this process,
                # not wall-clock time; the logged minutes include the evaluation below.
                t = time.process_time()
                model.learn(num_training_steps)
                rew, _ = evaluate_policy(model, env, n_eval_episodes=100)
                # Every entry ends with a newline so the log stays one entry per line.
                f.write("we reached {} reward score \n".format(rew))
                f.write("it took {} min to train the {}x{} model with {}% dropout \n".format(round((time.process_time() - t)/60, 2), size, size, dropout*100))
                model_save_title = "{}.{}x{}.dropout_{}".format('CartPole-v1', model.policy.features_extractor.l1_size, model.policy.features_extractor.l2_size, dropout)
                model.save(os.path.join(PATH_NETWORKS, 'CartPole-v1', model_save_title))
                f.write('saved it \n')
            finally:
                # Release the vectorised environment before the next configuration.
                env.close()
            # Push finished entries to disk so an interrupted run keeps its log.
            f.flush()

In [None]:
#@title
"""
Training cartpole
"""
# Continue training the previously saved 1024x1024 / 0.8-dropout CartPole model
# and overwrite its checkpoint.
num_training_steps = 600000
with open(os.path.join(PATH_LOGS, "training_cartpole_1024_80.txt"), "w") as f:
    # The header used to say "300,000" although this cell trains for
    # num_training_steps; format it from the variable instead.
    f.write("training all cartpole models for {:,} steps \n".format(num_training_steps))
    for size in [1024]:
        for dropout in [0.8]:
            # No policy_kwargs are built here: PPO.load() was never given them, so
            # the dict the original cell created was dead code.
            env = make_vec_env('CartPole-v1', n_envs=10, seed=0, vec_env_cls=DummyVecEnv)
            try:
                model_save_title = "{}.{}x{}.dropout_{}".format('CartPole-v1', size, size, dropout)
                # device=dev keeps the loaded model on the same device the other
                # training cells use.
                model = PPO.load(os.path.join(PATH_NETWORKS, 'CartPole-v1', model_save_title), device=dev)
                model.set_env(env)
                # NOTE(review): time.process_time() reports CPU time of this process,
                # not wall-clock time; the logged minutes include the evaluation below.
                t = time.process_time()
                model.learn(num_training_steps)
                rew, _ = evaluate_policy(model, env, n_eval_episodes=100)
                f.write("we reached {} reward score \n".format(rew))
                f.write("it took {} min to train the {}x{} model with {}% dropout \n".format(round((time.process_time() - t)/60, 2), size, size, dropout*100))
                model_save_title = "{}.{}x{}.dropout_{}".format('CartPole-v1', model.policy.features_extractor.l1_size, model.policy.features_extractor.l2_size, dropout)
                model.save(os.path.join(PATH_NETWORKS, 'CartPole-v1', model_save_title))
                f.write('re-saved it \n')
            finally:
                # Release the vectorised environment once this model is done.
                env.close()
            # Push finished entries to disk so an interrupted run keeps its log.
            f.flush()

In [None]:
"""
Training LunarLander
"""
# Train a PPO agent on LunarLander-v2 with a 128x128 extractor and 0.8 dropout,
# re-training in smaller increments until the evaluation reward reaches the target.
game = 'LunarLander-v2'
with open(os.path.join(PATH_LOGS, "training_LunarLander_128_60_80.txt"), "w") as f:

    def log(message):
        """Write `message` as one line of the log file and echo it to the cell output."""
        # The newline is added here, so no log entry can run into the next one.
        f.write(message + "\n")
        # Flush per entry: this cell runs for hours and may be interrupted.
        f.flush()
        print(message)

    log("Started training LunarLander models for different number of steps ")
    for size in [128]:
        for dropout in [0.8]:
            num_training_steps = int(3e6)
            # Mean evaluation reward at which training of this model stops.
            reward_target = 300*0.85
            rew = 0
            policy_kwargs = dict(
                features_extractor_class=MaskedMLP,
                features_extractor_kwargs=dict(dropout_param=dropout, size=size, job='train')
            )
            env = make_vec_env(game, n_envs=10, seed=0, vec_env_cls=DummyVecEnv)
            try:
                model = PPO('MlpPolicy', env, verbose=0,
                            policy_kwargs=policy_kwargs, device=dev)
                d = get_device()
                print("device", d)
                log("Training {} {} model for {} steps ".format(dropout, size, num_training_steps))

                # The checkpoint location does not change between retries.
                extractor = model.policy.features_extractor
                model_save_title = "{}.{}x{}.dropout_{}".format(game, extractor.l1_size, extractor.l2_size, dropout)
                checkpoint_path = os.path.join(PATH_NETWORKS, game, model_save_title)

                # NOTE(review): time.process_time() reports CPU time, not wall-clock time.
                t = time.process_time()
                # NOTE(review): there is no cap on the number of retries; the recorded
                # output of this cell never got close to the target, so this loop can
                # run until the runtime is stopped. Consider bounding it.
                while rew < reward_target:
                    model.learn(num_training_steps)
                    # Save after every round, whether or not the target was reached.
                    model.save(checkpoint_path)
                    log("we saved it nevertheless")
                    rew, _ = evaluate_policy(model, env, n_eval_episodes=10)
                    log("we reached {} reward score".format(rew))
                    if rew < reward_target:
                        # Follow-up rounds are shorter than the initial one.
                        num_training_steps = int(1e5)
                        log("we need additional {} steps to try to reach around 300 cumulative reward score".format(num_training_steps))

                # The loop always runs at least once and saves on every pass, so the
                # checkpoint on disk is already the final model; no extra save needed.
                log("it took {} min to train the {}x{} model with {}% dropout".format(round((time.process_time() - t)/60, 2), size, size, dropout*100))
            finally:
                # Release the vectorised environment once this model is done.
                env.close()

Started training LunarLander models for different number of steps 

device cuda
Training 0.8 128 model for 3000000 steps 

we saved it nevertheless




we reached -131.77967856654723 reward score
we need additional 100000 steps to try to reach around 300 cumulative reward score

we saved it nevertheless
we reached -126.0209431779571 reward score
we need additional 100000 steps to try to reach around 300 cumulative reward score

we saved it nevertheless
we reached -76.39692546049575 reward score
we need additional 100000 steps to try to reach around 300 cumulative reward score

we saved it nevertheless
we reached -194.02981819545974 reward score
we need additional 100000 steps to try to reach around 300 cumulative reward score

we saved it nevertheless
we reached -9.182830027810468 reward score
we need additional 100000 steps to try to reach around 300 cumulative reward score

we saved it nevertheless
we reached 11.231736828486433 reward score
we need additional 100000 steps to try to reach around 300 cumulative reward score

we saved it nevertheless
we reached -134.66420952830646 reward score
we need additional 100000 steps to try to 

In [None]:
"""
Training LunarLander
"""
# Train PPO agents on LunarLander-v2 for a grid of extractor sizes and dropout
# rates, re-training each one in smaller increments until the evaluation reward
# reaches the target.
game = 'LunarLander-v2'
with open(os.path.join(PATH_LOGS, "training_LunarLander.txt"), "w") as f:

    def log(message):
        """Write `message` as one line of the log file and echo it to the cell output."""
        # The newline is added here, so no log entry can run into the next one.
        f.write(message + "\n")
        # Flush per entry: this cell runs for hours and may be interrupted.
        f.flush()
        print(message)

    log("Started training LunarLander models for different number of steps ")
    for size in [256, 512, 1024]:
        for dropout in [0.4, 0.6, 0.8]:
            num_training_steps = int(1e6)
            # Mean evaluation reward at which training of this model stops.
            reward_target = 300*0.9
            rew = 0
            policy_kwargs = dict(
                features_extractor_class=MaskedMLP,
                features_extractor_kwargs=dict(dropout_param=dropout, size=size, job='train')
            )
            env = make_vec_env(game, n_envs=10, seed=0, vec_env_cls=DummyVecEnv)
            try:
                model = PPO('MlpPolicy', env, verbose=0,
                            policy_kwargs=policy_kwargs, device=dev, n_epochs=50, gamma=0.998)
                # To resume from an existing checkpoint instead of starting fresh:
                #model = PPO.load(os.path.join(PATH_NETWORKS, game, "{}.{}x{}.dropout_{}".format(game, size, size, dropout)))
                #model.set_env(env)
                d = get_device()
                print("device", d)
                log("Training {} {} model for {} steps ".format(dropout, size, num_training_steps))

                # The checkpoint location does not change between retries.
                extractor = model.policy.features_extractor
                model_save_title = "{}.{}x{}.dropout_{}".format(game, extractor.l1_size, extractor.l2_size, dropout)
                checkpoint_path = os.path.join(PATH_NETWORKS, game, model_save_title)

                # NOTE(review): time.process_time() reports CPU time, not wall-clock time.
                t = time.process_time()
                # NOTE(review): there is no cap on the number of retries; a model that
                # never reaches the target keeps this loop (and the whole grid) stuck.
                # Consider bounding it.
                while rew < reward_target:
                    model.learn(num_training_steps)
                    # Save after every round, whether or not the target was reached.
                    model.save(checkpoint_path)
                    log("we saved it nevertheless")
                    rew, _ = evaluate_policy(model, env, n_eval_episodes=10)
                    log("we reached {} reward score".format(rew))
                    if rew < reward_target:
                        # Follow-up rounds are shorter than the initial one.
                        num_training_steps = int(1e5)
                        log("we need additional {} steps to try to reach around 300 cumulative reward score".format(num_training_steps))

                log("it took {} min to train the {}x{} model with {}% dropout".format(round((time.process_time() - t)/60, 2), size, size, dropout*100))
            finally:
                # Release the vectorised environment before the next configuration.
                env.close()