<a href="https://colab.research.google.com/github/neil-tan/CU-Analog/blob/master/GANdam_Ray_PBT_Lightning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DCGAN Lightning GANdam
Convolutional (DCGAN-style) generator and discriminator trained on the GANdam image dataset, with learning rates tuned by Ray Tune Population Based Training

## Setup

### Path
Please ensure Colab_results folder exists at the root of your Google Drive

Or, you may comment out Google Drive and set log_path manually

In [None]:
import os
from datetime import datetime
from tempfile import mkdtemp
from google.colab import drive

# Mount Google Drive; the run directory and the dataset archive are both
# resolved relative to this mount point in the cells below.
gdrive_mount_path, gdrive_save_path = '/content/gdrive', 'My Drive/Colab_results'
drive.mount(gdrive_mount_path)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Log Path
Create the log directory

In [None]:
# Timestamped run directory on Google Drive; logs, checkpoints and the final
# model of this sweep are written below it.
save_log_filename = datetime.now().strftime("%d-%m-%Y_%H_%M_%S-GANdam_BAYSIAN_SWEEP")
log_path = os.path.join(gdrive_mount_path, gdrive_save_path, save_log_filename)

# makedirs(..., exist_ok=True) also creates a missing Colab_results parent and
# keeps the cell re-runnable; os.mkdir raised in both situations.
os.makedirs(log_path, exist_ok=True)

print(log_path + " exists: ", os.path.exists(log_path))

# Scratch directory handed to the model as data_dir; it is removed again in the
# wrap-up section of the notebook.
data_dir = mkdtemp(prefix="GANdam_data_")

/content/gdrive/My Drive/Colab_results/12-08-2020_15_54_15-GANdam_BAYSIAN_SWEEP exists:  True


#### Data Path

In [None]:
import subprocess

gdrive_dataset_path = "My Drive/dataset"
data_zip_path = os.path.join(gdrive_mount_path, gdrive_dataset_path, "GANdam.zip")

# Arguments are passed as a list, so no shell parses the command line and the
# space in "My Drive" (or any other special character) needs no escaping.
# The previous string-built command relied on the invalid escape sequence "\ ".
unzip_result = subprocess.run(["unzip", data_zip_path, "-d", "."])
if unzip_result.returncode != 0:
    # Surface a failed extraction instead of silently continuing.
    print("unzip exited with status", unzip_result.returncode)

dataset_path = os.path.abspath("GANdam")
print(dataset_path + " exists: ", os.path.exists(dataset_path))

/content/GANdam exists:  True


### Installation and Imports

In [None]:
%%capture --no-stderr
  
# Plotting, the Lightning training loop, and the bayesian-optimization package.
!pip install matplotlib pytorch-lightning bayesian-optimization

# Ray Tune latest snapshot: https://docs.ray.io/en/latest/installation.html
# NOTE(review): the wheel below is tagged cp36 (CPython 3.6) and lives under
# "latest", so the installed build may differ between runs -- confirm it still
# matches the runtime's interpreter.
!pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl

# previous used version:
#!pip install -U https://ray-wheels.s3-us-west-2.amazonaws.com/master/0c3b9ebeef167a9eeb6eed9fb18f328ecb5a3c6f/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl

# Interpreter version, for comparison with the cp36 tag of the wheel above.
!python --version

In [None]:
import time
import threading
import shutil
from collections import OrderedDict
from functools import partial
from argparse import Namespace

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder 

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities.cloud_io import load as pl_load


import matplotlib.pyplot as plt

import matplotlib.animation as animation
from IPython.display import HTML

import ray
import ray.tune as tune
from ray.tune import Trainable, run, sample_from, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, MedianStoppingRule, PopulationBasedTraining
from ray.tune.suggest.bayesopt import BayesOptSearch

from tensorboard import notebook

# Generators

In [None]:
class Generator_Conv(nn.Module):
    """Transposed-convolution generator mapping a latent vector to an image.

    The latent vector is treated as a ``latent_dim x 1 x 1`` feature map and
    upsampled by five ``ConvTranspose2d`` layers (all with padding 0) to a
    3-channel 128 x 128 output squashed into [-1, 1] by ``Tanh``.

    Args:
        latent_dim: Size of the latent vector, i.e. the number of input
            channels of the first transposed convolution.
    """

    def __init__(self, latent_dim):
        super(Generator_Conv, self).__init__()
        self.latent_dim = latent_dim
        ngf = 64  # base number of generator feature maps
        nc = 3    # channels of the generated image

        # Spatial size after each layer follows (in - 1) * stride + kernel.
        self.main = nn.Sequential(
            # input: latent_dim x 1 x 1
            nn.ConvTranspose2d(in_channels=self.latent_dim,
                               out_channels=ngf * 4,
                               kernel_size=5,
                               stride=1,
                               padding=0,
                               bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size: (ngf*4) x 5 x 5
            nn.ConvTranspose2d(in_channels=ngf * 4,
                               out_channels=ngf * 4,
                               kernel_size=6,
                               stride=2,
                               padding=0,
                               bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size: (ngf*4) x 14 x 14
            nn.ConvTranspose2d(in_channels=ngf * 4,
                               out_channels=ngf * 2,
                               kernel_size=5,
                               stride=2,
                               padding=0,
                               bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size: (ngf*2) x 31 x 31
            nn.ConvTranspose2d(in_channels=ngf * 2,
                               out_channels=ngf,
                               kernel_size=3,
                               stride=2,
                               padding=0,
                               bias=False),
            nn.BatchNorm2d(ngf),
            nn.ReLU(True),
            # state size: (ngf) x 63 x 63
            nn.ConvTranspose2d(in_channels=ngf,
                               out_channels=nc,
                               kernel_size=4,
                               stride=2,
                               padding=0,
                               bias=False),
            nn.BatchNorm2d(nc),
            nn.Tanh(),
            # output: (nc = 3) x 128 x 128
        )

    def forward(self, input):
        """Generate images from a ``(batch, latent_dim)`` tensor of latent vectors."""
        # Append two singleton spatial axes: (batch, latent_dim, 1, 1).
        input_4d = input.unsqueeze(2).unsqueeze(3)
        return self.main(input_4d)

    def print_summary(self, input_size=None):
        """Print the module structure and a torchsummary layer table.

        Args:
            input_size: Per-sample input shape passed to torchsummary;
                defaults to ``(latent_dim,)``.

        Note:
            The model is moved to the GPU when one is available, otherwise it
            stays on / is moved to the CPU. The previous unconditional
            ``self.cuda()`` raised on CPU-only runtimes.
        """
        from torchsummary import summary
        print(self)

        if input_size is None:
            input_size = (self.latent_dim,)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        summary(self.to(device), input_size=input_size, device=device)

# Registry of generator classes keyed by the name used in the hyperparameters.
GeneratorFactory = dict(Generator_Conv=Generator_Conv)

In [None]:
# Smoke test: build the generator with a 512-dimensional latent space and
# print its structure and layer summary.
test_model = Generator_Conv(latent_dim=512)
test_model.print_summary()

Generator_Conv(
  (main): Sequential(
    (0): ConvTranspose2d(512, 256, kernel_size=(5, 5), stride=(1, 1), bias=False)
    (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): ConvTranspose2d(256, 256, kernel_size=(6, 6), stride=(2, 2), bias=False)
    (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): ConvTranspose2d(256, 128, kernel_size=(5, 5), stride=(2, 2), bias=False)
    (7): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): ConvTranspose2d(128, 64, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (10): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU(inplace=True)
    (12): ConvTranspose2d(64, 3, kernel_size=(4, 4), stride=(2, 2), bias=False)
    (13): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats

# Discriminators

In [None]:
class Discriminator_Conv(nn.Module):
    """Convolutional discriminator scoring 3-channel 128 x 128 images.

    Five convolutions reduce the image to a single 6 x 6 map, which is
    flattened (36 values) and mapped by a linear layer plus ``Sigmoid`` to one
    probability per image. The ``Linear(36, 1)`` head fixes the expected input
    resolution at 128 x 128.
    """

    def __init__(self):
        super(Discriminator_Conv, self).__init__()
        ndf = 128  # base number of discriminator feature maps
        nc = 3     # channels of the input image

        # Spatial size after each layer follows floor((in + 2*pad - kernel) / stride) + 1.
        self.main = nn.Sequential(
            # input: (nc = 3) x 128 x 128
            nn.Conv2d(nc, ndf, 4, stride=2, padding=2, bias=False),
            nn.LeakyReLU(0.2, inplace=True),

            # state size: (ndf) x 65 x 65
            nn.Conv2d(ndf, ndf * 2, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),

            # state size: (ndf*2) x 32 x 32
            nn.Conv2d(ndf * 2, ndf * 4, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),

            # state size: (ndf*4) x 16 x 16
            nn.Conv2d(ndf * 4, ndf * 8, 4, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),

            # state size: (ndf*8) x 8 x 8
            nn.Conv2d(ndf * 8, 1, 3, stride=1, padding=0, bias=False),
            nn.LeakyReLU(0.2, inplace=True),

            # state size: 1 x 6 x 6 -> 36 features
            nn.Flatten(),
            nn.Linear(36, 1),

            nn.Sigmoid()
        )

    def forward(self, input):
        """Return a ``(batch, 1)`` tensor of real/fake probabilities."""
        return self.main(input)

    def print_summary(self, input_size=(3, 128, 128)):
        """Print the module structure and a torchsummary layer table.

        Args:
            input_size: Per-sample input shape passed to torchsummary.

        Note:
            The model is moved to the GPU when one is available, otherwise it
            stays on / is moved to the CPU. The previous unconditional
            ``self.cuda()`` raised on CPU-only runtimes.
        """
        from torchsummary import summary
        print(self)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        summary(self.to(device), input_size=input_size, device=device)

# https://colab.research.google.com/drive/1F_RNcHzTfFuQf-LeKvSlud6x7jXYkG31#scrollTo=pcPCt8JG7tI-&line=1&uniqifier=1
# Registry of discriminator classes keyed by the name used in the hyperparameters.
DiscriminatorFactory = dict(Discriminator_Conv=Discriminator_Conv)

In [None]:
# Smoke test: build the discriminator and print its structure and layer
# summary for a 3 x 128 x 128 input (the method's default input size).
test_model = Discriminator_Conv()
test_model.print_summary(input_size=(3, 128, 128))

Discriminator_Conv(
  (main): Sequential(
    (0): Conv2d(3, 128, kernel_size=(4, 4), stride=(2, 2), padding=(2, 2), bias=False)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv2d(256, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv2d(512, 1024, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (9): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.2, inplace=True)
    (11): Conv2d(1024, 1, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (12): LeakyReLU(negative_slope=0.2, inplac

## Image Processing

In [None]:
import torchvision.transforms.functional as TF

class UnitAspectRatioPaddingTransform:
    """Pad an image to a square by replicating its edge pixels.

    The shorter axis is extended on both sides; when the size difference is
    odd, the extra pixel goes to the top (wide image) or the left (tall image).
    The input is expected to expose ``width`` and ``height`` attributes.
    """

    def __call__(self, x):
        excess_width = x.width - x.height

        # Already square: hand the image back untouched.
        if excess_width == 0:
            return x

        # Split the missing pixels over two opposite sides.
        small_half, odd_pixel = divmod(abs(excess_width), 2)
        big_half = small_half + odd_pixel

        if excess_width > 0:
            # wider than tall: pad top and bottom
            borders = (0, big_half, 0, small_half)
        else:
            # taller than wide: pad left and right
            borders = (big_half, 0, small_half, 0)

        # padding is ordered (left, top, right, bottom)
        return TF.pad(x, padding=borders, padding_mode='edge')

# Lightning Module

In [None]:
# Name of the metric reported to Ray Tune / TensorBoard, and the direction in
# which the schedulers and the best-trial lookup optimise it.
metric_name, metric_mode = "exp_avg_distance", "min"

In [None]:
class GANdam(pl.LightningModule):
    """LightningModule training a generator/discriminator pair adversarially.

    Args:
        hparams: ``Namespace`` or dict providing ``generator`` and
            ``discriminator`` (keys of the module-level factories),
            ``latent_dim``, ``lr_g``, ``lr_d``, ``b1`` and ``b2``.
        data_dir: Stored on the instance; defaults to the current working
            directory. The training images themselves are read from the
            module-level ``dataset_path``.

    Attributes set during training and read by the Ray Tune integration:
        ``current_g_loss``, ``current_d_loss``, ``d_eq_distance`` and
        ``exp_avg_distance`` (all detached tensors), plus ``gen_img_list``
        with one image grid per epoch.
    """

    def __init__(self, hparams, data_dir=None):
        super(GANdam, self).__init__()

        self.hparams = hparams if isinstance(hparams, Namespace) else Namespace(**hparams)
        self.data_dir = data_dir or os.getcwd()

        # networks, looked up by name in the module-level factories
        self.generator = GeneratorFactory[self.hparams.generator](latent_dim=self.hparams.latent_dim)
        self.discriminator = DiscriminatorFactory[self.hparams.discriminator]()

        # cache for generated images
        self.generated_imgs = None
        self.last_imgs = None

        # one image grid per finished epoch, used for the animation cell
        self.gen_img_list = list()

        # running value of the exponential average metric
        self.exp_avg_metric_val = None

    def forward(self, z):
        """Generate images from latent vectors ``z``."""
        return self.generator(z)

    def adversarial_loss(self, y_hat, y):
        """Binary cross-entropy between discriminator outputs and labels."""
        return F.binary_cross_entropy(y_hat, y)

    def configure_optimizers(self):
        """Return one Adam optimizer per network: index 0 generator, 1 discriminator."""
        b1 = self.hparams.b1
        b2 = self.hparams.b2

        opt_g = torch.optim.Adam(self.generator.parameters(), lr=self.hparams.lr_g, betas=(b1, b2))
        opt_d = torch.optim.Adam(self.discriminator.parameters(), lr=self.hparams.lr_d, betas=(b1, b2))

        return [opt_g, opt_d], []

    def training_step(self, batch, batch_nb, optimizer_idx):
        """Run one generator (``optimizer_idx == 0``) or discriminator (``1``) step.

        Returns:
            OrderedDict with ``loss``, ``progress_bar`` and ``log`` entries.

        Raises:
            ValueError: If ``optimizer_idx`` is neither 0 nor 1.
        """
        imgs, _ = batch
        self.last_imgs = imgs
        batch_size = imgs.shape[0]

        latent_samplings = torch.randn(batch_size, self.hparams.latent_dim, device=self.device)
        real_labels = torch.ones(batch_size, 1, device=self.device)

        if optimizer_idx == 0:
            # train generator: it wins when the discriminator labels fakes as real
            g_loss = self.adversarial_loss(self.discriminator(self.generator(latent_samplings)), real_labels)

            tqdm_dict = {'g_loss': g_loss}
            output = OrderedDict({
                'loss': g_loss,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            })
            # Keep only the value for reporting, not the autograd graph.
            self.current_g_loss = g_loss.detach()

        elif optimizer_idx == 1:
            # train discriminator
            d_loss_real = self.adversarial_loss(self.discriminator(imgs), real_labels)

            # Fakes are only needed here and must not backpropagate into the
            # generator, so they are produced without building a graph. This
            # forward pass used to run (and be thrown away) on generator steps too.
            with torch.no_grad():
                self.img_generated = self.generator(latent_samplings)

            fake_labels = torch.zeros(batch_size, 1, device=self.device)
            d_loss_fake = self.adversarial_loss(self.discriminator(self.img_generated), fake_labels)

            d_loss = (d_loss_real + d_loss_fake) / 2

            # measures the min/max equilibrium
            # NOTE(review): a discriminator that outputs 0.5 everywhere has a
            # BCE of ln(2) ~= 0.693, not 0.5 -- confirm 0.5 is the intended target.
            self.d_eq_distance = torch.abs(d_loss.detach() - 0.5)
            self.exp_avg_distance = self.exp_avg_metric(self.d_eq_distance)

            # d_eq_distance_metric is a duplication of d_eq_distance
            # It is required for a workaround relating to HParams metrics in TensorBoard
            # https://github.com/PyTorchLightning/pytorch-lightning/issues/1228#issuecomment-659175500
            tqdm_dict = {'d_loss': d_loss, 'd_eq_distance': self.d_eq_distance, metric_name: self.exp_avg_distance}
            output = OrderedDict({
                'loss': d_loss,
                'progress_bar': tqdm_dict,
                'log': tqdm_dict
            })
            self.current_d_loss = d_loss.detach()

        else:
            raise ValueError("unexpected optimizer_idx: {}".format(optimizer_idx))

        return output

    def exp_avg_metric(self, val, alpha = 0.1):
        """Update and return the exponential moving average of ``val``.

        Args:
            val: New observation (tensor or number).
            alpha: Weight of the new observation.

        The first observation initialises the average. Tensors are detached
        first: folding graph-attached losses into the running value chained
        every training step's autograd history together, so memory grew with
        the number of steps.
        """
        if torch.is_tensor(val):
            val = val.detach()

        if self.exp_avg_metric_val is None:
            self.exp_avg_metric_val = val

        self.exp_avg_metric_val = alpha * val + (1 - alpha) * self.exp_avg_metric_val

        return self.exp_avg_metric_val

    def train_dataloader(self):
        """Build the training loader over the image folder at ``dataset_path``.

        Images are padded to a square, resized to 128 and normalised with
        mean 0.5 / std 0.5.
        """
        transform = transforms.Compose([UnitAspectRatioPaddingTransform(),
                                transforms.Resize(128, interpolation=2),
                                transforms.ToTensor(),
                                transforms.Normalize([0.5], [0.5])])

        dataset = ImageFolder(dataset_path, transform=transform)

        # NOTE(review): the batch size is fixed at 32 (hparams.batch_size is not
        # consulted) and shuffling is off -- confirm both are intentional.
        return DataLoader(dataset, batch_size=32, shuffle=False)

    def on_epoch_end(self):
        """Generate a 4x4 sample grid, cache it and log it to the experiment logger."""
        # Sampling is for visualisation only, so no autograd graph is needed.
        with torch.no_grad():
            z = torch.randn(16, self.hparams.latent_dim, device=self.device)
            sample_imgs = self(z)
        if len(sample_imgs.shape) < 4:
            sample_imgs = sample_imgs.unsqueeze(1) #adding the color channel

        grid = torchvision.utils.make_grid(sample_imgs, nrow=4, padding=5, normalize=True)
        self.gen_img_list.append(grid.detach().cpu())

        self.logger.experiment.add_image('generated_images', grid, self.current_epoch)

    # A workaround for TB, see metric_name above
    def on_fit_start(self):
        """Log the hyperparameters together with a placeholder value for the tuning metric."""
        metric_placeholder = {metric_name: 50}
        self.logger.log_hyperparams(self.hparams, metrics=metric_placeholder)

# Lightning Trial Setup

In [None]:
# File name of the Lightning checkpoint written inside (and read back from)
# each checkpoint directory.
checkpoint_fname = "checkpoint"

In [None]:
class TuneCallback(Callback):
  """Lightning callback that checkpoints and reports to Ray Tune after every epoch."""

  def on_epoch_end(self, trainer, pl_module):
    # Checkpointing pattern follows the Ray Tune Lightning example:
    # https://github.com/ray-project/ray/blob/9b1772253f47d20d8aa1bc727d67630b2026b9e5/python/ray/tune/examples/mnist_pytorch_lightning.py#L158
    with tune.checkpoint_dir(trainer.global_step) as checkpoint_root:
      target_file = os.path.join(checkpoint_root, checkpoint_fname)
      trainer.save_checkpoint(target_file)

    # The report is sent only after the checkpoint has been written.
    tune.report(**{
        metric_name : pl_module.exp_avg_distance.item(),
        'loss_d' : pl_module.current_d_loss.item(),
        'loss_g' : pl_module.current_g_loss.item(),
        'current_epoch' : pl_module.current_epoch,
    })


In [None]:
class MyTrainableClass(Trainable):
    """Ray Tune class-API trainable wrapping a GANdam model.

    Each ``step`` runs one ``Trainer.fit`` call (``max_epochs=1``) and returns
    the latest tuning metric and losses. Checkpoints are written and restored
    through the Lightning trainer so that schedulers such as PBT can clone
    trials.

    Expected ``config`` keys: everything ``GANdam`` reads from its
    hyperparameters, plus ``num_gpus`` and ``data_dir``.
    """
    # https://docs.ray.io/en/latest/tune/api_docs/trainable.html?highlight=get_trial_dir#ray.tune.Trainable.trial_name

    # checkpoint files aren't removed

    def setup(self, config):
        """Create the trainer and the model for this trial."""
        self.num_gpus = config["num_gpus"]

        self.trainer = self._build_trainer()

        self.model = GANdam(Namespace(**config), data_dir=config["data_dir"])

        self.timestep = 0

    def _build_trainer(self, resume_from_checkpoint=None):
        """Build the single-epoch Lightning trainer, optionally resuming from a checkpoint file."""
        return pl.Trainer(
            resume_from_checkpoint=resume_from_checkpoint,
            gpus=self.num_gpus,
            progress_bar_refresh_rate=100,
            logger=TensorBoardLogger(
                save_dir=self.logdir,
                name="",
                version="."),
            max_epochs=1,  # single step
            checkpoint_callback=False,  # checkpoint saved by ray
        )

    def step(self):
        """Train for one fit call and return the metrics reported to Tune."""
        # NOTE(review): the captured run output shows current_epoch staying at 0
        # across iterations -- confirm how repeated fit() calls advance the
        # epoch counter in the Lightning version in use.
        self.trainer.fit(self.model)
        self.timestep = self.trainer.global_step

        result = {
            metric_name : self.model.exp_avg_distance.item(),
            'loss_d' : self.model.current_d_loss.item(),
            'loss_g' : self.model.current_g_loss.item(),
            'current_epoch' : self.model.current_epoch,
        }

        return result

    def save_checkpoint(self, checkpoint_dir):
        """Write a Lightning checkpoint into ``checkpoint_dir`` and return that directory.

        The directory is returned so that Tune has an explicit path to hand
        back to ``load_checkpoint``; previously nothing was returned.
        """
        self.trainer.save_checkpoint(os.path.join(checkpoint_dir, checkpoint_fname))
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_path):
        """Restore model weights and rebuild the trainer from a saved checkpoint.

        Args:
            checkpoint_path: Directory containing the ``checkpoint_fname`` file.
        """
        chkp_file_path = os.path.join(checkpoint_path, checkpoint_fname)

        # load_from_checkpoint is a classmethod that returns a *new* module; its
        # result used to be discarded, leaving self.model untouched. Load the
        # stored weights into the existing instance instead (kept on CPU while
        # deserialising; the trainer moves the model to its device).
        checkpoint = pl_load(chkp_file_path, map_location=lambda storage, loc: storage)
        self.model.load_state_dict(checkpoint["state_dict"])

        self.trainer = self._build_trainer(resume_from_checkpoint=chkp_file_path)

        self.timestep = self.trainer.global_step

In [None]:
def train_gandam(config, checkpoint_dir=None, data_dir=None, num_epochs=10, num_gpus=1):
  """Train a GANdam model; usable as a Ray Tune function-API trainable.

  Args:
    config: Hyperparameter dict forwarded to ``GANdam``.
    checkpoint_dir: Optional directory holding a ``checkpoint_fname`` file to
      resume from.
    data_dir: Forwarded to ``GANdam``.
    num_epochs: ``max_epochs`` of the Lightning trainer.
    num_gpus: Number of GPUs handed to the Lightning trainer.

  Returns:
    ``{"model": gan_model}`` with the trained module.
  """
  gan_model = GANdam(Namespace(**config), data_dir=data_dir)

  chkp_file_path = None
  if checkpoint_dir:
    chkp_file_path = os.path.join(checkpoint_dir, checkpoint_fname)
    # load_from_checkpoint is a classmethod returning a new module, so calling
    # it on the instance and dropping the result restored nothing. Load the
    # stored weights into gan_model directly.
    checkpoint = pl_load(chkp_file_path, map_location=lambda storage, loc: storage)
    gan_model.load_state_dict(checkpoint["state_dict"])

  # A plain function has no `self`; ask the Tune session for the trial name and
  # fall back to a fixed folder when called outside of a Tune run.
  trial_name = tune.get_trial_name() or "train_gandam"

  trainer = pl.Trainer(
                  resume_from_checkpoint=chkp_file_path,
                  gpus=num_gpus,
                  progress_bar_refresh_rate=100,
                  logger=TensorBoardLogger(
                      save_dir=os.path.join(log_path, trial_name),
                      name="",
                      version="."),
                  max_epochs=num_epochs,
                  checkpoint_callback=False)

  trainer.fit(gan_model)

  return {"model" : gan_model}

# Ray Tune

In [None]:
# Progress table rendered in the notebook output; reference:
# https://docs.ray.io/en/master/_modules/ray/tune/progress_reporter.html#JupyterNotebookReporter
reporter = JupyterNotebookReporter(
    overwrite=True,
    metric_columns=[metric_name] + ["loss_d", "loss_g", "current_epoch", "training_iteration"],
)

In [None]:
def tune_gandam_pbt(num_samples=3, num_epochs=5, cpus_per_trial=1, gpus_per_trial=1):
  """Run a Population Based Training sweep over the two learning rates.

  Args:
    num_samples: Number of trials in the population.
    num_epochs: Number of training iterations after which a trial is stopped.
    cpus_per_trial: CPUs reserved by Ray for each trial.
    gpus_per_trial: GPUs reserved by Ray for each trial.

  Returns:
    The object returned by ``tune.run`` (queried later for the best trial).
  """

  # result properties: https://github.com/ray-project/ray/blob/master/python/ray/tune/result.py
  def trial_stopper(trial_id, result):
    # `>=` stops after exactly num_epochs iterations; the previous `>` let every
    # trial run one iteration too many (31 iterations for num_epochs=30).
    return result["training_iteration"] >= num_epochs

  def rand_log_linear_lr(upper=0.1, lower=0.00001):
    """Draw a learning rate uniformly in log10 space between lower and upper."""
    return 10**(np.random.uniform(low=np.log10(lower), high=np.log10(upper)))

  scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric=metric_name,
    mode=metric_mode,
    perturbation_interval=6,
    hyperparam_mutations={
        "lr_d": lambda: rand_log_linear_lr(upper=0.1, lower=0.00001),
        "lr_g": lambda: rand_log_linear_lr(upper=0.1, lower=0.00001),
  })

  config = {
      "generator": "Generator_Conv",
      "discriminator": "Discriminator_Conv",
      # NOTE(review): GANdam.train_dataloader uses a fixed batch size of 32 and
      # does not read this value.
      'batch_size': 512,
      'latent_dim': 512,
      # initial learning rates are drawn from a narrower range than the mutations
      "lr_d": tune.sample_from(lambda spec: rand_log_linear_lr(upper=0.01, lower=0.0001)),
      "lr_g": tune.sample_from(lambda spec: rand_log_linear_lr(upper=0.01, lower=0.0001)),
      'b1': 0.5,
      'b2': 0.999,
      # Lightning needs an integer GPU count that matches what Ray reserves for
      # the trial; it used to be hard-coded to 1 regardless of gpus_per_trial.
      "num_gpus": int(np.ceil(gpus_per_trial)),
      "data_dir": data_dir
  }

  result = tune.run(
                 MyTrainableClass,
                 resources_per_trial={"cpu": cpus_per_trial, "gpu": gpus_per_trial},
                 stop=trial_stopper,
                 config=config,
                 num_samples=num_samples,
                 scheduler=scheduler,
                 queue_trials=True,
                 progress_reporter=reporter,
                 local_dir=log_path,
                 checkpoint_at_end=True,
                 verbose=2,
                 name="tune_pbt_gandam")

  return result

# Start TensorBoard & HParam Sweeping

In [None]:
# Start tensorboard.
%reload_ext tensorboard

#watch -n 1 timeout -sHUP 1m tensorboard --logdir .
def tfb_restarter():
    tfb_shell_str = "while true; do timeout -sHUP 1m tensorboard --logdir " + log_path.replace(" ", "\ ") + " --port=6006; done"
    os.popen(tfb_shell_str).wait()

thread1 = threading.Thread(target = tfb_restarter)
thread1.start()

time.sleep(3)

# https://github.com/tensorflow/tensorboard/blob/master/tensorboard/notebook.py
notebook.display(port=6006, height=1000) 

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-40-6f797c9fb07e>", line 7, in tfb_restarter
    os.popen(tfb_shell_str).wait()
  File "/usr/lib/python3.6/os.py", line 1008, in __getattr__
    return getattr(self._stream, name)
AttributeError: '_io.TextIOWrapper' object has no attribute 'wait'



Selecting TensorBoard with logdir /content/gdrive/My Drive/Colab_results/12-08-2020_11_18_17-GANdam_BAYSIAN_SWEEP (started 0:52:37 ago; port 6006, pid 97046).


<IPython.core.display.Javascript object>

In [None]:
%%capture --no-display --no-stderr
# Launch the PBT sweep: 4 trials, 2 CPUs and 1 GPU each, num_epochs=30.
analysis = tune_gandam_pbt(num_samples=4, num_epochs=30, cpus_per_trial=2, gpus_per_trial=1)

Trial name,status,loc,lr_d,lr_g,exp_avg_distance,loss_d,loss_g,current_epoch,training_iteration
MyTrainableClass_1fe01_00000,ERROR,,0.00319706,0.00789307,0.342204,0.107164,6.72367,0,31
MyTrainableClass_1fe01_00001,RUNNING,172.28.0.2:381791,0.000158201,0.000732072,0.284638,0.263121,4.46904,0,15
MyTrainableClass_1fe01_00002,RUNNING,,0.000158201,0.000328182,0.290999,0.138008,4.01695,0,18
MyTrainableClass_1fe01_00003,PAUSED,,0.000131834,0.00091509,0.145481,0.307735,1.32954,0,12

Trial name,# failures,error file
MyTrainableClass_1fe01_00000,1,"/content/gdrive/My Drive/Colab_results/12-08-2020_15_54_15-GANdam_BAYSIAN_SWEEP/tune_pbt_gandam/MyTrainableClass_0_lr_d=0.0031971,lr_g=0.0078931_2020-08-12_15-54-37kqs8xg0j/error.txt"


(pid=381791) 
(pid=381791)   | Name          | Type               | Params
(pid=381791) -----------------------------------------------------
(pid=381791) 0 | generator     | Generator_Conv     | 6 M   
(pid=381791) 1 | discriminator | Discriminator_Conv | 11 M  
2020-08-13 02:20:43,088	INFO (unknown file):0 -- gc.collect() freed 66 refs in 0.13033863599412143 seconds


KeyboardInterrupt: ignored

# Training - Best HParams

In [None]:
# Every "best ..." lookup below uses the same metric, direction and scope.
best_selection = dict(metric=metric_name, mode=metric_mode, scope='last')

print("best trial name: " + str(analysis.get_best_trial(**best_selection)))

best_hyperparameters = analysis.get_best_config(**best_selection)
print("best hyperparameters: " + str(best_hyperparameters))

gan_model = GANdam(Namespace(**best_hyperparameters), data_dir=data_dir)

# Restore Best Result
best_checkpoint_path = os.path.abspath(analysis.get_best_logdir(**best_selection))
# NOTE(review): assumes the best trial's log dir contains a "checkpoint"
# sub-directory -- confirm against the directory layout Ray Tune writes.
chkp_file_path = os.path.join(best_checkpoint_path, "checkpoint", checkpoint_fname)

# load_from_checkpoint is a classmethod returning a new module; calling it on
# the instance and dropping the result restored nothing. Load the stored
# weights into gan_model directly.
best_checkpoint = pl_load(chkp_file_path, map_location=lambda storage, loc: storage)
gan_model.load_state_dict(best_checkpoint["state_dict"])

# max_epochs=0: fit() only restores the trainer state from the checkpoint.
trainer = pl.Trainer(
                resume_from_checkpoint=chkp_file_path,
                gpus=1,
                default_root_dir=os.path.join(log_path, "lightning_log_final"),
                checkpoint_callback=False,
                max_epochs=0)
trainer.fit(gan_model)

NameError: ignored

In [None]:
# Draw 64 latent vectors and render the generated images as one grid.
# Sampling is for display only, so no autograd graph is built.
with torch.no_grad():
    z = torch.randn(8 * 8, gan_model.hparams.latent_dim, device=gan_model.device)
    sample_imgs = gan_model(z)
if len(sample_imgs.shape) < 4:
    sample_imgs = sample_imgs.unsqueeze(1) #adding the color channel
# Move to the CPU before plotting: NumPy/matplotlib cannot read a tensor that
# lives on the GPU, which is where the model may be after training.
grid = torchvision.utils.make_grid(sample_imgs.detach().cpu(), padding=5, normalize=True)
fig = plt.figure()
# channels-first (C, H, W) -> channels-last (H, W, C) for imshow
plt.imshow(grid.permute(1, 2, 0).numpy(), animated=True)

### Animation

In [None]:

# One animation frame per epoch grid cached by the model during training.
fig = plt.figure()
ims = []
for epoch_grid in gan_model.gen_img_list:
    # channels-first (C, H, W) -> channels-last (H, W, C) for imshow
    frame = plt.imshow(np.transpose(epoch_grid, (1, 2, 0)), animated=True)
    ims.append([frame])
ani = animation.ArtistAnimation(fig, ims, interval=1000, repeat_delay=1000, blit=True)
HTML(ani.to_jshtml())

# Wrapping Up

In [None]:
# https://pytorch-lightning.readthedocs.io/en/latest/weights_loading.html
trainer.save_checkpoint(os.path.join(log_path, "model.ckpt"))

import pickle
pickle.dump( best_hyperparameters, open(os.path.join(log_path, "hparams.pkl"), "wb" ))

In [None]:
# Remove the scratch directory created during setup. The guard keeps the cell
# re-runnable: rmtree raises once the directory is already gone.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)

In [None]:
def ls_dirs(path):
  """Return the sub-directories of ``path``, each joined onto ``path``."""
  candidates = (os.path.join(path, entry) for entry in os.listdir(path))
  return list(filter(os.path.isdir, candidates))

# best_checkpoint_path is the best trial's log directory, so the checkpoint to
# keep is its "checkpoint" sub-directory. Comparing the sub-directory against
# the log directory itself never matched, which deleted the best checkpoint too.
best_chkp_dir = os.path.join(best_checkpoint_path, "checkpoint")

for exp_dir in ls_dirs(log_path):
  # ls_dirs already returns paths joined onto log_path.
  for trail_dir in ls_dirs(exp_dir):
    redundant_chkp_path = os.path.join(trail_dir, "checkpoint")
    if os.path.abspath(redundant_chkp_path) == best_chkp_dir:
      continue
    if os.path.isdir(redundant_chkp_path):
      print("removing " + redundant_chkp_path)
      shutil.rmtree(redundant_chkp_path)

# Useful Links

[Lightning Callbacks](https://github.com/PyTorchLightning/pytorch-lightning/blob/0.8.5/pytorch_lightning/callbacks/base.py)

[Lightning Trainer](https://github.com/PyTorchLightning/pytorch-lightning/blob/e1bc208f66891e22f0139619a1be5c06235a0f34/pytorch_lightning/trainer/trainer.py)

[Lightning Module](https://github.com/PyTorchLightning/pytorch-lightning/blob/1369012bc71f257dcf7423ec65146d055ddc1cc7/pytorch_lightning/core/lightning.py#L1574)

[Lightning Logger](https://github.com/PyTorchLightning/pytorch-lightning/blob/62ce00f96c09de6d137c810921a6cd9e7b60aff5/pytorch_lightning/trainer/logging.py)

[Lightning HParams Logging Issue](https://github.com/PyTorchLightning/pytorch-lightning/issues/1228)

[Ray Lightning Mnist](https://github.com/krfricke/ray/blob/5921475d9fa7b2bcb62d2413636f8e656a00f688/python/ray/tune/examples/mnist_pytorch_lightning.py)

[Ray Session](https://github.com/ray-project/ray/blob/d35f0e40d07bab06b41ce5493c2f50b6725a1857/python/ray/tune/session.py)

[Ray Result Dict](https://github.com/ray-project/ray/blob/master/python/ray/tune/result.py)

[Ray Function API](https://docs.ray.io/en/master/tune/api_docs/trainable.html#function-api)

[Ray PBT](https://docs.ray.io/en/master/tune/tutorials/tune-advanced-tutorial.html#replaying-a-pbt-run)

[Ray Search Space](https://github.com/ray-project/ray/blob/4bc1d7c043cca03a743fbeb98b0434fecafa7001/doc/source/tune/api_docs/grid_random.rst)

[Ray Memory](https://docs.ray.io/en/master/memory-management.html)

[Ray PBT Example](https://github.com/ray-project/ray/blob/master/python/ray/tune/examples/pbt_dcgan_mnist/pbt_dcgan_mnist.py)