## Train GAN on CelebA dataset

In [1]:
import os
import sys

from classes.GAN import GAN
from utils.callbacks import WandbImagesVAE, SaveGeneratorWeights, SaveVAEWeights, WandbVAECallback, WandbImagesGAN, \
    SaveGANWeights
import tensorflow as tf
from tensorflow import keras
import numpy as np
import wandb
from wandb.keras import WandbCallback
from imutils import paths
import matplotlib.pyplot as plt

2021-12-14 13:12:59.634741: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 227 MB memory:  -> device: 0, name: NVIDIA A100-SXM-80GB, pci bus id: 0000:07:00.0, compute capability: 8.0
2021-12-14 13:12:59.637462: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 78356 MB memory:  -> device: 1, name: NVIDIA A100-SXM-80GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2021-12-14 13:12:59.639100: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 78356 MB memory:  -> device: 2, name: NVIDIA A100-SXM-80GB, pci bus id: 0000:47:00.0, compute capability: 8.0
2021-12-14 13:12:59.640910: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 78356 MB memory:  -> device: 3, name: NVIDIA A100-SXM-80GB, pci bus id: 

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"]="1"

### Model definition
And initial configuration

In [3]:
wandb.login()


encoder_architecture=[(0,32),(0,64),(1,128),(1,128),(1,256)]
decoder_architecture=[(0,256),(0,128),(1,128),(1,64),(1,32)]

g=GAN((128,128,3),
      latent_dim=512,
      encoder_architecture=encoder_architecture,
      decoder_architecture=decoder_architecture)


config={"dataset":"celebA", "type":"GAN","encoder_architecture":encoder_architecture,"decoder_architecture":decoder_architecture}
config.update(g.get_dict())


images_dir=r"/home/matteo/NeuroGEN/Dataset/Img/img_align_celeba"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmatteoferrante[0m (use `wandb login --relogin` to force relogin)


## Generator

In [4]:
g.generator.summary()

Model: "decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
latent_input (InputLayer)    [(None, 512)]             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              525312    
_________________________________________________________________
reshape (Reshape)            (None, 4, 4, 64)          0         
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 4, 4, 64)          0         
_________________________________________________________________
conv_transpose_res_block (Co (None, 8, 8, 256)         147712    
_________________________________________________________________
conv_transpose_res_block_1 ( (None, 16, 16, 128)       295040    
_________________________________________________________________
conv_transpose_res_block_2 ( (None, 32, 32, 128)       4602

## Discriminator

In [5]:
g.discriminator.summary()

Model: "discriminator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_input (InputLayer)     [(None, 128, 128, 3)]     0         
_________________________________________________________________
conv_res_block (ConvResBlock (None, 64, 64, 32)        896       
_________________________________________________________________
conv_res_block_1 (ConvResBlo (None, 32, 32, 64)        18496     
_________________________________________________________________
conv_res_block_2 (ConvResBlo (None, 16, 16, 128)       386560    
_________________________________________________________________
conv_res_block_3 (ConvResBlo (None, 8, 8, 128)         460288    
_________________________________________________________________
conv_res_block_4 (ConvResBlo (None, 4, 4, 256)         1543168   
_________________________________________________________________
flatten (Flatten)            (None, 4096)            

In [6]:
#other important definitions

EPOCHS=250
BS=512
INIT_LR=5e-5

config["epochs"]=EPOCHS
config["BS"]=BS
config["init_lr"]=INIT_LR

config["nota"]="DGX"

wandb.init(project="TorVergataExperiment-Generative",config=config)

  warn("The `IPython.html` package has been deprecated since IPython 4.0. "


## Dataloaders

In [7]:
def load_images(imagePath):
    # read the image from disk, decode it, resize it, and scale the
    # pixels intensities to the range [0, 1]
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, (128, 128)) / 255.0

    # eventually load other information like attributes here

    # return the image and the extra info

    return image


print("[INFO] loading image paths...")
imagePaths = list(paths.list_images(images_dir))


train_len=int(0.8*len(imagePaths))
val_len=int(0.1*len(imagePaths))
test_len=int(0.1*len(imagePaths))

train_imgs=imagePaths[:train_len]                                #      80% for training
val_imgs=imagePaths[train_len:train_len+val_len]                 #      10% for validation
test_imgs=imagePaths[train_len+val_len:]                         #      10% for testing

print(f"[TRAINING]\t {len(train_imgs)}\n[VALIDATION]\t {len(val_imgs)}\n[TEST]\t\t {len(test_imgs)}")


train_dataset = tf.data.Dataset.from_tensor_slices(train_imgs)
train_dataset = (train_dataset
    .shuffle(1024)
    .map(load_images)
    .cache()
    .repeat()
    .batch(BS)
)

ts=len(train_imgs)//BS

##VALIDATION

val_dataset = tf.data.Dataset.from_tensor_slices(val_imgs)
val_dataset = (val_dataset
    .shuffle(1024)
    .map(load_images)
    .cache()
    .repeat()
    .batch(BS)
)

vs=len(val_imgs)//BS

## TEST

test_dataset = tf.data.Dataset.from_tensor_slices(test_imgs)
test_dataset = (test_dataset
    .shuffle(1024)
    .map(load_images)
    .cache()
    .batch(BS)
)


[INFO] loading image paths...
[TRAINING]	 138545
[VALIDATION]	 17318
[TEST]		 17319


## Compile
And set callbacks

In [8]:
os.makedirs("models/gan",exist_ok=True)
model_check=SaveGANWeights(filepath="models/gan")

g.compile()


try:                                  #workaround to use Wandbcallback at first attempt
    wb=WandbCallback()
except:
    wb=WandbCallback()
    
    
callbacks=[
    WandbImagesGAN(target_shape=(128,128,3)),
    wb,
    model_check,
]


## Train

In [9]:
g.fit(train_dataset,validation_data=test_dataset,steps_per_epoch=ts,validation_steps=vs,epochs=EPOCHS,callbacks=callbacks)


2021-12-14 13:13:33.558046: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')
Epoch 1/250


2021-12-14 13:13:34.465700: E tensorflow/stream_executor/cuda/cuda_blas.cc:226] failed to create cublas handle: CUBLAS_STATUS_NOT_INITIALIZED
2021-12-14 13:13:34.465791: W tensorflow/core/framework/op_kernel.cc:1692] OP_REQUIRES failed at matmul_op_impl.h:630 : Internal: Attempting to perform BLAS operation using StreamExecutor without BLAS support


InternalError: Attempting to perform BLAS operation using StreamExecutor without BLAS support [Op:MatMul]