In [1]:
import glob
import itertools
import json
import os
import sys

import tqdm
from tensorflow.keras.models import load_model

from classes.VQVAE import VQVAE2
from classes.PixelCNN2 import ConditionalPixelCNN2
from utils.callbacks import WandbImagesVQVAE, Save_VQVAE_Weights, Save_PixelCNN_Weights, Save_VQVAE2_Weights, \
    WandbImagesVQVAE2
from utils.functions import map_vqvae2_weights

import tensorflow as tf
from tensorflow import keras
import numpy as np
import wandb
from wandb.keras import WandbCallback
from tensorflow.data import AUTOTUNE


from os.path import join as opj
from imutils import paths
import tqdm
import glob
import json

In [2]:
#set the first GPU
os.environ["CUDA_VISIBLE_DEVICES"]="0"

print(os.environ.get("CUDA_VISIBLE_DEVICES"))

0


In [3]:
wandb.login()

phase="VQ-VAE2_Training"
#phase="PixelCNN_Training"

config={"dataset":"celebA", "type":"VQ-VAE2","phase":phase}

images_dir=r"C:\Users\matte\Dataset\tor_vergata\Dataset\Img\img_align_celeba" #local
images_dir=r"/home/matteo/NeuroGEN/Dataset/Img/img_align_celeba"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmatteoferrante[0m (use `wandb login --relogin` to force relogin)


In [4]:
BS = 64
EPOCHS=40
INIT_LR=1e-4

config["BS"]=BS
config["EPOCHS"]=EPOCHS
config["INIT_LR"]=INIT_LR

## Dataloaders

In [5]:
def load_images(imagePath):
    # read the image from disk, decode it, resize it, and scale the
    # pixels intensities to the range [0, 1]
    image = tf.io.read_file(imagePath)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, (128, 128)) / 255.0

    #eventually load other information like attributes here
    
    # return the image and the extra info
    
    
    return image

In [6]:
print("[INFO] loading image paths...")
imagePaths = list(paths.list_images(images_dir))


train_len=int(0.8*len(imagePaths))
val_len=int(0.1*len(imagePaths))
test_len=int(0.1*len(imagePaths))

train_imgs=imagePaths[:train_len]                                #      80% for training
val_imgs=imagePaths[train_len:train_len+val_len]                 #      10% for validation
test_imgs=imagePaths[train_len+val_len:]                         #      10% for testing

print(f"[TRAINING]\t {len(train_imgs)}\n[VALIDATION]\t {len(val_imgs)}\n[TEST]\t\t {len(test_imgs)}")

[INFO] loading image paths...
[TRAINING]	 138545
[VALIDATION]	 17318
[TEST]		 17319


In [7]:
#TRAINING 

train_dataset = tf.data.Dataset.from_tensor_slices(train_imgs)
train_dataset = (train_dataset
    .map(load_images, num_parallel_calls=AUTOTUNE)
    .shuffle(1024)       
    .cache()
    .repeat()
    .batch(BS)
    .prefetch(AUTOTUNE)
)

ts=len(train_imgs)//BS

##VALIDATION

val_dataset = tf.data.Dataset.from_tensor_slices(val_imgs)
val_dataset = (val_dataset
    .map(load_images, num_parallel_calls=AUTOTUNE)
    .cache()
    .shuffle(1024) 
    .repeat()
    .batch(BS)
    .prefetch(AUTOTUNE)
)

vs=len(val_imgs)//BS

## TEST

test_dataset = tf.data.Dataset.from_tensor_slices(test_imgs)
test_dataset = (test_dataset
    .map(load_images, num_parallel_calls=AUTOTUNE)
    .cache()
    .batch(BS)
    .shuffle(1024)            
    .prefetch(AUTOTUNE)
)

## Model definition

In [8]:
print(f"[INFO] Training VQ_VAE Model")

input_shape=(128,128,3)
latent_dim=256
num_embeddings=1536


config["latent_dim"]=latent_dim
config["num_embeddings"]=num_embeddings
config["input_shape"]=input_shape

g=VQVAE2(input_shape,latent_dim=latent_dim,num_embeddings=num_embeddings,train_variance=4,n_res_channel=latent_dim,channels=256)




[INFO] Training VQ_VAE Model


In [9]:
print(g.encoder_b.summary())

print(g.encoder_t.summary())
print(g.conditional_bottom.summary())

print(g.decoder.summary())

Model: "bottom_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128, 128, 3)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 64, 64, 256)       12544     
_________________________________________________________________
activation (Activation)      (None, 64, 64, 256)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 32, 32, 256)       1048832   
_________________________________________________________________
activation_1 (Activation)    (None, 32, 32, 256)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 256)       1048832   
_________________________________________________________________
activation_2 (Activation)    (None, 16, 16, 256)    

## Phase 1: Train the VQ VAE 2

In [10]:
wandb.init(project="TorVergataExperiment-Generative",config=config)

In [11]:
os.makedirs("models/vqvae2_celeba",exist_ok=True)


model_check= Save_VQVAE2_Weights("models/vqvae2_celeba")



es=tf.keras.callbacks.EarlyStopping(
    monitor="val_val_reconstruction_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)


callbacks=[
    WandbImagesVQVAE2(test_dataset,sample=False),
    WandbCallback(),
    model_check,
    es,
]

In [12]:
g.compile(keras.optimizers.Adam(INIT_LR))

In [13]:
g.fit(train_dataset,validation_data=val_dataset,steps_per_epoch=ts,validation_steps=vs,epochs=20,callbacks=callbacks)

g.save_dict("models/vqvae2_celeba/dict.json")
g.vqvae.save_weights("models/vqvae2_celeba/model_vqvae2_weights.h5")
g.vqvae.save("models/vqvae2_celeba/model_vqvae2_model.h5")

Epoch 1/20
 351/2164 [===>..........................] - ETA: 23:11 - loss: 0.0407 - reconstruction_loss: 0.0096 - vqvae_loss: 0.0252

ResourceExhaustedError:  OOM when allocating tensor with shape[64,256,128,128] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradient_tape/vq_vae2/decoder/conv2d_transpose_9/conv2d_transpose/Conv2D (defined at /home/matteo/NeuroGEN/TorVergataExperiments/Generative/classes/VQVAE.py:602) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_6587]

Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/vq_vae2/decoder/conv2d_transpose_9/conv2d_transpose/Conv2D:
 vq_vae2/decoder/conv2d_transpose_9/conv2d_transpose/ReadVariableOp (defined at /home/matteo/NeuroGEN/TorVergataExperiments/Generative/classes/VQVAE.py:593)

Function call stack:
train_function


In [None]:
x=np.random.randint(0,256,size=(64,128,128,3))

In [None]:
h_top = g.encoder_t(g.encoder_b(x))  # data flows until the top
e_top = g.quantizer_t(h_top)

In [None]:
h_bottom_conditioned = g.conditional_bottom([g.encoder_b(x), e_top])

e_bottom = g.quantizer_b(h_bottom_conditioned)


In [None]:
e_top.shape,e_bottom.shape

In [None]:
g.decode((e_top,e_bottom))