Hi! I'm working on training a text-to-video model using the starter code provided, but have been struggling to get anything other than noise, even after 5000+ training steps. I have since moved on to trying to implement text-to-image, but here as well I am only able to generate a black square after many training steps. I'm wondering if I am doing something wrong. I'm currently using ~600 images and captions from the Flickr8K dataset, and my code is below. Any help would be greatly appreciated!
```python
### IMPORTS
import torch
import torchvision
import imageio.v3 as iio
import numpy as np
import os
import cv2
import pandas

### DATA PREPROCESSING
f = []
images = []
for (dirpath, dirnames, filenames) in os.walk('/content/flickr8k/images'):
    f.extend(filenames)
    break

for filename in f:
    images.append(filename)
images = sorted(images)

captions = pandas.read_csv("/content/flickr8k/captions.txt")

count = 0
img_tensors = []
img_captions = []

# Get images, resize, and convert to tensor
# Also get corresponding captions and append both to lists
for img in images:
    im_path = '/content/flickr8k/images/' + str(img)
    im = iio.imread(im_path, index=None)
    im = cv2.resize(im, (64, 64))
    img_arr = np.asarray(im)
    # print(img_arr.shape)
    # img_arr = np.moveaxis(img_arr, -1, 1)
    img_tensor = torch.from_numpy(img_arr)
    img_tensor = img_tensor.type(torch.float32)
    img_tensor = img_tensor.to(device="cuda")
    img_caption = captions.loc[captions['image'] == str(img)].iloc[0]['caption']
    img_tensors.append(img_tensor)
    img_captions.append(img_caption)

# Create image batches
img_batches = []
for i in range(0, len(img_tensors), 2):
    if len(img_tensors) - (i + 2) > 0:
        a = img_tensors[i]
        b = img_tensors[i + 1]
        z = torch.stack((a, b), dim=0)
        img_batches.append(z)

# Create caption batches
caption_batches = []
batch_size = 2
for i in range(0, len(img_captions), batch_size):
    if len(img_captions) - (i + batch_size) > 0:
        a = img_captions[i]
        b = img_captions[i + 1]
        z = [a, b]
        caption_batches.append(z)
### INITIALIZE IMAGEN
from imagen_pytorch import t5
from imagen_pytorch import Unet, Imagen, ImagenTrainer
from google.colab import drive

# Tokenize captions using t5-large
embed_batches = []
for i in range(len(caption_batches)):
    encoded_input = t5.t5_encode_text(caption_batches[i], name='t5-large')
    embed_batches.append(encoded_input)
# Use a single u-net and instantiate imagen, trainer
unet1 = Unet(
    dim=64,
    cond_dim=512,
    dim_mults=(1, 2, 4, 8),
    num_resnet_blocks=3,
    layer_attns=(False, True, True, True),
)
imagen = Imagen(
    unets=(unet1,),
    text_encoder_name='t5-large',
    image_sizes=(64,),
    timesteps=1000,
    cond_drop_prob=0.1
).cuda()

trainer = ImagenTrainer(imagen)
### TRAINING LOOP
drive.mount('/content/gdrive', force_remount=True)
save_path = '/content/gdrive/My Drive/Research'
print(f'Checkpoints will be saved at {save_path}')

# Load trainer from checkpoint if available, save every 5 epochs
current_epoch = 25
checkpoint = 'imagen_1unet_text2image_epoch' + str(current_epoch) + '.ckpt'
num_epochs = 50

if checkpoint:
    trainer.load(os.path.join(save_path, checkpoint))
else:
    current_epoch = 0

for epoch in range(num_epochs):
    print(f'Beginning epoch {current_epoch + epoch + 1}...')
    for i in range(len(img_batches)):
        image_batch = img_batches[i]
        image_batch = image_batch.moveaxis(-1, 1)
        image_batch = image_batch.to(device="cuda")
        embed_batch = embed_batches[i]
        loss = trainer(
            image_batch,
            text_embeds=embed_batch,
            unet_number=1,
            max_batch_size=4
        )
        trainer.update(unet_number=1)
        if i % 5 == 0 or i == (len(img_batches) - 1):
            print(f'Image batches processed: {i}/{len(img_batches)}')
    if ((current_epoch + epoch + 1) % 5) == 0:
        ckpt_name = 'imagen_1unet_text2image_epoch' + str(current_epoch + epoch + 1) + '.ckpt'
        trainer.save(os.path.join(save_path, ckpt_name))
        print(f'Saved checkpoint for epoch {current_epoch + epoch + 1}')
        print(f'\n\n')
### SAMPLE AND VISUALIZE
from matplotlib.pyplot import imshow

images = trainer.sample(texts=caption_batches[0], cond_scale=3.)
batch_idx = 0
x = images[batch_idx].cpu().detach().numpy()
x = np.moveaxis(x, 0, -1)
x = x.astype(np.uint8)
imshow(x)
print(caption_batches[0][batch_idx])
```
Closing after receiving some help from the awesome @HReynaud (thread in question can be found here: #305)
For future reference, the main issue was that I was using trainer.update() for my gradient updates rather than trainer.train_step().
trainer.update() only does the backpropagation step, i.e. tensor.backward() in PyTorch. trainer.train_step() first runs the forward pass and then automatically calls trainer.update() to train the model. If you don't run the forward pass first, the model has no gradients to backpropagate through.
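For anyone who runs into the same thing, here is a minimal sketch of the README-style loop built around trainer.train_step(). It follows the library's unconditional example (condition_on_text=False), so the folder path, image size, and step count are placeholders rather than my actual setup; for text-to-image you would still need to get the text embeddings to the trainer, e.g. through a dataloader.

```python
from imagen_pytorch import Unet, Imagen, ImagenTrainer
from imagen_pytorch.data import Dataset

# Single small u-net, as in my setup above
unet = Unet(
    dim=64,
    dim_mults=(1, 2, 4, 8),
    num_resnet_blocks=3,
    layer_attns=(False, True, True, True),
)

# Unconditional Imagen, just to keep the example short
imagen = Imagen(
    condition_on_text=False,
    unets=unet,
    image_sizes=64,
    timesteps=1000
).cuda()

trainer = ImagenTrainer(imagen)

# Placeholder path: point this at a folder of training images
dataset = Dataset('/path/to/training/images', image_size=64)
trainer.add_train_dataset(dataset, batch_size=16)

for step in range(10000):
    # train_step() pulls a batch from the dataset, runs the forward pass to
    # compute the loss, and then calls trainer.update() internally, so the
    # gradients exist before the optimizer update is applied.
    loss = trainer.train_step(unet_number=1, max_batch_size=4)
    if step % 50 == 0:
        print(f'step {step}: loss {loss}')
```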