In [None]:
# Disabled one-time environment setup. The commands are wrapped in a bare string
# literal, so running this cell executes none of them. To use them (e.g. on
# Colab), remove the surrounding triple quotes and run the cell once: it clones
# the StyleGAN3, CLIP and BLIP repositories, installs pinned requirements, and
# downloads pretrained checkpoints plus a dataset archive.
'''
# Colab Fast Start (Around 5 minutes)
# Clone the repositories
!git clone https://github.com/NVlabs/stylegan3
!git clone https://github.com/openai/CLIP
!git clone https://github.com/salesforce/BLIP

# Install the requirements
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 ninja==1.11.1.1
!pip install ftfy==6.1.3 regex==2023.12.25 tqdm==4.66.2
!pip install transformers==4.19.4 timm==0.9.16 fairscale==0.4.13

# Download the pre-trained models
!mkdir pretrained_models
!gdown -O test.png https://drive.google.com/uc?id=1hfVAbs5nkXcUpG6FCAafid7F7ZsqRRkk
#!curl -L --output pretrained_models/stylegan2-afhqcat-512x512.pkl 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/research/stylegan2/1/files?redirect=true&path=stylegan2-afhqcat-512x512.pkl'
!curl -L --output pretrained_models/stylegan2-ffhq-512x512.pkl 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/research/stylegan2/1/files?redirect=true&path=stylegan2-ffhq-512x512.pkl'
!curl -L --output pretrained_models/ViT-B-32.pt 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt'
!curl -L --output pretrained_models/model_base_capfilt_large.pth https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth
#!curl -L --output pretrained_models/model_base_caption_capfilt_large.pth https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth

# Download dataset:
!gdown -O dataset.zip https://drive.google.com/uc?id=13-4a7KROKa5onLd1N4CfM36WH531rre0
!unzip -q dataset.zip
'''

In [1]:
# Stylegan2-ADA-PyTorch Implementation for Stylegan3 https://github.com/NVlabs/stylegan3

import sys
sys.path.append('stylegan3') # a folder stylegan3 is in the same directory as this notebook

import pickle
import torch
import PIL.Image
from IPython.display import display
import numpy as np

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# SECURITY NOTE(review): pickle.load can execute arbitrary code embedded in the
# file, so only load checkpoints obtained from a trusted source.
# Presumably unpickling needs the stylegan3 folder on sys.path (appended above).
with open('pretrained_models/stylegan2-ffhq-512x512.pkl', 'rb') as f:
    # The checkpoint is indexed by 'G_ema'; that entry is moved to `device`.
    G = pickle.load(f)['G_ema'].to(device)  # torch.nn.Module
    
def generate_image_stylegan2(name, outdir='dataset'):
    """Sample one random StyleGAN2 image and save it with its latent ``w``.

    A random ``z`` is drawn, mapped to ``w`` with ``G.mapping`` and rendered
    with ``G.synthesis``. The image is written to
    ``<outdir>/images/<name:05d>.png`` and ``w`` to
    ``<outdir>/tensors/<name:05d>.pt``.

    Args:
        name: Integer index used for the zero-padded file names.
        outdir: Root folder for the outputs. Defaults to ``'dataset'``; the
            ``images`` and ``tensors`` subfolders are created when missing.

    Returns:
        None. The results are written to disk.
    """
    import os  # local import: the cell defining this function does not import os

    # Create the target folders up front so the save calls below cannot fail
    # on a fresh checkout.
    os.makedirs(os.path.join(outdir, 'images'), exist_ok=True)
    os.makedirs(os.path.join(outdir, 'tensors'), exist_ok=True)

    # Pure inference: skip autograd bookkeeping so no graph is built or kept
    # alive, and the saved tensor carries no autograd history.
    with torch.no_grad():
        z = torch.randn([1, G.z_dim]).to(device)
        w = G.mapping(z, None)  # None = no class labels; w: 1, 16, 512 per the original note
        img = G.synthesis(w)    # NCHW, float32, dynamic range [-1, +1] per the original note

    # TODO (original author): the aim is to optimize this latent from a CLIP
    # output (`clipout`) instead of sampling it at random.

    # NCHW -> NHWC, then rescale to uint8 pixel values.
    img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
    PIL.Image.fromarray(img[0].cpu().numpy(), 'RGB').save(f'{outdir}/images/{name:05d}.png')

    # Reload later with torch.load(f'{outdir}/tensors/{name:05d}.pt').
    torch.save(w, f'{outdir}/tensors/{name:05d}.pt')


In [7]:
# Stylegan2 Generate Test
# Disabled smoke test: the loop is wrapped in a bare string literal, so nothing
# runs. Un-quoted, it would call generate_image_stylegan2 for i = 0..19.
'''
for i in range(20):
    generate_image_stylegan2(i)
'''

In [2]:
# BLIP - Bootstrapping Language-Image Pretraining https://github.com/salesforce/BLIP

# Requirements:
# !conda install transformers=4.19.4 timm=0.9.16 fairscale=0.4.13

# Download pretrained model to pwd/pretrained_models directory from:
# !curl -LO https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth

import sys
sys.path.append('BLIP') # a folder BLIP is in the same directory as this notebook

from BLIP.models.blip import blip_decoder # to use class in: pwd/BLIP/models/blip.py

from PIL import Image
import requests
import os
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Side length (pixels) handed to blip_decoder and reused by the Resize step in
# get_caption_BLIP.
image_size = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint used for captioning. Alternatives the author tried, with results:
#   pretrained_models/model_base_caption_capfilt_large.pth  -> Worse
#   pretrained_models/model_base_retrieval_coco.pth         -> Assertion ERROR
#   pretrained_models/model_large_retrieval_coco.pth        -> Assertion ERROR
#   pretrained_models/model_base_retrieval_flickr.pth       -> Assertion ERROR
#   pretrained_models/model_large_caption.pth               -> Assertion ERROR
model_url = 'pretrained_models/model_base_capfilt_large.pth'

# The config path is built from the current working directory, so the BLIP
# folder has to sit next to wherever the notebook is run from.
med_config_path = f"{os.getcwd()}/BLIP/configs/med_config.json"

# Build the decoder from the checkpoint and move it to the selected device.
model = blip_decoder(
    pretrained=model_url,
    image_size=image_size,
    vit='base',
    med_config=med_config_path,
).to(device)
    
def get_caption_BLIP(file_num, txt_file_name):
    """Caption ``dataset/images/<file_num>.png`` with BLIP and record the result.

    The caption is printed, appended to ``txt_file_name`` as a
    ``<file_num>,<caption>`` line, and returned.

    Args:
        file_num: Image name without extension, e.g. ``'00012'``. An ``int`` is
            also accepted and zero-padded to five digits.
        txt_file_name: Path of the UTF-8 text file the caption line is
            appended to.

    Returns:
        str: The generated caption.
    """
    # Accept plain integers too, so callers do not have to pre-format the name.
    if isinstance(file_num, int):
        file_num = f'{file_num:05d}'
    else:
        file_num = str(file_num)

    filename = 'dataset/images/' + file_num + '.png'
    # Context manager releases the file handle as soon as pixels are decoded.
    with Image.open(filename) as image_file:
        raw_image = image_file.convert('RGB')

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    image = transform(raw_image).unsqueeze(0).to(device)  # add a batch dimension

    # Freshly built nn.Modules start in training mode; switch to eval so that
    # dropout is disabled and captions are deterministic for beam search.
    model.eval()

    with torch.no_grad():
        # beam search
        caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=10)
        # nucleus sampling alternative:
        # caption = model.generate(image, sample=True, top_p=0.9, max_length=20, min_length=5)

    blipout = caption[0]
    print(file_num + '.png caption: ' + blipout)

    # Append mode: earlier captions in the file are preserved.
    with open(txt_file_name, 'a', encoding='utf-8') as txtfile:
        txtfile.write(file_num + ',' + blipout + '\n')

    return blipout

reshape position embedding from 196 to 1024
load checkpoint from pretrained_models/model_base_capfilt_large.pth


In [14]:
# BLIP Caption Save Test
# Disabled smoke test: the code is wrapped in a bare string literal, so nothing
# runs. Un-quoted, it would caption images 00001..00019 via get_caption_BLIP and
# append the results to image_captions.txt.
'''
txt_name = 'image_captions.txt'
for i in range(1,20):
    i = str(i).zfill(5)
    get_caption_BLIP(i,txt_name)
print("TXT file '{}' has been created successfully with image file names and their captions.".format(txt_name))
'''

image0001.png caption: a woman with a tooth in her mouth
image0002.png caption: a woman with long brown hair
image0003.png caption: a man and a woman smiling
image0004.png caption: a little boy with glasses on
image0005.png caption: a woman with a red flower in her hair
image0006.png caption: a woman with a hat on
image0007.png caption: a woman with long blonde hair
image0008.png caption: a man in a suit and tie
image0009.png caption: a man in a blue shirt
image0010.png caption: a woman with long blonde hair
image0011.png caption: a young woman with blonde hair
image0012.png caption: a man with a beard and a blue shirt
image0013.png caption: a woman wearing a blue hat
image0014.png caption: a woman wearing sunglasses and smiling
image0015.png caption: a smiling woman with glasses on
image0016.png caption: a woman with a smile on her face
image0017.png caption: a man with a beard and glasses
image0018.png caption: a man with a toothy look on his face
image0019.png caption: a man with gl

In [3]:
# COMBINED: generate new StyleGAN2 images and caption each of them with BLIP.
from datetime import datetime
import os
import re

now = datetime.now()
formatted_date_time = now.strftime("%Y-%m-%d_%H-%M-%S")
txt_name = 'dataset/image_captions.txt' # first line should be: name,caption

# Make sure the output folders exist so a fresh checkout does not crash.
os.makedirs('dataset/images/', exist_ok=True)
os.makedirs('dataset/tensors/', exist_ok=True)

# The caption file is later parsed with header=0, so guarantee the header row
# exists. A non-empty file is left untouched.
if not os.path.exists(txt_name) or os.path.getsize(txt_name) == 0:
    with open(txt_name, 'w', encoding='utf-8') as txtfile:
        txtfile.write('name,caption\n')

file_list = os.listdir('dataset/images/')
file_list.sort()

# Continue numbering after the highest existing "<digits>.png". Only real image
# names are considered: a stray entry (hidden file, checkpoint folder, ...)
# must not silently restart the numbering at 1 and overwrite existing images.
existing_numbers = []
for file_name in file_list:
    match = re.fullmatch(r'(\d+)\.png', file_name)
    if match:
        existing_numbers.append(int(match.group(1)))
last = max(existing_numbers) + 1 if existing_numbers else 1

number_of_photos = 48

for i in range(last,last+number_of_photos):
    generate_image_stylegan2(i)
    i = str(i).zfill(5)  # same zero-padded name the generator used for the file
    get_caption_BLIP(i,txt_name)

print("TXT file '{}' has been created successfully with image file names and their captions.".format(txt_name))

02001.png caption: a man and a woman smiling
02002.png caption: a little girl with a flower in her hair
02003.png caption: a man with glasses and a tie
02004.png caption: a man smiling with a tooth in his mouth
02005.png caption: a man with glasses on his face
02006.png caption: a little girl with brown hair
02007.png caption: a bald man with a beard
02008.png caption: a woman wearing a hat and sunglasses
02009.png caption: a man sticking his tongue out
02010.png caption: a little girl with long hair
02011.png caption: a woman with a smile on her face
02012.png caption: a man in a suit and tie
02013.png caption: a little girl with a flower in her hair
02014.png caption: a man with a tooth on his face
02015.png caption: a little girl with blonde hair
02016.png caption: a man smiling for the camera
02017.png caption: a woman with a cell phone
02018.png caption: a man with glasses on his face
02019.png caption: a woman holding a bird in her hand
02020.png caption: a woman with glasses and

In [5]:
# Clean extra commas from the captions, so every line has exactly one comma
# (the name/caption separator) and can be parsed as a two-column CSV.

input_file_path = 'dataset/image_captions.txt'
output_file_path = 'dataset/image_captions_cleaned.txt'

# get_caption_BLIP appends the captions as UTF-8; read and write with the same
# encoding instead of relying on the platform default.
with open(input_file_path, 'r', encoding='utf-8') as input_file, \
        open(output_file_path, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        # Split at the first comma only: everything before it is the name.
        name, comma, caption = line.strip().partition(',')
        if comma:
            # Drop any additional commas from the caption part.
            output_file.write(name + ',' + caption.replace(',', '') + '\n')
        else:
            # No comma at all (e.g. a blank line): copy the line unchanged.
            output_file.write(line)


In [6]:
# Read dataset test
import os

import pandas as pd

# Entries found directly under the dataset folder.
imagess = os.listdir('dataset')

# Load the cleaned caption file as a comma-separated table. Both columns are
# requested as strings; the first line of the file is taken as the header.
dtype_dict = dict(name=str, caption=str)
df = pd.read_csv(
    'dataset/image_captions_cleaned.txt',
    sep=",",
    header=0,
    dtype=dtype_dict,
)
df.head()

Unnamed: 0,name,caption
0,1,a man with a black jacket
1,2,a young child with a very look on his face
2,3,a woman with a big smile on her face
3,4,a man in a white shirt
4,5,a man with a bandana on his head
