In [None]:
import pandas as pd
from PIL import Image
from utils import download_diffusiondb
from clip_dataset_encoder import encode_dataframe
import os
from image_noiser import ImageNoiser
import numpy as np

Download parquet files and metadata

In [None]:
# Download the parquet files by running scripts/dataset_parquet_files.py in a shell.
# NOTE(review): the next cell reads its inputs from parquets/ — presumably this script
# writes them there; confirm against the script.
!python scripts/dataset_parquet_files.py

In [None]:
# Load the four parquet tables used by the rest of the notebook, in a fixed order:
# score table, train split, validation split and the prepared dataset.
hord_score_df, train_split_df, validate_split_df, prepared_hord_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'parquets/hord_diffusiondb_scores.parquet',
        'parquets/train_split.parquet',
        'parquets/validate_split.parquet',
        'parquets/prepared_hord_diffusion_dataset.parquet',
    )
)

Download images shard by shard until the number of collected images reaches the required amount

In [None]:
NUM_IMAGES_TO_COLLECT = 1000

# Every image name present in the prepared dataset. A set gives constant-time
# membership tests instead of one full-frame comparison per downloaded file.
known_image_names = set(prepared_hord_df['image_name'])


def collect_part_rows(part_idx):
    """Prune one shard directory and return its rows from prepared_hord_df.

    Files in images/part-XXXXXX whose name does not appear in
    prepared_hord_df['image_name'] are deleted from disk. The returned frame
    holds the prepared_hord_df rows for the files that were kept, in
    prepared_hord_df order (os.listdir order is arbitrary, so it is not used
    for ordering; this keeps the later seeded shuffle reproducible).
    """
    part_dir = f'images/part-{part_idx:06}'
    kept_names = []
    for image_name in os.listdir(part_dir):
        if image_name in known_image_names:
            kept_names.append(image_name)
        else:
            os.remove(f'{part_dir}/{image_name}')
    return prepared_hord_df[prepared_hord_df['image_name'].isin(kept_names)]


part_id = 0
part_frames = []

# Resume support: shards left on disk by a previous run (consecutive from part 1)
# are picked up instead of being downloaded again. Without this, a re-run with
# enough images already present left part_id at 0 and df_latents empty, so the
# following cells silently processed nothing.
while os.path.isdir(f'images/part-{part_id + 1:06}'):
    part_id += 1
    part_frames.append(collect_part_rows(part_id))

# Counts every file below images/ (done after pruning so the number reflects kept files).
collected_samples = sum(len(files) for _, _, files in os.walk("images"))
print(f'collected images so far: {collected_samples}')

while collected_samples < NUM_IMAGES_TO_COLLECT:
    part_id += 1
    # Fetch the next shard; the directory images/part-XXXXXX is expected to exist afterwards.
    download_diffusiondb.main(index=part_id, range_max=None, output='images/', unzip=True, large=True)
    part_frames.append(collect_part_rows(part_id))
    collected_samples += len(os.listdir(f'images/part-{part_id:06}'))
    print(f'collected images so far: {collected_samples}')

# Concatenate once at the end rather than growing a frame inside the loop;
# fall back to an empty frame with the same columns when nothing was collected.
df_latents = (
    pd.concat(part_frames, ignore_index=True)
    if part_frames
    else prepared_hord_df.iloc[0:0].copy()
)

In [None]:
# Create a latent representation for every image under images/ and save it under
# latent_images/ in the shard directory with the same index.
# NOTE (from the original author): image_processor warns about the latents' value range,
# but with the expected range [0, 1] the latents are not approximated correctly.
os.makedirs('latent_images', exist_ok=True)
noise_timestep = 10  # Create latents at step 10 for now; may reduce or increase later based on experiment results
scheduler_timesteps = 20
image_noiser = ImageNoiser()
for part_idx in range(1, part_id + 1):
    source_dir = f'images/part-{part_idx:06}'
    target_dir = f'latent_images/part-{part_idx:06}'
    os.makedirs(target_dir, exist_ok=True)
    for image_name in os.listdir(source_dir):
        # Encode the image to a latent; the context manager releases the file
        # handle as soon as encoding is done.
        with Image.open(f'{source_dir}/{image_name}') as img:
            img_latent = image_noiser.encode_image(img)
        # Add noise to that latent (per the original note: with DPMSolverScheduler)
        noisy_latent = image_noiser.add_noise_to_latent(img_latent, noise_timestep, scheduler_timesteps)
        # Approximate the noisy latent as an image (per the original note: via a matrix multiplication)
        noisy_latent_approx = image_noiser.approx_latent(noisy_latent)
        # Save under the same shard index; the stem is everything before the first '.',
        # which matches how create_path_to_latent builds the path later on.
        noisy_latent_approx.save(f'{target_dir}/{image_name.split(".")[0]}.png')
        

In [None]:
def create_path_to_latent(row):
    """Build the latent PNG path for one dataset row.

    Reads row['part_id'] (formatted as a zero-padded 6-digit integer) and
    row['image_name'] (only the text before the first '.' is used).
    Returns 'latent_images/part-XXXXXX/<stem>.png'.
    """
    shard_index = row['part_id']
    stem, _, _ = row['image_name'].partition('.')
    return f"latent_images/part-{shard_index:06d}/{stem}.png"
# Add a 'latent_image_path' column by calling create_path_to_latent on every row (axis=1).
df_latents['latent_image_path'] = df_latents.apply(create_path_to_latent, axis=1)

In [None]:
# Shuffle all rows once with a fixed seed, then cut by position: the first 80 % of the
# shuffled rows form the training split and the remainder the validation split.
# Positional slicing replaces np.split, which on a DataFrame goes through
# DataFrame.swapaxes (deprecated since pandas 2.1).
shuffled_latents = df_latents.sample(frac=1, random_state=42)
train_size = int(0.8 * len(df_latents))
df_latents_train = shuffled_latents.iloc[:train_size]
df_latents_val = shuffled_latents.iloc[train_size:]

In [None]:
# Persist the full latent table and its train/validation splits.
# The output directories are created first because to_csv / to_parquet do not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('parquets', exist_ok=True)
os.makedirs('csvs', exist_ok=True)

df_latents.to_parquet('parquets/latents-approximated.parquet', index=False)
# Each split is written twice: as a tab-separated CSV and as parquet.
df_latents_train.to_csv('csvs/clip_train.csv', sep="\t", index=False)
df_latents_train.to_parquet('parquets/latents-approximated-train.parquet', index=False)
df_latents_val.to_csv('csvs/clip_val.csv', sep="\t", index=False)
df_latents_val.to_parquet('parquets/latents-approximated-val.parquet', index=False)

*OPTIONAL*: Run CLIP training, change config if needed

In [None]:
# Run the CLIP training shell script (optional step); the path is relative to the
# notebook's working directory.
!./scripts/run_clip_train.sh

Encode the dataframe with the trained CLIP model

In [None]:
# Encode df_latents with the noise-tuned CLIP checkpoint, using a batch size of 128.
# The return value is not captured here.
# NOTE(review): presumably encode_dataframe persists its results itself — confirm in clip_dataset_encoder.
encode_dataframe(
    "hf-hub:Outrun32/CLIP-ViT-B-16-noise-tuned",
    df_latents,
    clip_batch_size=128,
)

Run MLP training on embeddings

In [None]:
# Run the MLP training script artifact_estimator/train.py in a shell.
# NOTE(review): presumably it consumes the embeddings produced by the encoding step above — confirm in the script.
!python artifact_estimator/train.py

Then, go to demo.ipynb