# Stage II - Fine-Tuning the Text to Image Model


## Install Requirements

In [1]:
!pip install -U pip
!pip install boto3 torch==2.2.2 transformers accelerate xformers bitsandbytes
!pip install git+https://github.com/huggingface/diffusers

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl.metadata (19 kB)
Collecting xformers
  Downloading xformers-0.0.26.post1-cp39-cp39-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m133.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading 

In [2]:
!pip list | grep -E "boto|torch|diffusers|transformers|accelerate|xformers|bitsandbytes"

accelerate                0.31.0
bitsandbytes              0.43.1
boto3                     1.34.111
botocore                  1.34.111
codeflare-torchx          0.6.0.dev2
diffusers                 0.30.0.dev0
torch                     2.2.2+cu121
torchvision               0.17.2+cu121
transformers              4.41.2
xformers                  0.0.25.post1


In [3]:
# import required libraries
import os, random, yaml, threading
from datetime import datetime
try:
    import boto3
    from boto3.s3.transfer import TransferConfig
    from botocore.exceptions import ClientError
    import torch
    import torch.cuda as tc
    from diffusers import DiffusionPipeline
except Exception as e:
    print(f"Caught Exception: {e}")

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)


### Check GPU

**It's recommended that you shut down any other notebook kernels.**

This fine tuning process uses a lot of video memory.   Here, we'll check on how much we have available.

In [5]:
accelerator = "cpu"
print(f"Checking for the availability of a GPU...")
if tc.is_available():
    device_name = tc.get_device_name()
    device_capabilities = tc.get_device_capability()
    device_available_mem, device_total_mem = [x / 1024**3 for x in tc.mem_get_info()]
    print(f"A GPU is available! [{device_name} - {device_capabilities} - {device_available_mem}/{device_total_mem} GB VRAM]")
    accelerator = "cuda"

print(f"Using accelerator: {accelerator}")

Checking for the availability of a GPU...
A GPU is available! [Tesla V100-SXM2-16GB - (7, 0) - 15.4710693359375/15.7725830078125 GB VRAM]
Using accelerator: cuda


## Setup the training job

Here we set up all the options for training.  Most are environment variables which will allow us to override values from pipelines and run this notebook with different setting such as the base model or number of training steps and learning rate.

In [6]:
# setup working environment
WORKING_DIR = os.environ.get("working_dir", f"/opt/app-root/src/pipelines-pvc/")
MODEL_NAME = os.environ.get("model_name", "stabilityai/stable-diffusion-2-1")
OUTPUT_DIR = os.path.join(os.getcwd(), f"{WORKING_DIR}/stable_diffusion_weights/redhat_dog")
DATA_DIR = os.path.join(os.getcwd(), f"{WORKING_DIR}/data")
INSTANCE_DATA_URL = os.environ.get("instance_data_url", "https://rhods-public.s3.amazonaws.com/sample-data/images/redhat-dog.tar.gz")
INSTANCE_DIR = os.path.join(DATA_DIR, "instance_dir")
CLASS_DIR = os.path.join(DATA_DIR, "class_dir")
INSTANCE_PROMPT = os.environ.get("instance_prompt", "photo of a rhteddy dog")
CLASS_PROMPT = os.environ.get("class_prompt", "a photo of dog")

# training steps
NUM_CLASS_IMAGES = int(os.environ.get("num_class_images", "100"))
MAX_TRAIN_STEPS = int(os.environ.get("max_train_steps", "800"))

## Start the training process

### Set up the Training Job

In [7]:
# cleanup storage
!rm -rf $OUTPUT_DIR
!rm -rf $CLASS_DIR

In [8]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CLASS_DIR, exist_ok=True)

print(f"Weights will be saved at {OUTPUT_DIR}")
print(f"It will be based on the model {MODEL_NAME}")
print(f"Training data located in downloaded from {INSTANCE_DATA_URL}")
print(f"We're going to train the difference between \"{INSTANCE_PROMPT}\" and \"{CLASS_PROMPT}\"")

Weights will be saved at /opt/app-root/src/pipelines-pvc//stable_diffusion_weights/redhat_dog
It will be based on the model stabilityai/stable-diffusion-2-1
Training data located in downloaded from https://rhods-public.s3.amazonaws.com/sample-data/images/redhat-dog.tar.gz
We're going to train the difference between "photo of a rhteddy dog" and "a photo of dog"


In [9]:
!accelerate config default

Configuration already exists at /opt/app-root/src/.cache/huggingface/accelerate/default_config.yaml, will not override. Run `accelerate config` manually or pass a different `save_location`.


### Start Training

Here we kick off the training job with our chosen settings.  This will take about 15 minutes depending on settings and hardware.

In [11]:
!echo "MODEL_NAME=$MODEL_NAME"
!echo "OUTPUT_DIR=$OUTPUT_DIR"
!echo "DATA_DIR=$DATA_DIR"
!echo "INSTANCE_DIR=$INSTANCE_DIR"
!echo "CLASS_DIR=$CLASS_DIR"
!echo "INSTANCE_PROMPT=$INSTANCE_PROMPT"
!echo "CLASS_PROMPT=$CLASS_PROMPT"
!echo "NUM_CLASS_IMAGES=$NUM_CLASS_IMAGES"
!echo "MAX_TRAIN_STEPS=$MAX_TRAIN_STEPS"

MODEL_NAME=stabilityai/stable-diffusion-2-1
OUTPUT_DIR=/opt/app-root/src/pipelines-pvc//stable_diffusion_weights/redhat_dog
DATA_DIR=/opt/app-root/src/pipelines-pvc//data
INSTANCE_DIR=/opt/app-root/src/pipelines-pvc//data/instance_dir
CLASS_DIR=/opt/app-root/src/pipelines-pvc//data/class_dir
INSTANCE_PROMPT=photo of a rhteddy dog
CLASS_PROMPT=a photo of dog
NUM_CLASS_IMAGES=100
MAX_TRAIN_STEPS=800


In [14]:
# download the training script from github
#TODO: rewrite that from scratch using jupyter
!wget -O train_dreambooth.py https://raw.githubusercontent.com/huggingface/diffusers/main/examples/dreambooth/train_dreambooth.py

--2024-06-25 15:03:08--  https://raw.githubusercontent.com/huggingface/diffusers/main/examples/dreambooth/train_dreambooth.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58466 (57K) [text/plain]
Saving to: ‘train_dreambooth.py’


2024-06-25 15:03:08 (41.6 MB/s) - ‘train_dreambooth.py’ saved [58466/58466]



In [16]:
# cleanup cuda cache
tc.empty_cache()

# start the training job
!accelerate launch train_dreambooth.py \
  --pretrained_model_name_or_path=$MODEL_NAME  \
  --instance_data_dir=$INSTANCE_DIR \
  --class_data_dir=$CLASS_DIR \
  --output_dir=$OUTPUT_DIR \
  --with_prior_preservation --prior_loss_weight=1.0 \
  --instance_prompt="$INSTANCE_PROMPT" \
  --class_prompt="$CLASS_PROMPT" \
  --resolution=512 \
  --train_batch_size=1 \
  --gradient_accumulation_steps=2\
  --gradient_checkpointing \
  --use_8bit_adam \
  --learning_rate=5e-6 \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --num_class_images=$NUM_CLASS_IMAGES \
  --max_train_steps=$MAX_TRAIN_STEPS \
  --enable_xformers_memory_efficient_attention

  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
06/25/2024 15:05:12 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: no

{'image_encoder'} was not found in config. Values will be initialized to default values.
Loading pipeline components...:   0%|                     | 0/6 [00:00<?, ?it/s]{'sample_max_value', 'thresholding', 'clip_sample_range', 'dynamic_thresholding_ratio', 'rescale_betas_zero_snr', 'timestep_spacing'} was not found in config. Values will be initialized to default values.
Loaded scheduler as DDIMScheduler from `scheduler` subfolder of stabilityai/stable-diffusion-2-1.
Loaded text_encoder as CLIPTextModel from `text_encoder` subfolder of stabilityai/stable-diffusion-2-1

In [17]:
# show results
!ls $OUTPUT_DIR

checkpoint-500	   logs		     scheduler	   tokenizer  vae
feature_extractor  model_index.json  text_encoder  unet
