In [None]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

# Optional convenience: format notebook cells with Black when jupyter_black is available.
try:
    import jupyter_black
except ImportError:
    # The formatter is not required for the exercises; continue without it.
    print("black not installed")
else:
    try:
        jupyter_black.load()
    except Exception as exc:
        # Best-effort: a formatter problem must never block the notebook, but unlike a
        # bare `except:` this still lets KeyboardInterrupt / SystemExit propagate.
        print(f"black could not be loaded: {exc}")

# Pre-Trained Models and APIs

## Goals

- Learn how to use pre-trained models on HuggingFace
- Learn how to condition the generation process
- Learn how to use APIs

## Setup

Let's define paths, install & load the necessary Python packages.

**Optionally: Save the notebook to your personal google drive to persist changes.**

Mount your google drive to store data and results (if running the code in Google Colab).

In [None]:
# Detect whether the notebook runs on Google Colab by probing for the `google.colab` package.
# Only ImportError is treated as "not on Colab"; a bare `except:` would also hide
# KeyboardInterrupt and unrelated errors.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

print(f"In colab: {IN_COLAB}")

In [None]:
# Only on Colab: mount Google Drive at /content/drive; the Colab DATA_PATH defined
# further down is located under this mount point.
if IN_COLAB:
    from google.colab import drive

    drive.mount("/content/drive")

**Modify the following paths if necessary.**

That is where your data will be stored.

In [None]:
from pathlib import Path

# Root folder for data and cached models: a Google Drive folder on Colab,
# otherwise a directory relative to the notebook location.
DATA_PATH = (
    Path("/content/drive/MyDrive/cas-dl-module-genai-part2") if IN_COLAB else Path("../../data")
)

Install `dl_genai_lectures`

In [None]:
# Make sure the course helper package `dl_genai_lectures` is importable; install it if not.
try:
    import dl_genai_lectures

    print("dl_genai_lectures installed, all good")
except ImportError:
    import subprocess
    import sys

    # Invoke pip through the kernel's own interpreter so the package is installed into
    # the environment this notebook runs in (a bare `pip` on PATH may belong to another one).
    pip_install = [sys.executable, "-m", "pip", "install"]

    if Path("/workspace/code/src").exists():
        print("Installing from local repo")
        result = subprocess.run([*pip_install, "."], cwd="/workspace/code")
    else:
        print("Installing from git repo")
        result = subprocess.run(
            [*pip_install, "git+https://github.com/marco-willi/cas-dl-genai-exercises-fs2025"]
        )

    # Report a failed installation instead of silently ignoring pip's exit status.
    if result.returncode != 0:
        print(f"Installation failed (pip exit code {result.returncode})")

Load all packages

In [None]:
import io
import os

import diffusers
import numpy as np
import requests
import seaborn as sns
import torch
from diffusers import StableDiffusionPipeline
from matplotlib import pyplot as plt
from PIL import Image
from torch import nn
from torch.nn import functional as F
from torchvision.transforms.v2 import functional as TF

from dl_genai_lectures import visualize

Define a default device for your computations.

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using: {device}")

## 1) Text-to-Image Generation


Stable diffusion is a well known open-source model for image generation.

First we need to log into hugging face.

In [None]:
from huggingface_hub import notebook_login

# Log in to Hugging Face (as described in the text above) before the model download below.
notebook_login()

Then we initialize a pipeline (and also download all model assets if not already available):

In [None]:
# Build the Stable Diffusion pipeline on the chosen device; downloaded assets are
# cached under DATA_PATH/HF_CACHE.
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    cache_dir=DATA_PATH / "HF_CACHE",
).to(device)

Now we can generate images!

In [None]:
prompt = "A futuristic city floating in the clouds, artstation style"
# Fixed seed (123): the generator handed to the pipeline starts from the same state on every run.
generator = torch.Generator(device=device).manual_seed(123)
# The pipeline result exposes a list of images; the first one is used.
image = pipe(prompt, generator=generator).images[0]
image.show()

There are different parameters that guide the diffusion process, such as "guidance_scale". Let's see how the generation changes.

In [None]:
# Same prompt and seed as the previous cell; only guidance_scale is changed (to 1.0),
# so differences in the image come from that parameter.
# (The second, never-used re-creation of the generator after the pipeline call was removed.)
generator = torch.Generator(device=device).manual_seed(123)
image = pipe(prompt, guidance_scale=1.0, generator=generator).images[0]
image.show()

In [None]:
# Same prompt and seed again; only guidance_scale is raised to 9.0 for comparison.
generator = torch.Generator(device=device).manual_seed(123)
image = pipe(prompt, guidance_scale=9.0, generator=generator).images[0]
image.show()

**Question**: Which do you like better?

## 2) Conditional Image Generation

Image generation can be controlled by different conditioning factors, provided the model was trained with those factors.

A well-known conditioning is canny edges.

In [None]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

# Download the example photo that the edge map is computed from.
url = "https://github.com/pytorch/vision/blob/main/gallery/assets/dog2.jpg?raw=true"
r = requests.get(url, allow_redirects=True, timeout=30)
# Fail here with a clear HTTP error instead of an opaque image-decoding error later.
r.raise_for_status()
# Normalise to 8-bit RGB so the array layout does not depend on the file's colour mode.
source_image = Image.open(io.BytesIO(r.content)).convert("RGB")

# Lower / upper thresholds passed to cv2.Canny.
low_threshold = 100
high_threshold = 200

edges = cv2.Canny(np.array(source_image), low_threshold, high_threshold)

# Replicate the single-channel edge map into three identical channels (H, W, 3).
image = np.stack([edges, edges, edges], axis=2)
canny_image = Image.fromarray(image)
canny_image

In [None]:
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline
from diffusers.utils import load_image
from torchvision import transforms

# Use the same cache folder as the first pipeline so the already downloaded
# Stable Diffusion base model is reused instead of being fetched a second time.
hf_cache_dir = DATA_PATH.joinpath("HF_CACHE")

# Load the ControlNet model conditioned on Canny edges
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny",
    cache_dir=hf_cache_dir,
    # torch_dtype=torch.float16
)

pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    safety_checker=None,
    cache_dir=hf_cache_dir,
    # torch_dtype=torch.float16
).to(device)

# xformers attention is an optional optimisation: it needs the xformers package and a
# CUDA device. Fall back to the default attention instead of aborting the notebook.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as exc:
    print(f"xformers memory-efficient attention not enabled: {exc}")

In [None]:
# Conditioning input for the pipeline: the edge map as a 3-channel (RGB) PIL image.
canny_image_condition = canny_image.convert("RGB")

prompt = (
    "A dog lying on the grass, photorealistic, 8k resolution, "
    "highly detailed, cinematic lighting"
)
# Seeded generator (123) handed to the pipeline call below.
generator = torch.Generator(device=device).manual_seed(123)

output = pipe(
    prompt,
    image=canny_image_condition,
    num_inference_steps=30,
    generator=generator,
)
output.images[0].show()

## 3) Generation on low-resource hardware

Modern (image) generative models require a lot of resources, particularly VRAM. There are several options to reduce resource usage; some depend on additional libraries:

- reduce model precision from single precision (32-bit floating point) to half precision (16-bit), or even much further to 8-bit or 4-bit, e.g. with [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes)
- off-load computations to cpu (RAM)
- use efficient implementations of operations, e.g. [xformers](https://github.com/facebookresearch/xformers)

Here is the link to a HuggingFace guide:  [Reduce memory usage](https://huggingface.co/docs/diffusers/en/optimization/memory)


Note: some options are hardware dependent.

In [None]:
import torch
from diffusers import StableDiffusionPipeline

model_id = "dreamlike-art/dreamlike-photoreal-2.0"

# Half precision is the memory-saving trick shown here, but it is only requested on CUDA;
# on a CPU device fall back to float32, where float16 support is not guaranteed.
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)

pipe.enable_vae_slicing()

# xformers attention is an optional optimisation: it needs the xformers package and a
# CUDA device. Fall back to the default attention instead of aborting the notebook.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as exc:
    print(f"xformers memory-efficient attention not enabled: {exc}")

In [None]:
prompt = "photo, a church in the middle of a field of crops, bright cinematic lighting, gopro, fisheye lens"
# No generator is passed here, so this run does not use a fixed seed.
image = pipe(prompt).images[0]
image.show()

**Task** Try some of the tricks when running the following model. Warning: This is a large model that requires a huge download.

In [None]:
from diffusers import (
    BitsAndBytesConfig,
    SD3Transformer2DModel,
    StableDiffusion3Pipeline,
)

model_id = "stabilityai/stable-diffusion-3.5-medium"

# 4-bit quantization settings: "nf4" quantization type with bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)
# Load only the "transformer" subfolder of the model repo, applying the quantization config.
model_nf4 = SD3Transformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,
)

# Assemble the full pipeline around the separately loaded (quantized) transformer.
pipeline = StableDiffusion3Pipeline.from_pretrained(
    model_id, transformer=model_nf4, torch_dtype=torch.bfloat16
)
# NOTE(review): presumably keeps sub-models in CPU RAM and moves each to the accelerator only
# while it is needed — confirm against the Hugging Face memory guide linked above.
pipeline.enable_model_cpu_offload()

In [None]:
prompt = "A whimsical and creative image depicting a hybrid creature that is a mix of a waffle and a hippopotamus, basking in a river of melted butter amidst a breakfast-themed landscape. It features the distinctive, bulky body shape of a hippo. However, instead of the usual grey skin, the creature's body resembles a golden-brown, crispy waffle fresh off the griddle. The skin is textured with the familiar grid pattern of a waffle, each square filled with a glistening sheen of syrup. The environment combines the natural habitat of a hippo with elements of a breakfast table setting, a river of warm, melted butter, with oversized utensils or plates peeking out from the lush, pancake-like foliage in the background, a towering pepper mill standing in for a tree.  As the sun rises in this fantastical world, it casts a warm, buttery glow over the scene. The creature, content in its butter river, lets out a yawn. Nearby, a flock of birds take flight"

# Generate one image with the quantized SD3 pipeline using the sampling settings below.
image = pipeline(
    prompt=prompt,
    num_inference_steps=40,
    guidance_scale=4.5,
    max_sequence_length=512,
).images[0]
# Show the result, as every other generation cell in this notebook does
# (previously the image was computed but never displayed).
image.show()

## 4) Using APIs - Example "Replicate"

Explore the API and the models: https://replicate.com/explore


In [None]:
import replicate

In [None]:
def pil_to_replicate_file(pil_image, format="PNG"):
    """Serialize a PIL image into an in-memory binary stream.

    The image is written with ``pil_image.save`` in the requested format and the
    stream is rewound afterwards, so it can be read from the start (e.g. when it
    is passed as a file input to the Replicate API).

    Args:
        pil_image: PIL.Image object to encode.
        format: Image format handed to ``save``, e.g. 'PNG' or 'JPEG'.

    Returns:
        io.BytesIO holding the encoded image, positioned at offset 0.
    """
    buffer = io.BytesIO()
    pil_image.save(buffer, format=format)
    # Rewind: after saving, the cursor sits at the end of the written data.
    buffer.seek(0)
    return buffer

We set the API key.

In [None]:
# `getpass` was used without ever being imported, which raised NameError whenever the
# environment variable was not set.
from getpass import getpass

# Take the token from the environment; prompt for it only when it is missing or empty.
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")

if not REPLICATE_API_TOKEN:
    REPLICATE_API_TOKEN = getpass("Enter your Replicate API key: ")
    # Export it for the rest of this kernel session.
    os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

### Text to Image

In [None]:
# Run the hosted "black-forest-labs/flux-schnell" model through the Replicate client.
output = replicate.run(
    "black-forest-labs/flux-schnell",
    input={
        "prompt": "A futuristic city floating in the clouds, artstation style",
    },
)

# The first output item is read as raw bytes and decoded into a PIL image.
image_data = output[0].read()
image = Image.open(io.BytesIO(image_data))
image

### Face Swap Model

In [None]:
# Image providing the face to swap in.
url = "https://replicate.delivery/pbxt/Mb44Wp0W7Xfa1Pp91zcxDzSSQQz8GusUmXQXi3GGzRxDvoCI/0_1.webp"
r = requests.get(url, allow_redirects=True, timeout=30)
# Fail with a clear HTTP error instead of an opaque image-decoding error.
r.raise_for_status()
swap_image = Image.open(io.BytesIO(r.content)).convert("RGB")
swap_image


# Image the face is swapped into.
url = "https://replicate.delivery/pbxt/Mb44XIUHkUrmyyH1OP5K1WmFN7SNN0eUSU16A8rBtuXe7eYV/cyberpunk_80s_example.png"
r = requests.get(url, allow_redirects=True, timeout=30)
r.raise_for_status()
target_image = Image.open(io.BytesIO(r.content)).convert("RGB")
target_image

In [None]:
# Model inputs: both images are sent as in-memory files, the remaining options as strings.
face_swap_input = {
    "swap_image": pil_to_replicate_file(swap_image),
    "hair_source": "target",
    "user_gender": "default",
    "target_image": pil_to_replicate_file(target_image),
    "user_b_gender": "default",
}
output = replicate.run("easel/advanced-face-swap", input=face_swap_input)
print(output)

In [None]:
# Read the model output as raw bytes and decode it into a PIL image for display.
image_data = output.read()
image = Image.open(io.BytesIO(image_data))
image

### Image Upscaling

In [None]:
# Low-resolution example image used as input for the upscaling model.
url = "https://m.media-amazon.com/images/M/MV5BMTQxNzI3MjE2OF5BMl5BanBnXkFtZTcwNjk4MjAzMw@@._V1_QL75_UX603_.jpg"
r = requests.get(url, allow_redirects=True, timeout=30)
# Fail with a clear HTTP error instead of an opaque image-decoding error.
r.raise_for_status()
lowres_image = Image.open(io.BytesIO(r.content)).convert("RGB")
lowres_image

In [None]:
# Send the low-resolution image (encoded as PNG, the helper's default format)
# to the "recraft-ai/recraft-crisp-upscale" model.
output = replicate.run(
    "recraft-ai/recraft-crisp-upscale",
    input={
        "image": pil_to_replicate_file(lowres_image),
    },
)

In [None]:
# Read the upscaler output as raw bytes and decode it into a PIL image for display.
image_data = output.read()
image = Image.open(io.BytesIO(image_data))
image