In [None]:
!pip install torch

In [2]:
!pip install diffusers==0.11.1
!pip install transformers scipy ftfy accelerate

Collecting diffusers==0.11.1
  Downloading diffusers-0.11.1-py3-none-any.whl.metadata (29 kB)
Downloading diffusers-0.11.1-py3-none-any.whl (524 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.9/524.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
  Attempting uninstall: diffusers
    Found existing installation: diffusers 0.32.2
    Uninstalling diffusers-0.32.2:
      Successfully uninstalled diffusers-0.32.2
Successfully installed diffusers-0.11.1
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


Stable Diffusion Pipeline
StableDiffusionPipeline is an end-to-end inference pipeline that you can use to generate images from text with just a few lines of code.

First, we load the pre-trained weights of all components of the model. In this notebook we use Stable Diffusion version 1.5 (runwayml/stable-diffusion-v1-5), but there are other variants that you may want to try:

- CompVis/stable-diffusion-v1-4
- stabilityai/stable-diffusion-2-1-base
- stabilityai/stable-diffusion-2-1. This version can produce images with a resolution of 768x768, while the others work at 512x512.

In addition to the model id runwayml/stable-diffusion-v1-5, we're also passing torch_dtype to the from_pretrained method.

We want to ensure that every free Google Colab can run Stable Diffusion, hence we tell diffusers to load the weights in float16 (half) precision by passing torch_dtype=torch.float16 when running on a CUDA GPU; on CPU the code below falls back to torch.float32.

If you want the highest possible precision, remove torch_dtype=torch.float16 so that the weights are loaded in full float32 precision, at the cost of higher memory usage.

In [3]:
!pip install --upgrade huggingface-hub==0.26.2 transformers==4.46.1 tokenizers==0.20.1 diffusers==0.31.0

Collecting huggingface-hub==0.26.2
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.46.1
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers==0.20.1
  Downloading tokenizers-0.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting diffusers==0.31.0
  Downloading diffusers-0.31.0-py3-none-any.whl.metadata (18 kB)
Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB

In [8]:
# Import necessary libraries
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler
import torch

# Define a function to create the text-to-image pipeline
def text_to_image(
    prompt,
    model_id="runwayml/stable-diffusion-v1-5",
    device="cuda" if torch.cuda.is_available() else "cpu",
    save_path="generated_img.png",
    num_inference_steps=75,  # More steps = better quality but slower
    guidance_scale=8.5,      # Higher values = more adherence to the prompt
    height=512, width=512,   # Image resolution
    seed=None                # Seed for reproducibility
):
    """
    Generate an image from a text prompt using a pre-trained Stable Diffusion model.

    The pipeline is loaded from ``model_id``, switched to the Euler-ancestral
    scheduler, moved to ``device``, run once on ``prompt`` and the first
    resulting image is written to ``save_path``.

    Args:
        prompt (str): The text prompt to generate the image.
        model_id (str): The model ID for the pre-trained Stable Diffusion model.
        device (str): The device to use for inference ("cuda" or "cpu"). It also
            selects the weight precision: float16 for CUDA devices, float32 otherwise.
        save_path (str): The path to save the generated image. A missing parent
            directory is created.
        num_inference_steps (int): Number of denoising steps (default: 75).
        guidance_scale (float): How closely to follow the prompt (default: 8.5).
        height (int): Height of the generated image (default: 512).
        width (int): Width of the generated image (default: 512).
        seed (int): Random seed for reproducibility (default: None). When given,
            it seeds a dedicated generator; the global torch RNG is left untouched.

    Returns:
        PIL.Image: The generated image.
    """
    import os  # local import: only needed to prepare the output directory

    # Pick the precision from the device that was actually requested, not from
    # whether CUDA merely exists on this machine: asking for device="cpu" on a
    # GPU host must not load half-precision weights onto the CPU.
    runs_on_cuda = torch.device(device).type == "cuda"
    pipe = StableDiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch.float16 if runs_on_cuda else torch.float32
    )

    # Replace the pipeline's default scheduler, reusing its configuration
    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

    pipe = pipe.to(device)

    # Use a private generator so that seeding this call does not reseed the
    # process-wide torch RNG as a side effect.
    generator = None
    if seed is not None:
        generator = torch.Generator(device="cpu").manual_seed(seed)

    # Generate the image
    print("Generating image...")
    image = pipe(
        prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        height=height,
        width=width,
        generator=generator
    ).images[0]

    # Make sure the target directory exists so the (expensive) result is not
    # lost to a missing folder. Only path-like targets have a directory.
    if isinstance(save_path, (str, os.PathLike)):
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    # Save the generated image
    image.save(save_path)
    print(f"Image saved to {save_path}")

    return image

# Ask interactively for the description of the picture to render
prompt = input("Enter the text prompt for image generation: ")

# Run the pipeline once with explicit, named settings for this render
generated_image = text_to_image(
    prompt=prompt,
    seed=42,                  # fixed seed so the run can be repeated
    height=768,               # taller than wide: portrait aspect ratio
    width=512,
    guidance_scale=10,        # higher value = closer adherence to the prompt
    num_inference_steps=100,  # more steps = better quality but slower
    save_path="generated_img.png",
)



Enter the text prompt for image generation: Bioluminescent mushrooms growing in mystical forest setting, highly detailed


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Generating image...


  0%|          | 0/100 [00:00<?, ?it/s]

Image saved to generated_img.png
