Added img2img

lucataco · Nov 7, 2023 · 418193e · 418193e
1 parent 1dd17f4
commit 418193e
Show file tree

Hide file tree

Showing 8 changed files with 221 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__
 .cog
-model-cache
+safety-cache
+sdxl-cache
diff --git a/README.md b/README.md
@@ -8,10 +8,19 @@ First, download the pre-trained weights:
 
 Then, you can run predictions:
 
-    cog predict -i prompt="with smoke, half ice and half fire and ultra realistic in detail.wolf, typography, dark fantasy, wildlife photography, vibrant, cinematic and on a black background" -i seed=36446545871
+    cog predict -i prompt="with smoke, half ice and half fire and ultra realistic in detail.wolf, typography, dark fantasy, wildlife photography, vibrant, cinematic and on a black background" -i seed=36446545872
 
-## Example:
+Or img2img:
 
-"with smoke, half ice and half fire and ultra realistic in detail.wolf, typography, dark fantasy, wildlife photography, vibrant, cinematic and on a black background"
+    cog predict -i image=@output.0.png -i prompt="a wolf with pink and blue fur" -i seed=21272 -i disable_safety_checker=True
+
+
+## Examples:
+
+txt2img
 
 ![alt text](output.0.png)
+
+img2img
+
+![alt text](output.img2img.png)
diff --git a/cog.yaml b/cog.yaml
@@ -3,11 +3,13 @@ build:
   python_version: "3.11"
   python_packages:
     - "torch==2.0.1"
-    - "torchvision"
-    - "transformers"
-    - "accelerate"
-    - "safetensors"
-    - "git+https://github.com/huggingface/diffusers"
+    - "torchvision==0.15.2"
+    - "transformers==4.31.0"
+    - "diffusers==0.22.0"
+    - "accelerate==0.21.0"
 
+  run:
+    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget
+
 # predict.py defines how predictions are run on your model
 predict: "predict.py:Predictor"
diff --git a/feature-extractor/preprocessor_config.json b/feature-extractor/preprocessor_config.json
@@ -0,0 +1,20 @@
+{
+    "crop_size": 224,
+    "do_center_crop": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_resize": true,
+    "feature_extractor_type": "CLIPFeatureExtractor",
+    "image_mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "image_std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ],
+    "resample": 3,
+    "size": 224
+  }
diff --git a/output.0.png b/output.0.png
diff --git a/output.img2img.png b/output.img2img.png
diff --git a/predict.py b/predict.py
@@ -2,20 +2,34 @@
 
 from cog import BasePredictor, Input, Path
 import os
+import time
 import torch
+import shutil
+import subprocess
+import numpy as np
+from typing import List
+from diffusers.utils import load_image
+from transformers import CLIPImageProcessor
 from diffusers import (
-    StableDiffusionXLPipeline,
+    DiffusionPipeline,
+    StableDiffusionXLImg2ImgPipeline,
+    StableDiffusionXLInpaintPipeline,
     DDIMScheduler,
     DPMSolverMultistepScheduler,
     EulerAncestralDiscreteScheduler,
     EulerDiscreteScheduler,
     HeunDiscreteScheduler,
     PNDMScheduler
 )
-from typing import List
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
 
 MODEL_NAME = "segmind/SSD-1B"
-MODEL_CACHE = "model-cache"
+MODEL_CACHE = "./sdxl-cache"
+SAFETY_CACHE = "./safety-cache"
+FEATURE_EXTRACTOR = "./feature-extractor"
+SAFETY_URL = "https://weights.replicate.delivery/default/sdxl/safety-1.0.tar"
 
 class KarrasDPM:
     def from_config(config):
@@ -31,15 +45,72 @@ def from_config(config):
     "PNDM": PNDMScheduler,
 }
 
+def download_weights(url, dest):
+    # Fetch `url` into `dest` by shelling out to the `pget` binary, printing the
+    # URL, the destination and the elapsed wall-clock time in seconds.
+    start = time.time()
+    print("downloading url: ", url)
+    print("downloading to: ", dest)
+    # `-x` presumably makes pget extract the downloaded archive into `dest` -- TODO confirm against pget docs.
+    # check_call raises subprocess.CalledProcessError when pget exits non-zero.
+    # close_fds=False lets the child inherit this process's open file descriptors.
+    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
+    print("downloading took: ", time.time() - start)
+
 class Predictor(BasePredictor):
     def setup(self) -> None:
         """Load the model into memory to make running multiple predictions efficient"""
-        self.pipe = StableDiffusionXLPipeline.from_pretrained(
+        # Start timing so the total setup duration can be printed at the end.
+        start = time.time()
+        # Flags read by predict(): both start False because no fine-tuned or
+        # LoRA weights are loaded during setup.
+        # NOTE(review): predict() also reads self.token_map and self.refiner,
+        # neither of which is assigned in this method -- confirm they are set elsewhere.
+        self.tuned_model = False
+        self.is_lora = False
+
+        print("Loading safety checker...")
+        # Safety-checker weights are downloaded only when SAFETY_CACHE does not exist yet.
+        if not os.path.exists(SAFETY_CACHE):
+            download_weights(SAFETY_URL, SAFETY_CACHE)
+        self.safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+            SAFETY_CACHE, torch_dtype=torch.float16
+        ).to("cuda")
+        # The image-processor config is read from the local FEATURE_EXTRACTOR directory.
+        self.feature_extractor = CLIPImageProcessor.from_pretrained(FEATURE_EXTRACTOR)
+
+        # Base text-to-image pipeline, loaded in fp16 from MODEL_CACHE and moved to the GPU.
+        # NOTE(review): unlike SAFETY_CACHE there is no download fallback here; MODEL_CACHE
+        # is presumably populated beforehand by script/download-weights -- confirm.
+        self.txt2img_pipe = DiffusionPipeline.from_pretrained(
             MODEL_CACHE,
             torch_dtype=torch.float16,
             use_safetensors=True,
             variant="fp16",
         ).to("cuda")
+        print("Loading SDXL img2img pipeline...")
+        # The img2img pipeline is assembled from the txt2img pipeline's own component
+        # objects (vae, text encoders, tokenizers, unet, scheduler), not loaded from disk.
+        self.img2img_pipe = StableDiffusionXLImg2ImgPipeline(
+            vae=self.txt2img_pipe.vae,
+            text_encoder=self.txt2img_pipe.text_encoder,
+            text_encoder_2=self.txt2img_pipe.text_encoder_2,
+            tokenizer=self.txt2img_pipe.tokenizer,
+            tokenizer_2=self.txt2img_pipe.tokenizer_2,
+            unet=self.txt2img_pipe.unet,
+            scheduler=self.txt2img_pipe.scheduler,
+        )
+        self.img2img_pipe.to("cuda")
+        print("Loading SDXL inpaint pipeline...")
+        # The inpaint pipeline is built the same way, from the same txt2img components.
+        self.inpaint_pipe = StableDiffusionXLInpaintPipeline(
+            vae=self.txt2img_pipe.vae,
+            text_encoder=self.txt2img_pipe.text_encoder,
+            text_encoder_2=self.txt2img_pipe.text_encoder_2,
+            tokenizer=self.txt2img_pipe.tokenizer,
+            tokenizer_2=self.txt2img_pipe.tokenizer_2,
+            unet=self.txt2img_pipe.unet,
+            scheduler=self.txt2img_pipe.scheduler,
+        )
+        self.inpaint_pipe.to("cuda")
+        print("setup took: ", time.time() - start)
+
+    def load_image(self, path):
+        # Copy the file at `path` to a fixed location, open that copy and return it converted to RGB.
+        # The bare `load_image` call below resolves to the module-level import from
+        # diffusers.utils, not to this method.
+        # NOTE(review): the destination is always /tmp/image.png, so each call
+        # overwrites the file written by the previous call.
+        shutil.copyfile(path, "/tmp/image.png")
+        return load_image("/tmp/image.png").convert("RGB")
+
+    def run_safety_checker(self, image):
+        # Run self.safety_checker over `image` and return its (image, has_nsfw_concept) result pair.
+        # `image` is iterated below, so it must be an iterable of images;
+        # predict() passes output.images.
+        safety_checker_input = self.feature_extractor(image, return_tensors="pt").to(
+            "cuda"
+        )
+        # The checker receives each image as a numpy array, plus the feature
+        # extractor's pixel_values cast to float16.
+        np_image = [np.array(val) for val in image]
+        image, has_nsfw_concept = self.safety_checker(
+            images=np_image,
+            clip_input=safety_checker_input.pixel_values.to(torch.float16),
+        )
+        return image, has_nsfw_concept
 
     @torch.inference_mode()
     def predict(
@@ -52,6 +123,14 @@ def predict(
             description="Negative Input prompt",
             default="scary, cartoon, painting"
         ),
+        image: Path = Input(
+            description="Input image for img2img or inpaint mode",
+            default=None,
+        ),
+        mask: Path = Input(
+            description="Input mask for inpaint mode. Black areas will be preserved, white areas will be inpainted.",
+            default=None,
+        ),
         width: int = Input(
             description="Width of output image",
             default=768
@@ -77,17 +156,78 @@ def predict(
         guidance_scale: float = Input(
             description="Scale for classifier-free guidance", ge=1, le=50, default=7.5
         ),
+        prompt_strength: float = Input(
+            description="Prompt strength when using img2img / inpaint. 1.0 corresponds to full destruction of information in image",
+            ge=0.0,
+            le=1.0,
+            default=0.8,
+        ),
         seed: int = Input(
             description="Random seed. Leave blank to randomize the seed", default=None
         ),
+        apply_watermark: bool = Input(
+            description="Applies a watermark to enable determining if an image is generated in downstream applications. If you have other provisions for generating or deploying images safely, you can use this to disable watermarking.",
+            default=True,
+        ),
+        lora_scale: float = Input(
+            description="LoRA additive scale. Only applicable on trained models.",
+            ge=0.0,
+            le=1.0,
+            default=0.6,
+        ),
+        replicate_weights: str = Input(
+            description="Replicate LoRA weights to use. Leave blank to use the default weights.",
+            default=None,
+        ),
+        disable_safety_checker: bool = Input(
+            description="Disable safety checker for generated images. This feature is only available through the API. See https://replicate.com/docs/how-does-replicate-work#safety",
+            default=False
+        )
     ) -> List[Path]:
         """Run a single prediction on the model"""
         if seed is None:
             seed = int.from_bytes(os.urandom(2), "big")
         print(f"Using seed: {seed}")
-        generator = torch.Generator("cuda").manual_seed(seed)
 
-        self.pipe.scheduler = SCHEDULERS[scheduler].from_config(self.pipe.scheduler.config)
+        if replicate_weights:
+            self.load_trained_weights(replicate_weights, self.txt2img_pipe)
+
+         # OOMs can leave vae in bad state
+        if self.txt2img_pipe.vae.dtype == torch.float32:
+            self.txt2img_pipe.vae.to(dtype=torch.float16)
+
+        sdxl_kwargs = {}
+        if self.tuned_model:
+            # consistency with fine-tuning API
+            for k, v in self.token_map.items():
+                prompt = prompt.replace(k, v)
+        print(f"Prompt: {prompt}")
+        if image and mask:
+            print("inpainting mode")
+            sdxl_kwargs["image"] = self.load_image(image)
+            sdxl_kwargs["mask_image"] = self.load_image(mask)
+            sdxl_kwargs["strength"] = prompt_strength
+            sdxl_kwargs["width"] = width
+            sdxl_kwargs["height"] = height
+            pipe = self.inpaint_pipe
+        elif image:
+            print("img2img mode")
+            sdxl_kwargs["image"] = self.load_image(image)
+            sdxl_kwargs["strength"] = prompt_strength
+            pipe = self.img2img_pipe
+        else:
+            print("txt2img mode")
+            sdxl_kwargs["width"] = width
+            sdxl_kwargs["height"] = height
+            pipe = self.txt2img_pipe
+
+        # toggles watermark for this prediction
+        if not apply_watermark:
+            watermark_cache = pipe.watermark
+            pipe.watermark = None
+
+        pipe.scheduler = SCHEDULERS[scheduler].from_config(pipe.scheduler.config)
+        generator = torch.Generator("cuda").manual_seed(seed)
 
         common_args = {
             "prompt": [prompt] * num_outputs,
@@ -97,16 +237,31 @@ def predict(
             "num_inference_steps": num_inference_steps,
         }
 
-        sdxl_kwargs = {}
-        sdxl_kwargs["width"] = width
-        sdxl_kwargs["height"] = height
+        if self.is_lora:
+            sdxl_kwargs["cross_attention_kwargs"] = {"scale": lora_scale}
 
-        output = self.pipe(**common_args, **sdxl_kwargs)
+        output = pipe(**common_args, **sdxl_kwargs)
+
+        if not apply_watermark:
+            pipe.watermark = watermark_cache
+            self.refiner.watermark = watermark_cache
+
+        if not disable_safety_checker:
+            _, has_nsfw_content = self.run_safety_checker(output.images)
 
         output_paths = []
-        for i, _ in enumerate(output.images):
+        for i, image in enumerate(output.images):
+            if not disable_safety_checker:
+                if has_nsfw_content[i]:
+                    print(f"NSFW content detected in image {i}")
+                    continue
             output_path = f"/tmp/out-{i}.png"
-            output.images[i].save(output_path)
+            image.save(output_path)
             output_paths.append(Path(output_path))
 
+        if len(output_paths) == 0:
+            raise Exception(
+                f"NSFW content detected. Try running it again, or try a different prompt."
+            )
+
         return output_paths
diff --git a/script/download-weights b/script/download-weights
@@ -5,6 +5,9 @@ import os
 import sys
 import torch
 from diffusers import StableDiffusionXLPipeline
+from diffusers.pipelines.stable_diffusion.safety_checker import (
+    StableDiffusionSafetyChecker,
+)
 
 # append project directory to path so predict.py can be imported
 sys.path.append('.')
@@ -14,11 +17,19 @@ from predict import MODEL_NAME, MODEL_CACHE
 if not os.path.exists(MODEL_CACHE):
+    # Create the cache directory that the pipeline is saved into below.
     os.makedirs(MODEL_CACHE)
 
-# SD-XL-Base-1.0 refiner
+# SSD-1B
+# Load MODEL_NAME (fp16 variant, safetensors) via from_pretrained and re-save it
+# under MODEL_CACHE, the directory predict.py's setup() loads the pipeline from.
 pipe = StableDiffusionXLPipeline.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch.float16,
     use_safetensors=True,
     variant="fp16",
 )
-pipe.save_pretrained(MODEL_CACHE, safe_serialization=True)
+pipe.save_pretrained(MODEL_CACHE, safe_serialization=True)
+
+
+# safety checker
+# NOTE(review): the save path below is hard-coded; it currently matches
+# SAFETY_CACHE ("./safety-cache") in predict.py -- consider importing that
+# constant so the two values cannot drift apart.
+safety = StableDiffusionSafetyChecker.from_pretrained(
+    "CompVis/stable-diffusion-safety-checker",
+    torch_dtype=torch.float16,
+)
+safety.save_pretrained("./safety-cache")