In [1]:
!pip install diffusers

Collecting diffusers
  Downloading diffusers-0.30.0-py3-none-any.whl.metadata (18 kB)
Downloading diffusers-0.30.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffusers
Successfully installed diffusers-0.30.0


In [2]:
import cv2
import numpy as np
from PIL import Image
import os

In [3]:

def extract_frames(video_path, is_gif=False):
    frames = []

    if is_gif:
        gif = Image.open(video_path)
        for frame in range(gif.n_frames):
            gif.seek(frame)
            frame_image = gif.convert('RGB')
            frame_array = np.array(frame_image)
            frames.append(frame_array)
    else:
        cap = cv2.VideoCapture(video_path)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        for i in range(frame_count):
            ret, frame = cap.read()
            if ret:
                frames.append(frame)
            else:
                break
        cap.release()

    return frames

def calculate_mean_squared_error(frames):
    squared_errors = []
    for i in range(len(frames) - 1):
        frame1 = frames[i]
        frame2 = frames[i + 1]
        squared_error = np.sum((frame1 - frame2) ** 2)
        squared_errors.append(squared_error)

    mean_squared_error = np.mean(squared_errors)
    return mean_squared_error

def compare_videos(video1_path, video2_path, is_gif1=False, is_gif2=False):

    frames1 = extract_frames(video1_path, is_gif=is_gif1)
    frames2 = extract_frames(video2_path, is_gif=is_gif2)

    # Resize frames if they are not of the same shape
    if frames1[0].shape != frames2[0].shape:
        frames2 = [cv2.resize(frame, (frames1[0].shape[1], frames1[0].shape[0])) for frame in frames2]

    # Calculate mean squared error for each video
    mse_video1 = calculate_mean_squared_error(frames1)
    mse_video2 = calculate_mean_squared_error(frames2)

    print(f"Mean Squared Error for Video 1: {mse_video1}")
    print(f"Mean Squared Error for Video 2: {mse_video2}")

    if mse_video1 < mse_video2:
        print("Video 1 has a lower mean squared error between consecutive frames compared to Video 2.")
    else:
        print("Video 2 has a lower mean squared error between consecutive frames compared to Video 1.")


###AnimateDiff

In [4]:
import torch
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [5]:
def generate_gif_from_prompt(prompt, output_path="animation.gif"):
    # Load the motion adapter
    adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)

    # Load SD 1.5 based finetuned model
    model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
    pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
    scheduler = DDIMScheduler.from_pretrained(
        model_id,
        subfolder="scheduler",
        clip_sample=False,
        timestep_spacing="linspace",
        beta_schedule="linear",
        steps_offset=1,
    )
    pipe.scheduler = scheduler

    # Enable memory savings
    pipe.enable_vae_slicing()
    pipe.enable_model_cpu_offload()

    # Generate the frames using the provided prompt
    output = pipe(
        prompt=prompt,
        negative_prompt="bad quality, worse quality",
        num_frames=16,
        guidance_scale=7.5,
        num_inference_steps=25,
        generator=torch.Generator("cpu").manual_seed(42),
    )

    # Export the frames to a GIF
    frames = output.frames[0]
    export_to_gif(frames, output_path)
    print(f"GIF saved to {output_path}")



###ModelscopeT2V

In [6]:
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

def generate_video_from_prompt(prompt, output_path="modelscopet2v.mp4", fps=10):

    pipeline = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")

    # Enable memory savings
    pipeline.enable_model_cpu_offload()
    pipeline.enable_vae_slicing()

    # Generate video frames using the provided prompt
    video_frames = pipeline(prompt).frames[0]

    # Export the frames to a video
    export_to_video(video_frames, output_path, fps=fps)
    print(f"Video saved to {output_path}")


###Inference Pipeline

In [7]:
# Example animatediff usage
prompt = (
    "dogecoin meme"
)
generate_gif_from_prompt(prompt, "animation.gif")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

The config attributes {'motion_activation_fn': 'geglu', 'motion_attention_bias': False, 'motion_cross_attention_dim': None} were passed to MotionAdapter, but are not expected and will be ignored. Please verify your config.json configuration file.


model_index.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

GIF saved to animation.gif


In [8]:
# Example modelscopev2 usage
prompt = "dogecoin meme"
generate_video_from_prompt(prompt, "modelscopet2v.mp4", fps=10)

model_index.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

text_encoder/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/2.82G [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Video saved to modelscopet2v.mp4


In [9]:
# Example usage of compare_videos
video1_path = './modelscopet2v.mp4'
video2_path = './animation.gif'

compare_videos(video1_path, video2_path, is_gif1=False, is_gif2=True)

Mean Squared Error for Video 1: 5936286.8
Mean Squared Error for Video 2: 10089070.533333333
Video 1 has a lower mean squared error between consecutive frames compared to Video 2.


In [10]:
# Example animatediff usage
prompt = (
    "cat memes"
)
generate_gif_from_prompt(prompt, "animation2.gif")

The config attributes {'motion_activation_fn': 'geglu', 'motion_attention_bias': False, 'motion_cross_attention_dim': None} were passed to MotionAdapter, but are not expected and will be ignored. Please verify your config.json configuration file.


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

GIF saved to animation2.gif


In [11]:
# Example modelscopev2 usage
prompt = "cat memes"
generate_video_from_prompt(prompt, "modelscopet2v2.mp4", fps=10)

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Video saved to modelscopet2v2.mp4


In [12]:
# Example usage of compare_videos
video1_path = './modelscopet2v2.mp4'  # Replace with actual path to video1
video2_path = './animation2.gif'  # Replace with actual path to video2

compare_videos(video1_path, video2_path, is_gif1=False, is_gif2=True)

Mean Squared Error for Video 1: 4191880.6
Mean Squared Error for Video 2: 6952409.6
Video 1 has a lower mean squared error between consecutive frames compared to Video 2.


###Training Part

In [None]:
!unzip data.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: images/images/image_2793.jpg  
  inflating: images/images/image_2794.png  
  inflating: images/images/image_2795.png  
  inflating: images/images/image_2796.png  
  inflating: images/images/image_2797.jpg  
  inflating: images/images/image_2798.png  
  inflating: images/images/image_2799.png  
  inflating: images/images/image_28.jpg  
  inflating: images/images/image_280.jpg  
  inflating: images/images/image_2800.png  
  inflating: images/images/image_2801.png  
  inflating: images/images/image_2802.png  
  inflating: images/images/image_2803.png  
  inflating: images/images/image_2804.png  
  inflating: images/images/image_2805.jpg  
  inflating: images/images/image_2806.jpg  
  inflating: images/images/image_2807.jpg  
  inflating: images/images/image_2808.jpg  
  inflating: images/images/image_2809.png  
  inflating: images/images/image_281.png  
  inflating: images/images/image_2810.jpg  
  inflating: im

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
from tqdm import tqdm


In [None]:
class MemeDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, str(self.annotations.iloc[idx, 0]))
        image = Image.open(img_name).convert("RGB")
        label = str(self.annotations.iloc[idx, 1])

        if self.transform:
            image = self.transform(image)

        return image, label

# Example usage
csv_file = './labels.csv'
root_dir = './images/images/'
dataset = MemeDataset(csv_file, root_dir)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [None]:
!pip install diffusers



In [None]:
from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter

# Load the motion adapter
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
# Load SD 1.5 based finetuned model
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16)
scheduler = DDIMScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
    clip_sample=False,
    timestep_spacing="linspace",
    beta_schedule="linear",
    steps_offset=1,
)
pipe.scheduler = scheduler

vae = pipe.vae
unet = pipe.unet
text_encoder = pipe.text_encoder
tokenizer = pipe.tokenizer

# Enable memory savings
pipe.enable_vae_slicing()
pipe.enable_model_cpu_offload()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The config attributes {'motion_activation_fn': 'geglu', 'motion_attention_bias': False, 'motion_cross_attention_dim': None} were passed to MotionAdapter, but are not expected and will be ignored. Please verify your config.json configuration file.


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

The config attributes {'center_input_sample': False, 'flip_sin_to_cos': True, 'freq_shift': 0, 'mid_block_type': 'UNetMidBlock2DCrossAttn', 'only_cross_attention': False, 'attention_head_dim': 8, 'dual_cross_attention': False, 'class_embed_type': None, 'num_class_embeds': None, 'upcast_attention': False, 'resnet_time_scale_shift': 'default', 'resnet_skip_time_act': False, 'resnet_out_scale_factor': 1.0, 'time_embedding_type': 'positional', 'time_embedding_dim': None, 'time_embedding_act_fn': None, 'timestep_post_act': None, 'conv_in_kernel': 3, 'conv_out_kernel': 3, 'class_embeddings_concat': False, 'mid_block_only_cross_attention': None, 'cross_attention_norm': None, 'addition_embed_type_num_heads': 64} were passed to UNetMotionModel, but are not expected and will be ignored. Please verify your config.json configuration file.


In [None]:
import torch.optim as optim

# Training function
def train_model(vae, unet, text_encoder, dataloader, tokenizer, num_epochs=5, lr=1e-4):
    optimizer = optim.Adam(list(vae.parameters()) + list(unet.parameters()) + list(text_encoder.parameters()), lr=lr)

    vae.train()
    unet.train()
    text_encoder.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in tqdm(dataloader):
            images = images.to(vae.device)
            optimizer.zero_grad()

            # Encode the images using VAE
            latents = vae.encode(images).latent_dist.sample()
            latents = latents * 0.18215

            # Convert labels to strings
            labels = [str(label) for label in labels]

            # Tokenize the labels (prompts)
            text_inputs = tokenizer(labels, return_tensors="pt", padding=True, truncation=True)
            text_inputs = text_inputs.input_ids.to(vae.device)
            text_embeddings = text_encoder(text_inputs)[0]

            # Forward pass through UNet
            noise = torch.randn_like(latents).to(vae.device)
            timesteps = torch.randint(0, scheduler.num_train_timesteps, (latents.size(0),), device=vae.device).long()
            noisy_latents = scheduler.add_noise(latents, noise, timesteps)
            outputs = unet(noisy_latents, timesteps, text_embeddings).sample

            # Compute the loss
            loss = torch.nn.functional.mse_loss(outputs, noise)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader)}")

    print("Training complete")

# Prepare the dataset and dataloader
csv_file = './labels.csv'
root_dir = '/content/images/images/'
dataset = MemeDataset(csv_file, root_dir)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Train the model
train_model(vae, unet, text_encoder, dataloader, tokenizer)

  0%|          | 0/1748 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/content/images/images/2940'

In [None]:
# Save the fine-tuned model
vae.save_pretrained("path_to_save_finetuned_model/vae")
unet.save_pretrained("path_to_save_finetuned_model/unet")
text_encoder.save_pretrained("path_to_save_finetuned_model/text_encoder")
