# SWIFT在【多模态领域】的应用

#### 使用样例见：https://github.com/modelscope/swift/tree/main/examples/pytorch/multi_modal/notebook

## 1. 文本生成图像

### 1.1 安装与导入包

In [None]:
# basic / third-party
import os
from  matplotlib import pyplot as plt
%matplotlib inline

# SWIFT
from swift import Swift, SwiftModel, snapshot_download, push_to_hub
from swift import AdapterConfig, LoRAConfig, PromptConfig, SideConfig, ResTuningConfig

# Modelscope
import modelscope
from modelscope.pipelines import pipeline
from modelscope.models import Model
from modelscope.utils.config import Config
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.constant import ModelFile


### 1.2 Stable Diffusion

High-Resolution Image Synthesis with Latent Diffusion Models

<img src="resources/images/sd.jpg" width="800" align="middle" />

### 1.3 特定主体训练

#### 1.3.1 数据集
- Dogs: https://modelscope.cn/datasets/buptwq/lora-stable-diffusion-finetune/summary

<img src="resources/images/dog.jpeg" width="200" align="middle" />

- 加载数据集

In [None]:
# Load the 'train' and 'validation' splits of the dataset, then rename the
# 'Text' column of each split to 'prompt'.
train_dataset = MsDataset.load('buptwq/lora-stable-diffusion-finetune', split='train')
train_dataset = train_dataset.remap_columns({'Text': 'prompt'})

eval_dataset = MsDataset.load('buptwq/lora-stable-diffusion-finetune', split='validation')
eval_dataset = eval_dataset.remap_columns({'Text': 'prompt'})

#### 1.3.2 使用modelscope加载SD模型

In [None]:
model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-base'
task = 'efficient-diffusion-tuning'
revision = 'v1.0.1'

# Download the same revision that Model.from_pretrained loads below, so the
# configuration file and the model come from one snapshot.
model_dir = snapshot_download(model_id, revision=revision)
cfg_dict = Config.from_file(os.path.join(model_dir, ModelFile.CONFIGURATION))
# Clear the inference flag before the model is built for fine-tuning.
cfg_dict.model.inference = False
model = Model.from_pretrained(model_id, cfg_dict=cfg_dict, revision=revision)

#### 1.3.2.1 查看模型信息

In [None]:
# Print the string representation of the loaded model.
print(model)

In [None]:
# List the name of every entry yielded by model.named_modules(); these names
# are what a `target_modules` pattern is written against.
module_keys = [module_name for module_name, _module in model.named_modules()]
print(module_keys)

#### 1.3.3 配置SwiftConfig + 模型准备

#### Swift - Transformer - LoRA

<img src="resources/images/lora.png" width="500" align="middle" />

In [None]:
# LoRA with rank 16, applied to the modules whose names match the pattern
# below (to_q / to_k / to_v / to_out.0 under `unet`).
lora_config = LoRAConfig(r=16, target_modules=".*unet.*.(to_q|to_k|to_v|to_out.0)$")

# Wrap the base model with the tuner described by the config.
model = Swift.prepare_model(model, lora_config)

#### 1.3.4 查看微调模型信息

In [None]:
# Print the model after Swift.prepare_model, followed by the trainable
# parameter information it reports.
print(model)
print(model.get_trainable_parameters())

#### 1.3.5 训练

In [None]:
def cfg_modify_fn(cfg):
    """Adjust the model's default configuration for LoRA fine-tuning.

    Sets the preprocessing resolution, the learning-rate schedule, the number
    of epochs and the learning rate, clears the inference flag and the
    pretrained tuner, and registers ``SwiftHook`` among the training hooks.

    Args:
        cfg: Configuration object providing ``preprocessor``, ``train``
            (with ``optimizer``) and ``model`` sections. Modified in place.

    Returns:
        The same ``cfg`` object.
    """
    cfg.preprocessor.resolution = 512
    # With LambdaLR, a constant factor of 1 leaves the learning rate unchanged.
    cfg.train.lr_scheduler = {
        'type': 'LambdaLR',
        'lr_lambda': lambda _: 1,
        'last_epoch': -1
    }
    cfg.train.max_epochs = 100
    cfg.train.optimizer.lr = 1e-4
    cfg.model.inference = False
    cfg.model.pretrained_tuner = None

    # The configuration may not define any hooks (attribute missing or None);
    # start from an empty list instead of failing on `.append`.
    trainer_hook = getattr(cfg.train, 'hooks', None)
    if trainer_hook is None:
        trainer_hook = []
    # Register SwiftHook only once, so applying this function twice to the
    # same configuration does not duplicate the hook.
    already_registered = any(
        isinstance(hook, dict) and hook.get('type') == 'SwiftHook'
        for hook in trainer_hook
    )
    if not already_registered:
        trainer_hook.append({"type": "SwiftHook"})
    cfg.train.hooks = trainer_hook
    return cfg

# Output directory for this training run.
work_dir = "tmp/multimodal_swift_lora_1"
kwargs = dict(
    model=model,
    cfg_file=os.path.join(model_dir, 'configuration.json'),
    work_dir=work_dir,
    train_dataset=train_dataset,
    # Evaluate on the 'validation' split loaded earlier rather than on the
    # training data itself.
    eval_dataset=eval_dataset,
    cfg_modify_fn=cfg_modify_fn
)

# Build the trainer from the arguments above and start fine-tuning.
trainer = build_trainer(name='efficient-diffusion-tuning', default_args=kwargs)
trainer.train()

#### 1.3.6 测试

In [None]:
# Load the ModelScope base model with the inference flag enabled. The snapshot
# is downloaded at the same revision that Model.from_pretrained loads, so the
# configuration file and the model match.
model_dir = snapshot_download(model_id, revision=revision)
cfg_dict = Config.from_file(os.path.join(model_dir, ModelFile.CONFIGURATION))
cfg_dict.model.inference = True
model = Model.from_pretrained(model_id, cfg_dict=cfg_dict, revision=revision)

# Create a ModelScope pipeline for inference.
pipe = pipeline(task="efficient-diffusion-tuning", model=model)

# Run inference.
test_prompt = "a dog"
# The last axis of the first output image is reversed -- presumably a
# BGR -> RGB channel swap for matplotlib; TODO confirm.
img_out = pipe({'prompt': test_prompt}, num_inference_steps=50, generator_seed=123)['output_imgs'][0][:,:,::-1]

# Show the image without axis ticks.
plt.xticks([])
plt.yticks([])
plt.imshow(img_out)

In [None]:
# 加载Swift-Tuner模型参数
model = Swift.from_pretrained(model, os.path.join(work_dir, 'output_swift'))

pipe = pipeline(task="efficient-diffusion-tuning", model=model)

test_prompt = "a dog"
img_out = pipe({'prompt': test_prompt}, num_inference_steps=50, generator_seed=123)['output_imgs'][0][:,:,::-1]
plt.xticks([]), plt.yticks([])
plt.imshow(img_out)

### 1.4 特定风格训练

#### 1.4.1 数据集
- Styles: https://modelscope.cn/datasets/damo/style_custom_dataset/summary

<img src="resources/images/flatillustration.jpeg" width="200" align="middle" />

- 加载数据集

In [None]:
# Load the 'train' split of the 'Flatillustration' subset, then rename the
# 'Image:FILE' column to 'target:FILE'.
train_dataset = MsDataset.load(
    'style_custom_dataset', namespace='damo', split='train', subset_name='Flatillustration')
train_dataset = train_dataset.remap_columns({'Image:FILE': 'target:FILE'})

#### 1.4.2 使用 modelscope + Swift 准备微调模型

In [None]:
model_id = 'damo/multi-modal_efficient-diffusion-tuning-swift-base'
task = 'efficient-diffusion-tuning'
revision = 'v1.0.1'

# Download the same revision that Model.from_pretrained loads below, so the
# configuration file and the model come from one snapshot.
model_dir = snapshot_download(model_id, revision=revision)
cfg_dict = Config.from_file(os.path.join(model_dir, ModelFile.CONFIGURATION))
# Clear the inference flag before the model is built for fine-tuning.
cfg_dict.model.inference = False
model = Model.from_pretrained(model_id, cfg_dict=cfg_dict, revision=revision)

# LoRA with rank 128 and alpha 128, applied to modules matching either
# alternative of the pattern: to_q / to_k / to_v / to_out.0 under `unet`, or
# q_proj / k_proj / v_proj / out_proj under `text_encoder`.
lora_config = LoRAConfig(
    r=128, lora_alpha=128,
    target_modules="(.*unet.*.(to_q|to_k|to_v|to_out.0)$)|(.*text_encoder.*.(q_proj|k_proj|v_proj|out_proj)$)")

# Wrap the base model with the tuner described by the config.
model = Swift.prepare_model(model, lora_config)

In [None]:
# Print the model after Swift.prepare_model, followed by the trainable
# parameter information it reports.
print(model)
print(model.get_trainable_parameters())

#### 1.4.3 训练

In [None]:
def cfg_modify_fn(cfg):
    """Adjust the model's default configuration for LoRA fine-tuning.

    Sets the preprocessing resolution, the learning-rate schedule, the number
    of epochs and the learning rate, clears the inference flag and the
    pretrained tuner, and registers ``SwiftHook`` among the training hooks.

    Args:
        cfg: Configuration object providing ``preprocessor``, ``train``
            (with ``optimizer``) and ``model`` sections. Modified in place.

    Returns:
        The same ``cfg`` object.
    """
    cfg.preprocessor.resolution = 512
    # With LambdaLR, a constant factor of 1 leaves the learning rate unchanged.
    cfg.train.lr_scheduler = {
        'type': 'LambdaLR',
        'lr_lambda': lambda _: 1,
        'last_epoch': -1
    }
    cfg.train.max_epochs = 100
    cfg.train.optimizer.lr = 1e-4
    cfg.model.inference = False
    cfg.model.pretrained_tuner = None

    # The configuration may not define any hooks (attribute missing or None);
    # start from an empty list instead of failing on `.append`.
    trainer_hook = getattr(cfg.train, 'hooks', None)
    if trainer_hook is None:
        trainer_hook = []
    # Register SwiftHook only once, so applying this function twice to the
    # same configuration does not duplicate the hook.
    already_registered = any(
        isinstance(hook, dict) and hook.get('type') == 'SwiftHook'
        for hook in trainer_hook
    )
    if not already_registered:
        trainer_hook.append({"type": "SwiftHook"})
    cfg.train.hooks = trainer_hook
    return cfg

# Output directory for this training run.
work_dir = "tmp/multimodal_swift_lora_2"

# Trainer arguments; no evaluation dataset is passed for this run.
kwargs = {
    'model': model,
    'cfg_file': os.path.join(model_dir, 'configuration.json'),
    'work_dir': work_dir,
    'train_dataset': train_dataset,
    'cfg_modify_fn': cfg_modify_fn,
}

# Build the trainer from the arguments above and start fine-tuning.
trainer = build_trainer(name='efficient-diffusion-tuning', default_args=kwargs)
trainer.train()

#### 1.4.4 测试

In [None]:
# Rebuild the base model with the inference flag enabled. The snapshot is
# downloaded at the same revision that Model.from_pretrained loads, so the
# configuration file and the model match.
model_dir = snapshot_download(model_id, revision=revision)
cfg_dict = Config.from_file(os.path.join(model_dir, ModelFile.CONFIGURATION))
cfg_dict.model.inference = True
model = Model.from_pretrained(model_id, cfg_dict=cfg_dict, revision=revision)
# Load the Swift tuner parameters from the training output directory.
model = Swift.from_pretrained(model, os.path.join(work_dir, 'output_swift'))

pipe = pipeline(task="efficient-diffusion-tuning", model=model)

test_prompt = "a dog"
# The last axis of the first output image is reversed -- presumably a
# BGR -> RGB channel swap for matplotlib; TODO confirm.
img_out = pipe({'prompt': test_prompt}, num_inference_steps=50, generator_seed=123)['output_imgs'][0][:,:,::-1]

# Show the image without axis ticks.
plt.xticks([])
plt.yticks([])
plt.imshow(img_out)