modelscope · akaitsuki-ii · Aug 4, 2025 · Aug 4, 2025 · Aug 4, 2025
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@ and offloading strategies, enabling loading of larger diffusion models (e.g., Fl
 
 ## News
 
+- **[v0.4.1](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.1)** | **August 4, 2025**: 🔥Supports [Qwen-Image](https://www.modelscope.cn/models/Qwen/Qwen-Image), an image generation model that excels at complex text rendering and at creating images in a wide range of artistic styles.
 - **[v0.4.0](https://github.com/modelscope/DiffSynth-Engine/releases/tag/v0.4.0)** | **August 1, 2025**:
   - 🔥Supports [Wan2.2](https://modelscope.cn/collections/tongyiwanxiang-22--shipinshengcheng-2bb5b1adef2840) video generation model
   - ⚠️[**Breaking Change**] Improved `from_pretrained` method pipeline initialization
@@ -49,21 +50,24 @@ pip3 install -e .
 ### Usage
 Text to image
 ```python
-from diffsynth_engine import fetch_model, FluxImagePipeline
+from diffsynth_engine import fetch_model, FluxImagePipeline, FluxPipelineConfig
 
 model_path = fetch_model("muse/flux-with-vae", path="flux1-dev-with-vae.safetensors")
-pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')
+
+config = FluxPipelineConfig.basic_config(model_path=model_path, device="cuda:0")
+pipe = FluxImagePipeline.from_pretrained(config)
 image = pipe(prompt="a cat")
 image.save("image.png")
 ```
 Text to image with LoRA
 ```python
-from diffsynth_engine import fetch_model, FluxImagePipeline
+from diffsynth_engine import fetch_model, FluxImagePipeline, FluxPipelineConfig
 
 model_path = fetch_model("muse/flux-with-vae", path="flux1-dev-with-vae.safetensors")
 lora_path = fetch_model("DonRat/MAJICFLUS_SuperChinesestyleheongsam", path="麦橘超国风旗袍.safetensors")
 
-pipe = FluxImagePipeline.from_pretrained(model_path, device='cuda:0')
+config = FluxPipelineConfig.basic_config(model_path=model_path, device="cuda:0")
+pipe = FluxImagePipeline.from_pretrained(config)
 pipe.load_lora(path=lora_path, scale=1.0)
 image = pipe(prompt="a girl, qipao")
 image.save("image.png")

diff --git a/diffsynth_engine/__init__.py b/diffsynth_engine/__init__.py
@@ -3,6 +3,7 @@
     SDXLPipelineConfig,
     FluxPipelineConfig,
     WanPipelineConfig,
+    QwenImagePipelineConfig,
     ControlNetParams,
     ControlType,
 )
@@ -11,6 +12,7 @@
     SDXLImagePipeline,
     SDImagePipeline,
     WanVideoPipeline,
+    QwenImagePipeline,
 )
 from .models.flux import FluxControlNet, FluxIPAdapter, FluxRedux
 from .models.sd import SDControlNet
@@ -31,6 +33,7 @@
     "FluxPipelineConfig",
     "WanPipelineConfig",
     "FluxImagePipeline",
+    "QwenImagePipelineConfig",
     "FluxControlNet",
     "FluxIPAdapter",
     "FluxRedux",
@@ -39,6 +42,7 @@
     "SDXLImagePipeline",
     "SDImagePipeline",
     "WanVideoPipeline",
+    "QwenImagePipeline",
     "FluxInpaintingTool",
     "FluxOutpaintingTool",
     "FluxIPAdapterRefTool",

diff --git a/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_config.json b/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_config.json
@@ -0,0 +1,25 @@
+{
+    "hidden_size": 3584,
+    "intermediate_size": 18944,
+    "num_hidden_layers": 28,
+    "num_attention_heads": 28,
+    "num_key_value_heads": 4,
+    "mrope_section": [
+        16,
+        24,
+        24
+    ],
+    "rms_norm_eps": 1e-6,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "sliding_window": 32768,
+    "max_window_layers": 28,
+    "vocab_size": 152064,
+    "pad_token_id": 151643,
+    "im_start_token_id": 151644,
+    "im_end_token_id": 151645,
+    "vision_start_token_id": 151652,
+    "vision_end_token_id": 151653,
+    "image_token_id": 151655,
+    "video_token_id": 151656
+}
diff --git a/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_vision_config.json b/diffsynth_engine/conf/models/qwen_image/qwen2_5_vl_vision_config.json
@@ -0,0 +1,19 @@
+{
+    "in_channels": 3,
+    "hidden_size": 1280,
+    "intermediate_size": 3420,
+    "out_hidden_size": 3584,
+    "num_heads": 16,
+    "depth": 32,
+    "patch_size": 14,
+    "temporal_patch_size": 2,
+    "spatial_merge_size": 2,
+    "tokens_per_second": 2,
+    "window_size": 112,
+    "fullatt_block_indexes": [
+        7,
+        15,
+        23,
+        31
+    ]
+}
diff --git a/diffsynth_engine/conf/models/qwen_image/qwen_image_vae.json b/diffsynth_engine/conf/models/qwen_image/qwen_image_vae.json
@@ -0,0 +1,48 @@
+{
+    "in_channels": 3,
+    "out_channels": 3,
+    "encoder_dim": 96,
+    "decoder_dim": 96,
+    "z_dim": 16,
+    "dim_mult": [1, 2, 4, 4],
+    "num_res_blocks": 2,
+    "temperal_downsample": [false, true, true],
+    "dropout": 0.0,
+    "patch_size": 1,
+    "mean": [
+        -0.7571,
+        -0.7089,
+        -0.9113,
+        0.1075,
+        -0.1745,
+        0.9653,
+        -0.1517,
+        1.5508,
+        0.4134,
+        -0.0715,
+        0.5517,
+        -0.3632,
+        -0.1922,
+        -0.9497,
+        0.2503,
+        -0.2921
+    ],
+    "std": [
+        2.8184,
+        1.4541,
+        2.3275,
+        2.6558,
+        1.2196,
+        1.7708,
+        2.6052,
+        2.0743,
+        3.2687,
+        2.1526,
+        2.8652,
+        1.5579,
+        1.6382,
+        1.1253,
+        2.8251,
+        1.9160
+    ]
+}