In [3]:
import os
import io
import copy
from dataclasses import dataclass, field
import json
import logging
import pathlib
from typing import Dict, Optional, Sequence, List
import time
import torch, gc
import glob
import transformers
import tokenizers
import random
from torch.utils.data import Dataset
from PIL import Image, ImageFile
from datasets import load_dataset, concatenate_datasets
from pathlib import Path
from datasets.utils.logging import set_verbosity_info
from transformers import logging as tf_logging
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoProcessor
from functools import partial

In [4]:
def process_sample(sample):
    try:
        metadata = sample["json"]
        return {
            "caption": metadata.get("caption"),
            "cot": metadata.get("cot"),
            "aspect_ratio": metadata.get("aspect_ratio"),
            "img_index": metadata.get("img_index")
        }
    except Exception as e:
        print(f"Error processing sample: {e}")
        return None

In [5]:
from torch.utils.data import Dataset
from datasets import concatenate_datasets
import glob
from datasets import load_dataset
import torch
import json
import os

class LazySupervisedMixDataset(Dataset):
    def __init__(
        self,
        data_path: str,
        processor: AutoProcessor,
    ):
        super().__init__()
        self.data_files = glob.glob(os.path.join(data_path, "*.tar"))  # 保存 tar 文件列表
        train_datasets = []
        self.offsets = [0]  # 记录每个 tar 文件的起始索引

        # 逐个加载并处理 tar 文件
        for data_file in self.data_files:
            raw_dataset = load_dataset("webdataset", data_files=[data_file], split="train", num_proc=128)
            train_dataset = raw_dataset.map(process_sample).filter(lambda x: x is not None)
            train_datasets.append(train_dataset)
            self.offsets.append(self.offsets[-1] + len(train_dataset))  # 累积样本数

        # 合并数据集
        if len(train_datasets) > 1:
            self.list_data_dict = concatenate_datasets(train_datasets)
        else:
            self.list_data_dict = train_datasets[0]

        self.processor = processor

    # def process_sample(self, sample):
    #     # 示例处理逻辑，确保返回有效样本
    #     if 'caption' not in sample or 'cot' not in sample:
    #         return None
    #     return sample

    def get_tar_info(self, index: int):
        """根据全局索引定位到 tar 文件和文件内偏移量"""
        for i in range(len(self.offsets) - 1):
            if self.offsets[i] <= index < self.offsets[i + 1]:
                tar_index = i  # tar 文件索引
                file_index = index - self.offsets[i]  # 文件内偏移量
                return self.data_files[tar_index], file_index
        raise IndexError(f"Index {index} out of range")

    def __len__(self):
        return len(self.list_data_dict)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        sources = self.list_data_dict[i]

        conversation = [
            {"role": "<|User|>", "content": sources['caption']},
            {"role": "<|Assistant|>", "content": f"{sources['cot']}<begin_of_image><end_of_image>"},
        ]
        system_prompt = "You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it."
        prompt = self.processor.apply_sft_template_for_multi_turn_prompts(
            conversations=conversation,
            sft_format=self.processor.sft_format,
            system_prompt=system_prompt,
        )

        # Tokenize prompt
        text_ids = self.processor.tokenizer.encode(prompt)
        all_ids = text_ids[:-2] + sources['img_index'] + text_ids[-2:]
        all_ids = torch.LongTensor(all_ids)

        # 构建图像 token 的 mask
        all_image_ids_mask = torch.zeros(all_ids.shape, dtype=torch.bool)
        all_image_ids_mask[-len(sources['img_index'])-2:-2] = True

        # 找到 Assistant 回答开始的位置
        try:
            assistant_start_token_id = self.processor.tokenizer.encode("<|Assistant|>")[0]
            assistant_start_index = text_ids.index(assistant_start_token_id)
        except (ValueError, IndexError):
            assistant_start_index = 0

        assistant_ids_mask = torch.zeros(all_ids.shape, dtype=torch.bool)
        assistant_ids_mask[assistant_start_index:] = True

        # 构造输入和标签
        input_ids = all_ids[:-1]
        text_ids_mask = (all_image_ids_mask[:-1] == False)
        image_ids_mask = all_image_ids_mask[:-1]
        label_ids = all_ids[1:]
        label_text_ids_mask = assistant_ids_mask[1:] & (all_image_ids_mask[1:] == False)
        label_image_ids_mask = assistant_ids_mask[1:] & all_image_ids_mask[1:]

        return {
            "input_ids": input_ids,
            "label_ids": label_ids,
            "text_ids_mask": text_ids_mask,
            "image_ids_mask": image_ids_mask,
            "label_text_ids_mask": label_text_ids_mask,
            "label_image_ids_mask": label_image_ids_mask,
        }

In [7]:
from janus.models.processing_vlm import VLChatProcessor
processor: VLChatProcessor = VLChatProcessor.from_pretrained("deepseek-ai/Janus-Pro-7B")
tokenizer = processor.tokenizer
print(tokenizer.model_max_length)
padding_id = tokenizer.pad_token_id
print(f"Padding ID: {padding_id}")


16384
Padding ID: 100015


In [59]:
data_files = glob.glob(os.path.join("/home/v-haodongli/mnt/v-haodongli-container/cot_output_test_train", "*.tar"))
# train_dataset = load_dataset("webdataset", data_files=data_files, split="train", streaming=True ,num_proc=8)
train_dataset = load_dataset("webdataset", data_files=data_files, split="train", num_proc=8)

In [63]:
train_dataset

Dataset({
    features: ['json', '__key__', '__url__'],
    num_rows: 2819186
})

In [61]:
train_dataset[1]

{'json': {'aspect_ratio': '3:2',
  'caption': 'Hoodoos Rule by Charlene Reinauer',
  'cot': 'The image depicts a landscape dominated by hoodoos, which are tall, thin rock formations with a conical or columnar shape. The hoodoos are primarily orange and brown in color, with some darker patches indicating variations in rock composition or weathering. The formations are closely packed together, creating a dense cluster. The background features a forested area with green trees, providing a contrast to the reddish hues of the hoodoos. The sky is not visible, focusing the viewer\'s attention on the rock formations and the forest. The style of the image is a natural, outdoor scene, likely taken during the day given the lighting and shadows. The image is titled "Hoodoos Rule" by Charlene Reinauer, suggesting a focus on the unique geological features and the artist\'s perspective.',
  'img_index': [4428,
   14063,
   13880,
   14031,
   10621,
   5473,
   6442,
   10630,
   6781,
   2956,
   15

In [29]:
def process_sample(sample):
    try:
        metadata = sample["json"]
        return {
            "caption": metadata.get("caption"),
            "cot": metadata.get("cot"),
            "aspect_ratio": metadata.get("aspect_ratio"),
            "img_index": metadata.get("img_index")
        }
    except Exception as e:
        print(f"Error processing sample: {e}")
        return None

In [36]:
import os
from dataclasses import dataclass
from typing import Dict, Sequence
import torch
from torch.utils.data import Dataset
from datasets import load_dataset, concatenate_datasets
from transformers import AutoProcessor
import glob
import transformers
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from datasets import load_dataset
import glob
from accelerate import Accelerator

In [111]:
from dataclasses import dataclass
from typing import Sequence, Dict, Any
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import PreTrainedTokenizer

@dataclass
class DataCollatorForSupervisedDataset:
    """Collate examples for supervised fine-tuning."""
    tokenizer: PreTrainedTokenizer
    processor: Any  # 替换为你的具体 processor 类型（如 VLMProcessor）
    max_length: int = 1024
    IGNORE_INDEX: int = -100

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids_list = []
        labels_list = []
        text_ids_mask_list = []
        image_ids_mask_list = []
        label_text_ids_mask_list = []
        label_image_ids_mask_list = []

        for instance in instances:
            # 提取 caption 和 img_index
            try:
                json_data = instance['json']
                caption = json_data['caption']
                cot = json_data['cot']  # 注意这里新增了 cot 字段
                img_index = json_data['img_index']  # list of int 或者 tensor
            except KeyError as e:
                raise ValueError(f"Missing key in instance: {e}")

            # 构造 conversation
            conversation = [
                {"role": "<|User|>", "content": caption},
                {"role": "<|Assistant|>", "content": f"{cot}<begin_of_image><end_of_image>"},
            ]
            system_prompt = "You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it."

            # 使用 self.processor 来生成 prompt
            prompt = self.processor.apply_sft_template_for_multi_turn_prompts(
                conversations=conversation,
                sft_format=self.processor.sft_format,
                system_prompt=system_prompt,
            )

            # Tokenize prompt
            text_ids = self.tokenizer.encode(prompt)

            # 插入图像 token ID
            all_ids = text_ids[:-2] + img_index + text_ids[-2:]
            all_ids = torch.LongTensor(all_ids)

            # 构建图像 token 的 mask
            all_image_ids_mask = torch.zeros(len(all_ids), dtype=torch.bool)
            all_image_ids_mask[-len(img_index)-2:-2] = True

            # 找到 Assistant 回答开始的位置
            try:
                assistant_start_token_id = self.tokenizer.encode("<|Assistant|>")[0]
                assistant_start_index = (all_ids == assistant_start_token_id).nonzero(as_tuple=True)[0][0].item()
            except Exception:
                assistant_start_index = 0

            # 构造各类 mask
            assistant_mask = torch.zeros(len(all_ids), dtype=torch.bool)
            assistant_mask[assistant_start_index:] = True

            # 构造 input 和 label
            input_ids = all_ids[:-1]
            label_ids = all_ids[1:]

            text_mask = (all_image_ids_mask[:-1] == False)
            image_mask = all_image_ids_mask[:-1]

            label_text_mask = assistant_mask[1:] & (all_image_ids_mask[1:] == False)
            label_image_mask = assistant_mask[1:] & all_image_ids_mask[1:]

            # 只保留 label 中需要的部分，其他设为 IGNORE_INDEX
            label_ids[~label_text_mask] = self.IGNORE_INDEX

            # 添加进列表
            input_ids_list.append(input_ids)
            labels_list.append(label_ids)
            text_ids_mask_list.append(text_mask)
            image_ids_mask_list.append(image_mask)
            label_text_ids_mask_list.append(label_text_mask)
            label_image_ids_mask_list.append(label_image_mask)

        # Padding 处理
        input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_sequence(labels_list, batch_first=True, padding_value=self.IGNORE_INDEX)
        text_ids_mask = pad_sequence(text_ids_mask_list, batch_first=True, padding_value=False)
        image_ids_mask = pad_sequence(image_ids_mask_list, batch_first=True, padding_value=False)
        label_text_ids_mask = pad_sequence(label_text_ids_mask_list, batch_first=True, padding_value=False)
        label_image_ids_mask = pad_sequence(label_image_ids_mask_list, batch_first=True, padding_value=False)

        # 截断处理
        if input_ids.size(1) > self.max_length:
            input_ids = input_ids[:, :self.max_length]
            labels = labels[:, :self.max_length]
            text_ids_mask = text_ids_mask[:, :self.max_length]
            image_ids_mask = image_ids_mask[:, :self.max_length]
            label_text_ids_mask = label_text_ids_mask[:, :self.max_length]
            label_image_ids_mask = label_image_ids_mask[:, :self.max_length]

        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=(input_ids != self.tokenizer.pad_token_id),
            text_ids_mask=text_ids_mask,
            image_ids_mask=image_ids_mask,
            label_text_ids_mask=label_text_ids_mask,
            label_image_ids_mask=label_image_ids_mask,
        )

In [117]:
# 假设你已经有一个 processor 实例
collator = DataCollatorForSupervisedDataset(
    tokenizer=processor.tokenizer,
    processor=processor,
    max_length=2048
)

# 测试一下
batch = collator([train_dataset[1], train_dataset[2], train_dataset[3]])
batch


{'input_ids': tensor([[100000,   2054,    418,  ..., 100015, 100015, 100015],
         [100000,   2054,    418,  ..., 100015, 100015, 100015],
         [100000,   2054,    418,  ...,   -100,   -100, 100593]]),
 'labels': tensor([[  2054,    418,    274,  ...,   -100,   -100,   -100],
         [  2054,    418,    274,  ...,   -100,   -100,   -100],
         [  2054,    418,    274,  ...,   -100, 100593, 100001]]),
 'attention_mask': tensor([[ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ...,  True,  True,  True]]),
 'text_ids_mask': tensor([[ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False,  True]]),
 'image_ids_mask': tensor([[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ...,  True,  True, Fal

In [115]:
batches_per_epoch = len(train_dataset) // 8
batches_per_epoch

352398

In [65]:
tar_file, file_index = train_dataset.get_tar_info(2132052)
print(f"索引 2132052 的数据位于 tar 文件: {tar_file}，文件内第 {file_index} 个样本")

AttributeError: 'Dataset' object has no attribute 'get_tar_info'

In [None]:
import webdataset as wds

shard_path = '/mnt/v-haodongli/cot_output_test_train/02166.tar'
target_key = "02166_00028"  # 想看的 key

dataset = wds.WebDataset(shard_path).decode().to_tuple("__key__", "json")

for key, label in dataset:
    if key == target_key:
        print("Key:", key)
        print("Label:", label)
        break  # 找到后退出循环   

In [87]:
processor: VLChatProcessor = VLChatProcessor.from_pretrained("deepseek-ai/Janus-Pro-7B")

In [105]:
from dataclasses import dataclass
from typing import Sequence, Dict, Any
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import PreTrainedTokenizer

@dataclass
class DataCollatorForSupervisedDataset:
    """Collate examples for supervised fine-tuning."""
    tokenizer: PreTrainedTokenizer
    max_length: int = 1024
    IGNORE_INDEX: int = -100

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids_list = []
        labels_list = []
        text_ids_mask_list = []
        image_ids_mask_list = []
        label_text_ids_mask_list = []
        label_image_ids_mask_list = []

        for instance in instances:
            # 提取 caption 和 img_index
            try:
                json_data = instance['json']
                caption = json_data['caption']
                img_index = json_data['img_index']  # list of int 或者 tensor
            except KeyError as e:
                raise ValueError(f"Missing key in instance: {e}")

            # 构造 conversation
            conversation = [
                {"role": "<|User|>", "content": caption},
                {"role": "<|Assistant|>", "content": f"{caption}<begin_of_image><end_of_image>"},
            ]
            system_prompt = "You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it."

            # Tokenize prompt
            prompt = self.tokenizer.apply_sft_template_for_multi_turn_prompts(
                conversations=conversation,
                sft_format="simple",  # 根据你的 processor 修改
                system_prompt=system_prompt,
            )
            text_ids = self.tokenizer.encode(prompt)

            # 插入图像 token ID
            all_ids = text_ids[:-2] + img_index + text_ids[-2:]
            all_ids = torch.LongTensor(all_ids)

            # 构建图像 token 的 mask
            all_image_ids_mask = torch.zeros(len(all_ids), dtype=torch.bool)
            all_image_ids_mask[-len(img_index)-2:-2] = True

            # 找到 Assistant 回答开始的位置
            try:
                assistant_start_token_id = self.tokenizer.encode("<|Assistant|>")[0]
                assistant_start_index = (all_ids == assistant_start_token_id).nonzero(as_tuple=True)[0][0].item()
            except Exception:
                assistant_start_index = 0

            # 构造各类 mask
            assistant_mask = torch.zeros(len(all_ids), dtype=torch.bool)
            assistant_mask[assistant_start_index:] = True

            # 构造 input 和 label
            input_ids = all_ids[:-1]
            label_ids = all_ids[1:]

            text_mask = (all_image_ids_mask[:-1] == False)
            image_mask = all_image_ids_mask[:-1]

            label_text_mask = assistant_mask[1:] & (all_image_ids_mask[1:] == False)
            label_image_mask = assistant_mask[1:] & all_image_ids_mask[1:]

            # 只保留 label 中需要的部分，其他设为 IGNORE_INDEX
            label_ids[~label_text_mask] = self.IGNORE_INDEX

            # 添加进列表
            input_ids_list.append(input_ids)
            labels_list.append(label_ids)
            text_ids_mask_list.append(text_mask)
            image_ids_mask_list.append(image_mask)
            label_text_ids_mask_list.append(label_text_mask)
            label_image_ids_mask_list.append(label_image_mask)

        # Padding 处理
        input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = pad_sequence(labels_list, batch_first=True, padding_value=self.IGNORE_INDEX)
        text_ids_mask = pad_sequence(text_ids_mask_list, batch_first=True, padding_value=False)
        image_ids_mask = pad_sequence(image_ids_mask_list, batch_first=True, padding_value=False)
        label_text_ids_mask = pad_sequence(label_text_ids_mask_list, batch_first=True, padding_value=False)
        label_image_ids_mask = pad_sequence(label_image_ids_mask_list, batch_first=True, padding_value=False)

        # 截断处理
        if input_ids.size(1) > self.max_length:
            input_ids = input_ids[:, :self.max_length]
            labels = labels[:, :self.max_length]
            text_ids_mask = text_ids_mask[:, :self.max_length]
            image_ids_mask = image_ids_mask[:, :self.max_length]
            label_text_ids_mask = label_text_ids_mask[:, :self.max_length]
            label_image_ids_mask = label_image_ids_mask[:, :self.max_length]

        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=(input_ids != self.tokenizer.pad_token_id),
            text_ids_mask=text_ids_mask,
            image_ids_mask=image_ids_mask,
            label_text_ids_mask=label_text_ids_mask,
            label_image_ids_mask=label_image_ids_mask,
        )

In [107]:
collator = DataCollatorForSupervisedDataset(tokenizer=processor.tokenizer)

dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    collate_fn=collator,
    num_workers=8,
    pin_memory=True
)

for batch in dataloader:
    print(batch)
    break  # 只打印第一个 batch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/v-haodongli/miniconda3/envs/janus/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/v-haodongli/miniconda3/envs/janus/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/tmp/ipykernel_9383/2233236194.py", line 39, in __call__
    prompt = self.tokenizer.apply_sft_template_for_multi_turn_prompts(
  File "/home/v-haodongli/miniconda3/envs/janus/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 1108, in __getattr__
    raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
AttributeError: LlamaTokenizerFast has no attribute apply_sft_template_for_multi_turn_prompts


In [99]:
def collate_batch(instances, tokenizer, max_length=1024, IGNORE_INDEX=-100):
    input_ids_list = []
    labels_list = []
    text_ids_mask_list = []
    image_ids_mask_list = []
    label_text_ids_mask_list = []
    label_image_ids_mask_list = []

    for instance in instances:
        # 提取 caption 和 img_index
        try:
            json_data = instance['json']
            caption = json_data['caption']
            img_index = json_data['img_index']  # list of int 或者 tensor
        except KeyError as e:
            raise ValueError(f"Missing key in instance: {e}")

        # 构造 conversation
        conversation = [
            {"role": "<|User|>", "content": caption},
            {"role": "<|Assistant|>", "content": f"{caption}<begin_of_image><end_of_image>"},
        ]
        system_prompt = "You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it."

        # Tokenize prompt
        prompt = tokenizer.apply_sft_template_for_multi_turn_prompts(
            conversations=conversation,
            sft_format="simple",  # 根据你的 processor 修改
            system_prompt=system_prompt,
        )
        text_ids = tokenizer.encode(prompt)

        # 插入图像 token ID
        all_ids = text_ids[:-2] + img_index + text_ids[-2:]
        all_ids = torch.LongTensor(all_ids)

        # 构建图像 token 的 mask
        all_image_ids_mask = torch.zeros(len(all_ids), dtype=torch.bool)
        all_image_ids_mask[-len(img_index)-2:-2] = True

        # 找到 Assistant 回答开始的位置
        try:
            assistant_start_token_id = tokenizer.encode("<|Assistant|>")[0]
            assistant_start_index = (all_ids == assistant_start_token_id).nonzero(as_tuple=True)[0][0].item()
        except Exception:
            assistant_start_index = 0

        # 构造各类 mask
        assistant_mask = torch.zeros(len(all_ids), dtype=torch.bool)
        assistant_mask[assistant_start_index:] = True

        # 构造 input 和 label
        input_ids = all_ids[:-1]
        label_ids = all_ids[1:]

        text_mask = (all_image_ids_mask[:-1] == False)
        image_mask = all_image_ids_mask[:-1]

        label_text_mask = assistant_mask[1:] & (all_image_ids_mask[1:] == False)
        label_image_mask = assistant_mask[1:] & all_image_ids_mask[1:]

        # 只保留 label 中需要的部分，其他设为 IGNORE_INDEX
        label_ids[~label_text_mask] = IGNORE_INDEX

        # 添加进列表
        input_ids_list.append(input_ids)
        labels_list.append(label_ids)
        text_ids_mask_list.append(text_mask)
        image_ids_mask_list.append(image_mask)
        label_text_ids_mask_list.append(label_text_mask)
        label_image_ids_mask_list.append(label_image_mask)

    # Padding 处理
    input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence(labels_list, batch_first=True, padding_value=IGNORE_INDEX)
    text_ids_mask = pad_sequence(text_ids_mask_list, batch_first=True, padding_value=False)
    image_ids_mask = pad_sequence(image_ids_mask_list, batch_first=True, padding_value=False)
    label_text_ids_mask = pad_sequence(label_text_ids_mask_list, batch_first=True, padding_value=False)
    label_image_ids_mask = pad_sequence(label_image_ids_mask_list, batch_first=True, padding_value=False)

    # 截断处理
    if input_ids.size(1) > max_length:
        input_ids = input_ids[:, :max_length]
        labels = labels[:, :max_length]
        text_ids_mask = text_ids_mask[:, :max_length]
        image_ids_mask = image_ids_mask[:, :max_length]
        label_text_ids_mask = label_text_ids_mask[:, :max_length]
        label_image_ids_mask = label_image_ids_mask[:, :max_length]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": (input_ids != tokenizer.pad_token_id),
        "text_ids_mask": text_ids_mask,
        "image_ids_mask": image_ids_mask,
        "label_text_ids_mask": label_text_ids_mask,
        "label_image_ids_mask": label_image_ids_mask,
    }

In [102]:
dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    collate_fn=DataCollatorForSupervisedDataset(),
    num_workers=8,
    pin_memory=True
)

TypeError: DataCollatorForSupervisedDataset.__init__() missing 2 required positional arguments: 'tokenizer' and 'processor'

In [None]:
for i in range(len(train_dataset)):
    try:
        sample = train_dataset[i]
        if len(sample['input_ids'])>2000:
            print(f"Sample index {i}: input_ids length = {len(sample['input_ids'])}")
    except Exception as e:
        print(f"Error at index {i}: {e}")
        # 打印原始数据源信息
        print("Raw data:", train_dataset.data[i])
        continue

In [None]:
start_index = 2010347
for i in range(start_index, len(train_dataset)):
    try:
        sample = train_dataset[i]
        if len(sample['input_ids']) > 2000:  # 检查 input_ids 长度
            print(f"Sample index {i}: input_ids length = {len(sample['input_ids'])}")
    except Exception as e:
        print(f"Error at index {i}: {e}")  # 打印错误信息
        print("Raw data:", train_dataset.data[i])  # 打印原始数据（需确保 data 属性存在）
        continue  # 跳过当前错误样本，继续循环

In [None]:
start_index = 2132053
for i in range(start_index, len(train_dataset)):
    try:
        sample = train_dataset[i]
        if len(sample['input_ids']) > 2000:  # 检查 input_ids 长度
            print(f"Sample index {i}: input_ids length = {len(sample['input_ids'])}")
    except Exception as e:
        print(f"Error at index {i}: {e}")  # 打印错误信息
        print("Raw data:", train_dataset.data[i])  # 打印原始数据（需确保 data 属性存在）
        continue  # 跳过当前错误样本，继续循环

In [None]:
import webdataset as wds
import os

# 输入路径（同时也是目标输出路径）
input_shard = '/mnt/v-haodongli/cot_output_test_train/02166.tar'

# 临时中间文件路径
temp_shard = input_shard + ".tmp"

# 想要删除的 key
target_key_to_remove = "02166_00027"

# 第一步：读取原始 tar，过滤后写入临时文件
with wds.TarWriter(temp_shard) as sink:
    with wds.WebDataset(input_shard) as dataset:
        for sample in dataset:
            if sample["__key__"] == target_key_to_remove:
                print(f"Skipping key: {target_key_to_remove}")
                continue
            sink.write(sample)

# 第二步：将临时文件替换回原文件名（覆盖原文件）
os.replace(temp_shard, input_shard)

print(f"Done. Removed key '{target_key_to_remove}' and overwritten the original file.")

In [6]:
import torch

# 文件路径
file_path = "/scratch/amlt_code/debug_batch_step_327_rank_3.pt"

# 加载文件
data = torch.load(file_path, map_location='cpu')  # 建议先加载到 CPU 上

# 打印所有 key
print("Keys in the saved file:")
print(data.keys())

# 打印每个 key 对应的数据形状或内容
print("\nData details:")
for key, value in data.items():
    if isinstance(value, torch.Tensor):
        print(f"{key}: {value.shape} | dtype: {value.dtype}")
    else:
        print(f"{key}: {type(value)}")
        if isinstance(value, dict):
            for k, v in value.items():
                if isinstance(v, torch.Tensor):
                    print(f"  {k}: {v.shape} | dtype: {v.dtype}")
                else:
                    print(f"  {k}: {type(v)}")
        else:
            print(f"  Value: {value}")

Keys in the saved file:
dict_keys(['batch', 'global_step', 'input_ids', 'text_id_mask', 'image_id_mask', 'label_ids', 'label_text_id_mask', 'label_image_id_mask'])

Data details:
batch: <class 'dict'>
  input_ids: torch.Size([4, 1176]) | dtype: torch.int64
  label_ids: torch.Size([4, 1176]) | dtype: torch.int64
  attention_mask: torch.Size([4, 1176]) | dtype: torch.bool
  text_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
  image_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
  label_text_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
  label_image_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
global_step: <class 'int'>
  Value: 326
input_ids: torch.Size([4, 1176]) | dtype: torch.int64
text_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
image_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
label_ids: torch.Size([4, 1176]) | dtype: torch.int64
label_text_id_mask: torch.Size([4, 1176]) | dtype: torch.bool
label_image_id_mask: torch.Size([4, 1176]) | dtype: torch.

In [7]:
label_ids = data["label_ids"]
label_text_ids_mask = data["label_text_id_mask"]
input_ids = data["input_ids"]
image_ids_mask = data["image_id_mask"]


In [8]:
text = processor.tokenizer.decode(label_ids[label_text_ids_mask], skip_special_tokens=False)

In [9]:
text

'You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it.\n\n<|User|>: Valentines table runner\n\n<|Assistant|>: A Valentine\'s table runner is displayed. It is rectangular in shape, with a red and white color scheme. The runner is composed of numerous small squares, each featuring a heart design. The hearts are red and white, with some having a three-dimensional texture. The squares are arranged in a grid pattern, creating a patchwork effect. The table runner is placed on a wooden surface, which is partially visible at the top and bottom edges of the image. The style of the image is a close-up photograph, focusing on the intricate details of the table runner.<begin_of_image><end_of_image><｜end▁of▁sentence｜>You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it.\n\n<|User|>: The Coolest Cars In Racing Game History\n\n<|Assistant|>: The image depicts a racing game interfa

In [5]:
import numpy as np
def decode_to_pil(vq_list, vl_gpt, shape=(1, 8, 24, 24)):
    # 将列表转为张量并移动到GPU
    vq_tensor = torch.tensor(vq_list, dtype=torch.int, device="cuda")
    print(vq_tensor.shape)
    # 解码图像数据（假设vl_gpt已加载）
    with torch.no_grad():
        dec = vl_gpt.gen_vision_model.decode_code(vq_tensor, shape=shape)
    
    # 后处理：张量转图像
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
    dec = np.clip((dec + 1) / 2 * 255, 0, 255).astype(np.uint8)
    return Image.fromarray(dec[0])

In [10]:
from janus.models.modeling_vlm import MultiModalityCausalLM
model: MultiModalityCausalLM = MultiModalityCausalLM.from_pretrained(
        "deepseek-ai/Janus-Pro-7B",
        trust_remote_code=True).to("cuda")
image = decode_to_pil(input_ids[image_ids_mask].tolist(), model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


torch.Size([2304])


RuntimeError: shape '[1, 24, 24, 8]' is invalid for input of size 18432

In [1]:
import torch

# 文件路径
file_path = "/scratch/amlt_code/debug_batch_step_2_rank_3.pt"

# 加载文件
data = torch.load(file_path, map_location='cpu')  # 建议先加载到 CPU 上

# 打印所有 key
print("Keys in the saved file:")
print(data.keys())

# 打印每个 key 对应的数据形状或内容
print("\nData details:")
for key, value in data.items():
    if isinstance(value, torch.Tensor):
        print(f"{key}: {value.shape} | dtype: {value.dtype}")
    else:
        print(f"{key}: {type(value)}")
        if isinstance(value, dict):
            for k, v in value.items():
                if isinstance(v, torch.Tensor):
                    print(f"  {k}: {v.shape} | dtype: {v.dtype}")
                else:
                    print(f"  {k}: {type(v)}")
        else:
            print(f"  Value: {value}")

Keys in the saved file:
dict_keys(['batch', 'global_step', 'input_ids', 'text_id_mask', 'image_id_mask', 'label_ids', 'label_text_id_mask', 'label_image_id_mask'])

Data details:
batch: <class 'dict'>
  input_ids: torch.Size([4, 780]) | dtype: torch.int64
  label_ids: torch.Size([4, 780]) | dtype: torch.int64
  attention_mask: torch.Size([4, 780]) | dtype: torch.bool
  text_id_mask: torch.Size([4, 780]) | dtype: torch.bool
  image_id_mask: torch.Size([4, 780]) | dtype: torch.bool
  label_text_id_mask: torch.Size([4, 780]) | dtype: torch.bool
  label_image_id_mask: torch.Size([4, 780]) | dtype: torch.bool
global_step: <class 'int'>
  Value: 1
input_ids: torch.Size([4, 780]) | dtype: torch.int64
text_id_mask: torch.Size([4, 780]) | dtype: torch.bool
image_id_mask: torch.Size([4, 780]) | dtype: torch.bool
label_ids: torch.Size([4, 780]) | dtype: torch.int64
label_text_id_mask: torch.Size([4, 780]) | dtype: torch.bool
label_image_id_mask: torch.Size([4, 780]) | dtype: torch.bool


In [2]:
label_ids = data["label_ids"]
label_text_ids_mask = data["label_text_id_mask"]
input_ids = data["input_ids"]
image_ids_mask = data["image_id_mask"]


In [6]:
from janus.models.modeling_vlm import MultiModalityCausalLM
model: MultiModalityCausalLM = MultiModalityCausalLM.from_pretrained(
        "deepseek-ai/Janus-Pro-7B",
        trust_remote_code=True).to("cuda")
image = decode_to_pil(input_ids[image_ids_mask].tolist(), model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]


torch.Size([2304])


RuntimeError: shape '[1, 24, 24, 8]' is invalid for input of size 18432

In [8]:
from janus.models.processing_vlm import VLChatProcessor
processor: VLChatProcessor = VLChatProcessor.from_pretrained("deepseek-ai/Janus-Pro-7B")
tokenizer = processor.tokenizer
text = processor.tokenizer.decode(label_ids[label_text_ids_mask], skip_special_tokens=False)
text

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


"You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it.\n\n<|User|>: 76 best cars coloring pages images on Pinterest Coloring sheets\n\n<|Assistant|>: The image is a black and white line drawing of a cartoon-style car. The car has a smiling face with eyes, a nose, and a mouth. It has a rounded body with a small, flat roof. The car is depicted with a single wheel on the front and a single wheel on the back, both of which are simple lines. The car is shown in a side profile, facing to the right. The car has a small, rectangular shape with a slightly curved front. The car is on a flat surface, and there are small, dashed lines indicating the ground beneath it. The style of the image is simple and cartoonish, with clean lines and no shading.<begin_of_image><end_of_image><｜end▁of▁sentence｜>You are an assistant that creates images from descriptions. First, describe the image in detail, then generate it.\n\n<|User|>: Healthier 2-Ingr