In [1]:
from jinja2 import Template

# Jinja2 chat template producing a ChatML-style prompt:
#   <|im_start|>system\n{system_message}<|im_end|>\n
#   <|im_start|>user\n{human turn}<|im_end|>\n
#   <|im_start|>assistant\n{gpt turn}<|im_end|>\n
# Every tag uses `-` whitespace control, so the only newlines in the rendered
# text are the explicit `nl` values. Human turns are split on '<image>' and each
# placeholder is re-emitted followed by a newline; a gpt turn with a null value
# renders as empty content. Messages whose `from` is neither 'human' nor 'gpt'
# produce no output. Template variables: `system_message`, `messages`.
CHAT_TEMPLATE = """
{%- set nl = '\\n' -%}
{%- set im_start = '<|im_start|>' -%}
{%- set im_end = '<|im_end|>' -%}
{{- im_start -}}system{{- nl -}}{{- system_message -}}{{- im_end -}}{{- nl -}}
{%- for message in messages -%}
    {%- if message.from == 'human' -%}
        {{- im_start -}}user{{- nl -}}
        {%- if message.value is not none -%}
            {%- set parts = message.value.split('<image>') -%}
            {%- for part in parts -%}
                {{- part -}}
                {%- if not loop.last -%}<image>{{- nl -}}{%- endif -%}
            {%- endfor -%}
        {%- endif -%}
        {{- im_end -}}{{- nl -}}
    {%- elif message.from == 'gpt' -%}
        {{- im_start -}}assistant{{- nl -}}
        {{- message.value if message.value is not none else '' -}}
        {{- im_end -}}{{- nl -}}
    {%- endif -%}
{%- endfor -%}
"""

# Minimal two-turn sample: one human turn that starts with an image placeholder
# and one gpt reply. Messages use the "from"/"value" keys the template reads.
conversation = [
        {"from": "human", "value": "<image>Hello!"},
        {"from": "gpt", "value": "Hi there!"}
]
system_message = "Test system message"
# Compile the template once, then render it with the sample conversation.
template = Template(CHAT_TEMPLATE)
rendered = template.render(
    system_message=system_message,
    messages=conversation
)
# print (rather than a bare expression) so the newlines are shown literally.
print(rendered)

<|im_start|>system
Test system message<|im_end|>
<|im_start|>user
<image>
Hello!<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>



In [19]:
from transformers import AutoTokenizer
import torch, re
from llava.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN

def preprocess_qwen(sources, tokenizer, has_image: bool = False, max_len=2048, system_message: str = "You are a helpful assistant.") -> torch.Tensor:
    """Tokenize one conversation into ChatML-style input ids.

    The produced sequence is::

        <|im_start|>system\\n{system_message}<|im_end|>\\n
        <|im_start|>user\\n{value}<|im_end|>\\n
        <|im_start|>assistant\\n{value}<|im_end|>\\n
        ...

    Args:
        sources: List of turns, each a dict with a "from" key ("human" or
            "gpt") and a "value" key (str or None). A leading turn that is not
            from "human" is dropped. An empty list yields only the system block.
        tokenizer: Callable returning an object with ``input_ids`` and exposing
            ``convert_tokens_to_ids``.
        has_image: When True, every ``DEFAULT_IMAGE_TOKEN`` occurrence in a
            turn is replaced by ``IMAGE_TOKEN_INDEX`` followed by a newline.
        max_len: Accepted for interface compatibility; not used here.
        system_message: Text placed in the system block.

    Returns:
        A ``torch.long`` tensor of shape ``(1, sequence_length)``.

    Raises:
        KeyError: If a turn's "from" value is not "human" or "gpt".
    """
    roles = {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant"}
    # Resolve the delimiters by name. Unpacking
    # `tokenizer.additional_special_tokens_ids` only works when the tokenizer
    # has exactly two such tokens, in exactly this order.
    im_start = tokenizer.convert_tokens_to_ids("<|im_start|>")
    im_end = tokenizer.convert_tokens_to_ids("<|im_end|>")
    nl_tokens = tokenizer("\n").input_ids

    source = sources
    # Guard against an empty conversation before peeking at the first turn.
    if source and roles[source[0]["from"]] != roles["human"]:
        source = source[1:]

    # System block. No label/target sequence is built: this function only
    # returns input ids.
    input_id = [im_start] + tokenizer("system").input_ids + nl_tokens
    input_id += tokenizer(system_message).input_ids + [im_end] + nl_tokens

    for sentence in source:
        turn_ids = tokenizer(roles[sentence["from"]]).input_ids + nl_tokens
        value = sentence["value"]
        if value is None:
            # Open turn: role header only, no content and no end marker.
            pass
        elif has_image and DEFAULT_IMAGE_TOKEN in value:
            pieces = value.split(DEFAULT_IMAGE_TOKEN)
            for idx, piece in enumerate(pieces):
                turn_ids += tokenizer(piece).input_ids
                if idx < len(pieces) - 1:
                    # One placeholder (plus newline) per split boundary.
                    turn_ids += [IMAGE_TOKEN_INDEX] + nl_tokens
            turn_ids += [im_end] + nl_tokens
            # Sanity check: the tokenizer itself must not emit the placeholder id.
            assert turn_ids.count(IMAGE_TOKEN_INDEX) == value.count(DEFAULT_IMAGE_TOKEN)
        else:
            turn_ids += tokenizer(value).input_ids + [im_end] + nl_tokens
        input_id += turn_ids

    return torch.tensor([input_id], dtype=torch.long)

# Load the Pangea-7B tokenizer and register the LLaVA marker tokens.
tokenizer = AutoTokenizer.from_pretrained('neulab/Pangea-7B')
# Three separate add_tokens calls, in this order, so ids are assigned
# patch token first, then the start/end pair, then the image token.
for marker_group in (
    [DEFAULT_IMAGE_PATCH_TOKEN],
    [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
    [DEFAULT_IMAGE_TOKEN],
):
    tokenizer.add_tokens(marker_group, special_tokens=True)
# Tokenize the sample conversation with image placeholders enabled.
input_ids = preprocess_qwen(
    conversation,
    tokenizer,
    has_image=True,
    system_message=system_message,
)

In [20]:
# Inspect the ids returned by preprocess_qwen for the sample conversation.
input_ids

tensor([[151644,   8948,    198,   2271,   1849,   1943, 151645,    198, 151644,
            872,    198,   -200,    198,   9707,      0, 151645,    198, 151644,
          77091,    198,  13048,   1052,      0, 151645,    198]])

In [21]:
# Number of entries in the tokenizer vocabulary.
len(tokenizer.vocab)

151650

In [25]:
# Tokenize the rendered chat template and compare it with preprocess_qwen's ids.
# preprocess_qwen uses IMAGE_TOKEN_INDEX where the tokenizer emits the
# vocabulary id of DEFAULT_IMAGE_TOKEN, so map every occurrence (not only the
# first) before comparing. This also works when the prompt has no image at all.
image_token_id = tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_TOKEN)
new_input_ids = [
    IMAGE_TOKEN_INDEX if token_id == image_token_id else token_id
    for token_id in tokenizer(rendered)["input_ids"]
]
new_input_ids == input_ids[0].tolist()

True

In [13]:
# Number of entries in the tokenizer vocabulary.
# NOTE(review): this cell's execution count (13) is lower than that of the
# tokenizer setup cell (19), so the stored output presumably comes from an
# earlier kernel state — re-run top to bottom to confirm the value.
len(tokenizer.vocab)

151649

In [16]:
# Decode only the first 11 ids (the system block plus the user header).
# NOTE(review): presumably the slice stops there because the next id is the
# IMAGE_TOKEN_INDEX placeholder, which is not a vocabulary id — confirm.
print(tokenizer.decode(input_ids[0][:11]))

<|im_start|>system
Test system message<|im_end|>
<|im_start|>user



In [22]:
from datasets import load_dataset

# Load every split of the text-only preference dataset into a DatasetDict.
input_dataset = load_dataset("CNCL-Penn-State/cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua")

Using the latest cached version of the dataset since CNCL-Penn-State/cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/ismayilz/.cache/huggingface/datasets/CNCL-Penn-State___cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua/default/0.0.0/b25b8f9b353c159240dc7fc4f25c3221eab75ed0 (last modified on Thu May 29 10:27:52 2025).


In [23]:
# Show the available splits, their columns and row counts.
input_dataset

DatasetDict({
    train: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen'],
        num_rows: 42058
    })
    val: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen'],
        num_rows: 4880
    })
    test: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen'],
        num_rows: 4533
    })
    heldout_item: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen'],
        num_rows: 3521
    })
    heldout_task: Dataset({
        f

In [24]:
from datasets import Dataset
from tqdm import tqdm 

def extend_dataset(dataset, image_path: str = "data/images/general/allava-4v/89708.jpeg", image_token: str = "<image>") -> "Dataset":
    """Return a copy of ``dataset`` in which every sample references an image.

    For each sample, the first message of ``chosen`` and of ``rejected`` is
    rebuilt as a ``{"role": "user", "content": ...}`` dict whose content starts
    with ``image_token``. The second message is kept as-is, any further
    messages are dropped, and an ``"image"`` column set to ``image_path`` is
    added. All other columns are copied unchanged.

    The token is added only when the prompt does not already start with it, so
    extending an already-extended dataset does not produce a doubled token.

    Args:
        dataset: Iterable of dict samples with "chosen" and "rejected" lists of
            at least two messages, each message having a "content" key.
        image_path: Value stored in the "image" column of every sample.
        image_token: Placeholder prefixed to the first message's content.

    Returns:
        A new ``Dataset`` built from the extended samples.
    """

    def _with_image_prompt(messages):
        # Prefix the first message once; pass the second message through.
        prompt = messages[0]["content"]
        if not prompt.startswith(image_token):
            prompt = f"{image_token}{prompt}"
        return [{"role": "user", "content": prompt}, messages[1]]

    extended_data = [
        {
            **sample,
            "chosen": _with_image_prompt(sample["chosen"]),
            "rejected": _with_image_prompt(sample["rejected"]),
            "image": image_path,
        }
        for sample in tqdm(dataset)
    ]
    return Dataset.from_generator(lambda: extended_data)

In [25]:
# Replace every split with its extended version. This overwrites
# `input_dataset` in place, so re-run the load cell first if the original
# text-only splits are needed again.
for split in input_dataset.keys():
    input_dataset[split] = extend_dataset(input_dataset[split])

100%|██████████| 42058/42058 [00:03<00:00, 10793.10it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 4880/4880 [00:00<00:00, 11657.71it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 4533/4533 [00:00<00:00, 11638.98it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 3521/3521 [00:00<00:00, 11798.15it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 52/52 [00:00<00:00, 10938.00it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 744/744 [00:00<00:00, 11811.10it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 2356/2356 [00:00<00:00, 11381.77it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 720/720 [00:00<00:00, 11322.31it/s]


Generating train split: 0 examples [00:00, ? examples/s]

100%|██████████| 2373/2373 [00:00<00:00, 11696.63it/s]


Generating train split: 0 examples [00:00, ? examples/s]

In [26]:
# Show the splits again after extension.
input_dataset

DatasetDict({
    train: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen', 'image'],
        num_rows: 42058
    })
    val: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen', 'image'],
        num_rows: 4880
    })
    test: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen', 'image'],
        num_rows: 4533
    })
    heldout_item: Dataset({
        features: ['dataset', 'task', 'score_label', 'score_chosen', 'score_rejected', 'chosen', 'rejected', 'novelty_chosen', 'surprise_chosen', 'diversity_chosen', 'quality_chosen', 'image'],
        num_rows: 3521
    })
 

In [27]:
# Spot-check one extended sample from the train split.
input_dataset["train"][0]

{'dataset': '1.Becky.csv',
 'task': 'Real-Life Creative Problem Solving',
 'score_label': 'originality',
 'score_chosen': 34.0,
 'score_rejected': 24.0,
 'chosen': [{'content': "<image>Come up with an original and creative solution for the following real-world problem:\nBecky is a college student who works part-time at Mark's Pizzeria. Mark, the owner of the restaurant, has treated Becky very well. He gave her a job that she needs to help pay her rent when no other business would employ her because she was arrested for shoplifting three years ago. Mark also lets Becky work around her school schedule, and has asked if she wants to be a shift manager in the summers. Becky's roommate Jim also works at the pizzeria, but Jim has been causing a lot of problems at work. He always avoids doing his job, treats customers rudely, and makes a lot of mistakes with orders. Jim recently began stealing food from the pizzeria. Two days ago the pizzeria was short- staffed, so Jim and Becky were the only

In [29]:
from dotenv import load_dotenv
# Load environment variables from a local .env file.
# NOTE(review): presumably this supplies the Hugging Face token used for the
# upload — confirm which variable is expected.
load_dotenv()

# Upload all splits to a separate, private "_test" repository on the Hub.
input_dataset.push_to_hub("CNCL-Penn-State/cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua_test", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/43 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CNCL-Penn-State/cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua_test/commit/efcfa9cfc8b93b41dba4cf7f7f07fc56ac9fe93f', commit_message='Upload dataset', commit_description='', oid='efcfa9cfc8b93b41dba4cf7f7f07fc56ac9fe93f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CNCL-Penn-State/cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua_test', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CNCL-Penn-State/cpo_en_multitask_text_fa_msd5_mm10_ms20_div_nov_sur_qua_test'), pr_revision=None, pr_num=None)