In [2]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import TrainingArguments
import torch
import json

In [3]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()  # 强制回收 PyTorch 的内存缓存（少数情况有效）

In [4]:
import torch
print("Using device:", torch.cuda.current_device())
print("World size:", torch.distributed.get_world_size() if torch.distributed.is_initialized() else "Not distributed")

Using device: 0
World size: Not distributed


In [3]:
def format_pairwise_example(example):
    return {
        "text": example['prompt'],
        "label": 0 if example["label"] == "A" else 1
    }

def load_json_dataset(path):
    with open(path, 'r', encoding='utf-8') as f:
        raw = json.load(f)
    return Dataset.from_list([format_pairwise_example(ex) for ex in raw])

dataset = load_json_dataset("/home/yangliu26/data/pairwise/pairwise_datas.json")
print(json.dumps(dataset[0], indent=2, ensure_ascii=False))

{
  "text": "Instruction: \nGiven the DB info and question, there are two candidate queries. There is correct one and incorrect one, compare the two candidate answers, analyze the differences of the query and the result. Based on the original question and the provided database info, choose the correct one. \n************************** \nDatabase Schema \nCREATE TABLE supplier (\n  s_nationkey INTEGER REFERENCES nation(n_nationkey),\n  s_acctbal REAL\n);\nCREATE TABLE nation (\n  n_nationkey INTEGER,\n  n_regionkey INTEGER REFERENCES region(r_regionkey)\n);\nCREATE TABLE region (\n  r_regionkey INTEGER,\n  r_name TEXT\n);\n \n************************** \nQuestion: \nAmong the suppliers in the European region, what percentage have a below-average account balance? \nEvidence: \nDIVIDE(COUNT(s_acctbal < AVG(s_acctbal)), COUNT(s_suppkey)) as percentage where r_name = 'EUROPE'; \n************************** \nCandidate A \nSELECT CAST(SUM(IIF(T1.s_acctbal < ( SELECT AVG(s_acctbal) FROM suppli

In [4]:
dataset = dataset.train_test_split(test_size=0.1)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5
    })
})

In [5]:
model_name = "/home/yangliu26/qwen3-8b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at /home/yangliu26/qwen3-8b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# 编码函数
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# 训练参数
training_args = TrainingArguments(
    output_dir="./pairwise_selector_model/qwen3-8b/",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    bf16=True,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=2e-5,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

  trainer = Trainer(


In [7]:
trainer.train()

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
    output = func(self, *args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 933, in forward
    transformer_outputs: BaseModelOutputWithPast = self.model(
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
    output = func(self, *args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 576, in forward
    layer_outputs = decoder_layer(
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 289, in forward
    hidden_states, self_attn_weights = self.self_attn(
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 218, in forward
    key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/envd/lib/python3.10/site-packages/transformers/models/qwen3/modeling_qwen3.py", line 76, in forward
    return self.weight * hidden_states.to(input_dtype)
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 39.49 GiB of which 5.38 MiB is free. Process 419942 has 39.48 GiB memory in use. Of the allocated memory 38.86 GiB is allocated by PyTorch, and 4.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [11]:
print("TrainingArguments loaded from:", TrainingArguments.__module__)

TrainingArguments loaded from: transformers.training_args


In [6]:
import transformers
print("Transformers version:", transformers.__version__)
print("Loaded from:", transformers.__file__)

Transformers version: 4.51.3
Loaded from: /opt/conda/envs/envd/lib/python3.10/site-packages/transformers/__init__.py
