In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install datasets

Mounted at /content/drive
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspe

In [None]:
import os
import time
import torch
from datasets import load_dataset
from transformers import (  # transformer
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    AutoConfig,
    PreTrainedModel
)
from peft import (
    get_peft_model,
    PromptTuningConfig,
    TaskType
)

# Expose only GPU 0 to this process and turn off the tokenizers library's
# internal parallelism before any tokenizer is created.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Hub id of the base checkpoint; reused for the config, the weights and the tokenizer.
model_path = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True)

import torch.nn as nn

class AdSeek_Qwen_NerPos(PreTrainedModel):
    """Causal LM that adds POS-tag and NER-tag embeddings to the token embeddings.

    The transformer body (``base_model.model``) and the output head
    (``base_model.lm_head``) are reused as-is. Two extra embedding tables turn
    per-token POS / NER ids into vectors that are summed onto the token
    embeddings before the transformer runs.

    Both tables start at zero and use the PAD id as ``padding_idx``. A freshly
    built model therefore produces exactly the base model's logits, and PAD
    positions never contribute anything. With PyTorch's default N(0, 1)
    initialisation the tables would add large random vectors to every
    position, including PAD.

    NOTE(review): wrappers that freeze all base parameters (the PEFT
    prompt-tuning run in this notebook reports only the prompt weights as
    trainable) also freeze these tables. Set ``requires_grad = True`` on them
    after wrapping if the tag features are meant to be learned.

    Args:
        config: Model configuration; ``config.hidden_size`` sets the width of
            the tag embeddings.
        base_model: Causal LM exposing ``.model`` and ``.lm_head``.
        num_pos: Number of POS tag ids (rows of the POS table).
        num_ner: Number of NER tag ids (rows of the NER table).
        pos_pad_id: Id used to pad ``pos_ids`` (default 0).
        ner_pad_id: Id used to pad ``ner_ids`` (default 0).
    """

    def __init__(self, config, base_model, num_pos, num_ner, pos_pad_id=0, ner_pad_id=0):
        super().__init__(config)
        self.config = config

        self.model = base_model.model
        self.lm_head = base_model.lm_head

        # Kept on the instance so forward() does not depend on module-level globals.
        self.pos_pad_id = pos_pad_id
        self.ner_pad_id = ner_pad_id

        self.pos_embedding = nn.Embedding(num_pos, config.hidden_size, padding_idx=pos_pad_id)
        self.ner_embedding = nn.Embedding(num_ner, config.hidden_size, padding_idx=ner_pad_id)
        # Zero start: the tag features are a no-op until they are trained.
        nn.init.zeros_(self.pos_embedding.weight)
        nn.init.zeros_(self.ner_embedding.weight)

        self.embed_tokens = self.model.get_input_embeddings()

    @staticmethod
    def _align_tag_ids(tag_ids, target_len, pad_id):
        """Left-pad with ``pad_id`` or keep the last ``target_len`` columns so the lengths match."""
        gap = target_len - tag_ids.size(1)
        if gap > 0:
            filler = torch.full(
                (tag_ids.size(0), gap),
                pad_id,
                dtype=tag_ids.dtype,
                device=tag_ids.device,
            )
            return torch.cat([filler, tag_ids], dim=1)
        if gap < 0:
            return tag_ids[:, -target_len:]
        return tag_ids

    def forward(self, input_ids=None, attention_mask=None, labels=None, pos_ids=None, ner_ids=None, **kwargs):
        """Run the LM on tag-augmented embeddings.

        Args:
            input_ids: Token ids; embedded with the base model's input embeddings.
            attention_mask: Forwarded unchanged to the transformer body.
            labels: Optional target ids; positions equal to -100 are ignored.
            pos_ids: Optional POS ids, one per input position. If the embedded
                sequence is longer (e.g. embeddings were prepended upstream),
                the ids are left-padded with PAD; if shorter, the trailing ids
                are kept.
            ner_ids: Optional NER ids, aligned the same way as ``pos_ids``.
            **kwargs: Extra arguments for the transformer body. ``inputs_embeds``
                is consumed here and used when ``input_ids`` is None.

        Returns:
            dict with ``"loss"`` (None when ``labels`` is None) and ``"logits"``.

        Raises:
            ValueError: If neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        # Always remove inputs_embeds from kwargs so it cannot be passed twice below.
        inputs_embeds = kwargs.pop("inputs_embeds", None)
        if input_ids is not None:
            token_embeds = self.embed_tokens(input_ids)
        elif inputs_embeds is not None:
            token_embeds = inputs_embeds
        else:
            raise ValueError("Must provide either input_ids or inputs_embeds")

        seq_len = token_embeds.size(1)

        if pos_ids is not None:
            pos_ids = self._align_tag_ids(pos_ids, seq_len, self.pos_pad_id)
            token_embeds = token_embeds + self.pos_embedding(pos_ids).to(token_embeds.dtype)

        if ner_ids is not None:
            ner_ids = self._align_tag_ids(ner_ids, seq_len, self.ner_pad_id)
            token_embeds = token_embeds + self.ner_embedding(ner_ids).to(token_embeds.dtype)

        outputs = self.model(
            input_ids=None,
            inputs_embeds=token_embeds,
            attention_mask=attention_mask,
            **kwargs
        )

        # Index 0 is the last hidden state for both tuple and ModelOutput returns.
        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Standard next-token objective: position t predicts token t + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return {"loss": loss, "logits": logits}


# Tag vocabularies. "PAD" sits at index 0 in both lists; every other tag's id
# is simply its position in the list.
POS_TAGS = ['PAD', 'ADJ', 'NOUN', 'VERB', 'ADV', 'PRON', 'DET', 'ADP', 'PROPN', 'PUNCT']
NER_TAGS = ['PAD', 'PERSON', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'DATE', 'O']
pos2id = dict(zip(POS_TAGS, range(len(POS_TAGS))))
ner2id = dict(zip(NER_TAGS, range(len(NER_TAGS))))

# Fetch the pretrained configuration and causal-LM weights, then wrap them in
# the tag-aware model with one embedding row per POS / NER id.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
raw_model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)

model = AdSeek_Qwen_NerPos(
    config,
    base_model=raw_model,
    num_pos=len(pos2id),
    num_ner=len(ner2id),
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
# Padding needs a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prompt tuning: 20 trainable virtual-token embeddings, initialised from the
# embedded init text, are prepended to every input.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Extract key information from the text:",
    num_virtual_tokens=20,
    tokenizer_name_or_path=model_path,  # same checkpoint as the model, not a second literal
    token_dim=model.config.hidden_size,
    num_transformer_submodules=1
)

model = get_peft_model(model, peft_config)

# Wrapping freezes every parameter of the wrapped model, including the POS/NER
# embedding tables that never had a chance to train. Re-enable gradients for
# them so the tag features are learned together with the prompt.
# NOTE(review): PEFT's save_pretrained presumably stores only the prompt
# weights - save these two tables separately if the checkpoint must be reloaded.
for param_name, param in model.named_parameters():
    if param_name.endswith(("pos_embedding.weight", "ner_embedding.weight")):
        param.requires_grad = True

model = model.cuda()
model.print_trainable_parameters()

trainable params: 17,920 || all params: 494,066,816 || trainable%: 0.0036


In [None]:
import spacy

# Shared spaCy English pipeline; the notebook reads each token's `pos_` and
# `ent_type_` from it to build the POS / NER id sequences.
ner_pos_core = spacy.load("en_core_web_sm")

def _has_content(value):
    """Return True for a non-None value that is non-empty (unsized values count as non-empty)."""
    if value is None:
        return False
    try:
        return len(value) > 0
    except TypeError:
        return True


def _tag_ids_for_subwords(doc, prompt_chars, offsets, attention_mask):
    """Project spaCy word-level POS/NER tags onto tokenizer subword positions.

    spaCy tokens and tokenizer subwords do not line up by index, so the
    alignment goes through character offsets. Every prompt character is
    labelled with the tag ids of the spaCy token covering it. Each subword then
    takes the label of its last in-prompt character; the last character is used
    because a subword's span may begin with the whitespace preceding the word.

    Args:
        doc: spaCy ``Doc`` of the prompt text.
        prompt_chars: Length of the prompt in characters.
        offsets: Per-subword ``(start, end)`` character spans in the full text.
        attention_mask: Per-subword mask; 0 marks padding.

    Returns:
        ``(pos_ids, ner_ids)``, two lists with one id per subword. Padding and
        subwords lying entirely in the completion get the PAD id.
    """
    pos_pad, ner_pad = pos2id["PAD"], ner2id["PAD"]

    pos_by_char = [pos_pad] * prompt_chars
    ner_by_char = [ner_pad] * prompt_chars
    for token in doc:
        start = token.idx
        stop = min(start + len(token), prompt_chars)
        if stop <= start:
            continue
        width = stop - start
        pos_by_char[start:stop] = [pos2id.get(token.pos_, pos_pad)] * width
        ner_by_char[start:stop] = [ner2id.get(token.ent_type_ or "O", ner_pad)] * width

    pos_ids, ner_ids = [], []
    for (start, stop), keep in zip(offsets, attention_mask):
        if keep and start < stop and start < prompt_chars:
            anchor = min(stop, prompt_chars) - 1
            pos_ids.append(pos_by_char[anchor])
            ner_ids.append(ner_by_char[anchor])
        else:
            pos_ids.append(pos_pad)
            ner_ids.append(ner_pad)
    return pos_ids, ner_ids


def load_and_preprocess(data_path, max_length=1024):
    """Load a prompt/completion JSON file and build model-ready training features.

    Each example is tokenized as ``prompt + completion + EOS`` and padded or
    truncated to ``max_length``. The added columns are:

    * ``pos_ids`` / ``ner_ids``: one tag id per subword, aligned to the prompt
      text through character offsets; PAD everywhere else. No extra slots are
      reserved for prompt-tuning virtual tokens, because the model left-pads
      the tag ids itself when the embedded sequence is longer.
    * ``labels``: copy of ``input_ids`` with prompt tokens and padding set to
      -100, so the loss covers only the completion and EOS.

    Examples with an empty prompt or completion are dropped. So are examples
    where truncation leaves no supervised token, since a batch made only of
    those would produce a NaN loss.

    Args:
        data_path: Path (or paths) handed to ``load_dataset("json", ...)``.
        max_length: Fixed tokenized length of every example.

    Returns:
        The processed dataset object returned by ``datasets`` (the notebook
        reads its ``"train"`` split).
    """
    dataset = load_dataset("json", data_files=data_path)

    dataset = dataset.filter(lambda x: _has_content(x["prompt"]) and _has_content(x["complete"]))

    def tokenize_function(examples):
        prompts = examples["prompt"]
        completions = [str(c) for c in examples["complete"]]

        full_texts = [p + c + tokenizer.eos_token for p, c in zip(prompts, completions)]

        tokenized = tokenizer(
            full_texts,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,  # character spans drive tag alignment and label masking
        )
        # Only needed here; keep it out of the stored dataset.
        offsets_all = tokenized.pop("offset_mapping")

        docs = ner_pos_core.pipe(prompts, batch_size=32)

        pos_ids_all, ner_ids_all, labels = [], [], []
        for doc, prompt, input_ids, mask, offsets in zip(
            docs, prompts, tokenized["input_ids"], tokenized["attention_mask"], offsets_all
        ):
            prompt_chars = len(prompt)

            pos_ids, ner_ids = _tag_ids_for_subwords(doc, prompt_chars, offsets, mask)
            pos_ids_all.append(pos_ids)
            ner_ids_all.append(ner_ids)

            # Supervise only real tokens that start at or after the end of the prompt.
            labels.append(
                [
                    token_id if keep and start >= prompt_chars else -100
                    for token_id, keep, (start, _stop) in zip(input_ids, mask, offsets)
                ]
            )

        tokenized["pos_ids"] = pos_ids_all
        tokenized["ner_ids"] = ner_ids_all
        tokenized["labels"] = labels

        return tokenized

    dataset = dataset.map(tokenize_function, batched=True)

    # Drop examples whose completion was truncated away entirely.
    return dataset.filter(lambda x: any(label != -100 for label in x["labels"]))


In [None]:

class DataCollatorForPOSNER:
    """Batch collator that stacks each feature column into a ``torch`` tensor.

    Columns whose first value is a string or ``None`` are left out of the
    batch. The tokenizer is stored but not used during collation.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Return ``{column: tensor}`` built from a list of per-example dicts."""
        collated = {}
        for column in features[0]:
            column_values = [example[column] for example in features]
            first_value = column_values[0]

            # Text and missing columns cannot be turned into tensors; skip them.
            if first_value is None or isinstance(first_value, str):
                continue

            try:
                collated[column] = torch.tensor(column_values)
            except Exception:
                # Report which column failed before re-raising the original error.
                print(f"[ERROR] key={column}, sample value={first_value}")
                raise

        return collated


class POSNERTrainer(Trainer):
    """Trainer whose loss step hands the POS/NER id tensors to the model explicitly."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the model's own ``"loss"`` (and optionally the full outputs).

        ``pos_ids`` and ``ner_ids`` are removed from ``inputs`` (``None`` when
        absent) and passed to the model as keyword arguments.
        ``num_items_in_batch`` is accepted but not used.
        """
        tag_inputs = {name: inputs.pop(name, None) for name in ("pos_ids", "ner_ids")}

        outputs = model(**inputs, **tag_inputs)
        loss = outputs["loss"]

        if return_outputs:
            return loss, outputs
        return loss


def formal_train(json_path="salary_47_allin.json", target_model_path="./qwen_prompt_47"):
    """Preprocess ``json_path``, train the module-level ``model`` and save it.

    Uses the notebook-level ``model`` and ``tokenizer``. Epoch checkpoints go to
    ``./qwen_prompt_tuning_output``; the final ``save_pretrained`` output goes to
    ``target_model_path``.

    Args:
        json_path: JSON training file with ``prompt`` / ``complete`` fields.
        target_model_path: Directory passed to ``model.save_pretrained``.
    """
    dataset = load_and_preprocess(json_path)
    data_collator = DataCollatorForPOSNER(tokenizer)

    training_args = TrainingArguments(
        output_dir="./qwen_prompt_tuning_output",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=2,
        num_train_epochs=2,
        learning_rate=8e-4,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        fp16=True,
        optim="adamw_torch",
        dataloader_num_workers=4,
        report_to="none",
        # The PEFT wrapper hides the base model's signature, so Trainer cannot
        # infer the label column and warns; name it explicitly.
        label_names=["labels"],
    )

    trainer = POSNERTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=data_collator,
    )

    print("开始训练...")
    # Monotonic clock: the measured duration is unaffected by system clock changes.
    start_time = time.perf_counter()
    trainer.train()
    print(f"训练完成，耗时: {time.perf_counter() - start_time:.2f}秒")

    model.save_pretrained(target_model_path)


In [None]:
# Training file on the mounted Google Drive.
training_path = '/content/drive/MyDrive/AdSeek/Preprocessing/ready2train_test/agument_work_5316.json'

# Preprocess, train, and save the tuned model to ./qwen_posner_top100.
formal_train(json_path=training_path, target_model_path="./qwen_posner_top100")

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/5316 [00:00<?, ? examples/s]

Map:   0%|          | 0/5316 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


开始训练...


Step,Training Loss
100,23.5175
200,22.1158
300,18.507
400,14.5327
500,11.0026


训练完成，耗时: 386.14秒


In [None]:

import torch.nn.functional as F

def generate_response(prompt_text, max_new_tokens=100, temperature=0.8, top_k=50, top_p=0.9):
    """Sample a continuation of ``prompt_text`` from the module-level ``model``.

    POS / NER ids for the prompt are aligned to the tokenizer's subwords via
    character offsets: each subword takes the tag of the spaCy token covering
    its last character. Every generated token gets the PAD tag. The whole
    sequence is re-run through the model at each step (no KV cache).

    Args:
        prompt_text: Prompt string; truncated to 1024 tokens.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Softmax temperature; values <= 0 select greedy decoding.
        top_k: Keep only the ``top_k`` most likely tokens (<= 0 disables it;
            values above the vocabulary size are clamped).
        top_p: Nucleus threshold (>= 1.0 disables it).

    Returns:
        The decoded generated text, without the prompt and without special tokens.

    Raises:
        ValueError: If ``prompt_text`` tokenizes to zero tokens.
    """
    model.eval()
    device = model.device

    encoded = tokenizer(
        prompt_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        return_offsets_mapping=True,
    )
    offsets = encoded.pop("offset_mapping")[0].tolist()
    input_ids = encoded["input_ids"].to(device)
    if input_ids.shape[1] == 0:
        raise ValueError("prompt_text produced no tokens")

    pos_pad, ner_pad = pos2id["PAD"], ner2id["PAD"]

    # Label every prompt character with the tags of the spaCy token covering it.
    n_chars = len(prompt_text)
    pos_by_char = [pos_pad] * n_chars
    ner_by_char = [ner_pad] * n_chars
    for token in ner_pos_core(prompt_text):
        start = token.idx
        stop = min(start + len(token), n_chars)
        if stop <= start:
            continue
        width = stop - start
        pos_by_char[start:stop] = [pos2id.get(token.pos_, pos_pad)] * width
        ner_by_char[start:stop] = [ner2id.get(token.ent_type_ or "O", ner_pad)] * width

    # One tag per subword, read at the subword's last character (its span may
    # begin with the whitespace before the word).
    pos, ner = [], []
    for start, stop in offsets:
        if start < stop and start < n_chars:
            anchor = min(stop, n_chars) - 1
            pos.append(pos_by_char[anchor])
            ner.append(ner_by_char[anchor])
        else:
            pos.append(pos_pad)
            ner.append(ner_pad)

    pos_ids = torch.tensor([pos], dtype=torch.long, device=device)
    ner_ids = torch.tensor([ner], dtype=torch.long, device=device)
    # Allocated once and appended for every generated token.
    pos_pad_step = torch.tensor([[pos_pad]], dtype=torch.long, device=device)
    ner_pad_step = torch.tensor([[ner_pad]], dtype=torch.long, device=device)

    # Never emit PAD/UNK, but do not ban EOS when the pad token is an alias of
    # it (the notebook falls back to pad = eos), or generation could never stop.
    banned_ids = sorted(
        {
            token_id
            for token_id in (tokenizer.pad_token_id, tokenizer.unk_token_id)
            if token_id is not None and token_id != tokenizer.eos_token_id
        }
    )

    generated = input_ids
    prompt_len = input_ids.shape[1]

    with torch.no_grad():
        for _ in range(max_new_tokens):
            out = model(
                input_ids=generated,
                attention_mask=torch.ones_like(generated),
                pos_ids=pos_ids,
                ner_ids=ner_ids,
            )
            logits = out["logits"][:, -1, :].float()

            if banned_ids:
                logits[:, banned_ids] = -float("inf")

            if temperature <= 0:
                # Greedy decoding; also avoids dividing by zero.
                next_token = logits.argmax(dim=-1, keepdim=True)
            else:
                probs = F.softmax(logits / temperature, dim=-1)

                if top_k > 0:
                    k = min(top_k, probs.size(-1))
                    top_k_values, top_k_indices = torch.topk(probs, k)
                    probs_filtered = torch.zeros_like(probs).scatter(1, top_k_indices, top_k_values)
                    probs = probs_filtered / probs_filtered.sum(dim=-1, keepdim=True)

                # Top-p (nucleus) filter
                if top_p < 1.0:
                    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
                    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)
                    cutoff = cumulative_probs > top_p
                    # Shift right so the token that crosses the threshold is kept.
                    cutoff[..., 1:] = cutoff[..., :-1].clone()
                    cutoff[..., 0] = False
                    sorted_probs[cutoff] = 0.0
                    probs = torch.zeros_like(probs).scatter(1, sorted_indices, sorted_probs)
                    probs = probs / probs.sum(dim=-1, keepdim=True)

                next_token = torch.multinomial(probs, num_samples=1)

            generated = torch.cat([generated, next_token], dim=1)
            pos_ids = torch.cat([pos_ids, pos_pad_step], dim=1)
            ner_ids = torch.cat([ner_ids, ner_pad_step], dim=1)

            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)


In [None]:
test_path = "/content/drive/MyDrive/AdSeek/Preprocessing/ready2train_test/test_1355.json"
# Load the test examples from the JSON file into `data`.
import json
# Explicit UTF-8 so non-ASCII text decodes the same way regardless of the platform default.
with open(test_path, 'r', encoding='utf-8') as f:  # Update with the path to your JSON file.
    data = json.load(f)



In [None]:
import pandas as pd

# Quick smoke test: generate for the prompt of the fourth test row.
df = pd.DataFrame(data)
p = df["prompt"].iloc[3]

generate_response(p)

'组--Bar--\\ � � �通信-A-BarAA -A-A--, A � � �-A �-Bar-AA--A,，-A资个--AA,，,,-,的资 �A，A-的A资 � �具资A资资A �A �WhatA,A资,资-A，,具，，资 �资个，What资-AA-A-A，A资，的的AA A'

In [None]:
import time

t0 = time.time()
# One slot per test row keeps the predictions aligned with `data`, so they can
# be attached as a DataFrame column afterwards; rows that are skipped stay None.
answers = [None] * len(data)
for i, item in enumerate(data):
    # Only three slices of the test set are evaluated: the first 20 rows,
    # rows 567-586, and everything from row 1256 on.
    if (0 <= i < 20) or (567 <= i < 587) or (1256 <= i):
        p = item['prompt']
        ans = generate_response(p)
        print(ans)
        answers[i] = ans
t1 = time.time()
print("Inference time:", t1 - t0)

KeyboardInterrupt: 

In [None]:
import pandas as pd

output_path = "NER_pt_05B_results.json"

# Attach the predictions as a `y_pred` column and write every row as a JSON record.
df = pd.DataFrame(data).assign(y_pred=answers)
df.to_json(output_path, orient='records', indent=4, force_ascii=False)