In [None]:
# Attach Google Drive at /content/drive so files stored there can be read
# by the later cells of this notebook.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


**Note - Settings of the model in this notebook**

LLM model: Qwen2.5-0.5B-Instruct

Training method: prompt tuning

epoch:6

learning rate: 3e-2 (default)

training set: 80% of the training data

validation set: 20% of the training data

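Comment after training:
The loss is not decreasing as expected. Consider changing the learning rate for the next training run. Note that the candidate values 4e-2 and 5e-2 are larger than the current 3e-2, so they would increase it; an actual decrease would be, for example, 3e-3.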

In [None]:
# The train/validation split and the conversion to JSON are produced by
# jobad_cleaning_trainval_split.py; this notebook only consumes the results.
_adseek_dir = '/content/drive/MyDrive/AdSeek'
train_80_json_data_path = _adseek_dir + '/combined_prompts_complete_train_80.json'
val_20_json_data_path = _adseek_dir + '/combined_prompts_complete_val_20.json'

In [None]:
!pip install datasets # peft transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import os
import time
import torch
from datasets import load_dataset
from transformers import (  # transformer
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    get_peft_model,
    PromptTuningConfig,
    TaskType
)

# Expose only GPU 0 to this process and turn off the tokenizers library's
# internal parallelism.
os.environ.update(
    CUDA_VISIBLE_DEVICES="0",
    TOKENIZERS_PARALLELISM="false",
)

# Hugging Face Hub id of the base checkpoint used throughout the notebook.
model_path = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=True,
    trust_remote_code=True,
)

# The weights are loaded in half precision and explicitly placed on the CPU here.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = model.to("cpu")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
# Make sure a padding token exists: reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prompt-tuning setup: 20 virtual tokens, initialised from a short task
# description that is tokenized with the base model's tokenizer.
peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,
    num_transformer_submodules=1,
    token_dim=model.config.hidden_size,
    prompt_tuning_init="TEXT",
    prompt_tuning_init_text="Extract key information from the text:",
    tokenizer_name_or_path='Qwen/Qwen2.5-0.5B-Instruct',
)

# No explicit device move is done here. On a CUDA machine the alternative is
# `model = get_peft_model(model, peft_config).cuda()`.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 17,920 || all params: 494,050,688 || trainable%: 0.0036


In [None]:
import spacy


# spaCy pipeline used to annotate prompts with entity and part-of-speech tags.
ner_pos_core = spacy.load("en_core_web_sm")


def _tag_prompt(doc):
    """Render a spaCy doc as space-joined tokens with tag suffixes.

    A token that carries an entity type becomes ``text_ENTTYPE``; otherwise a
    token whose coarse part of speech is ``ADJ`` becomes ``text_ADJ``; every
    other token is emitted unchanged.
    """
    pieces = []
    for token in doc:
        if token.ent_type_:
            pieces.append(f"{token.text}_{token.ent_type_}")
        elif token.pos_ == "ADJ":
            pieces.append(f"{token.text}_{token.pos_}")
        else:
            pieces.append(token.text)
    return " ".join(pieces)


def load_and_preprocess(data_path, max_length=1280):
    """Load a JSON dataset and tokenize it for completion-only training.

    Each record needs a ``prompt`` and a ``complete`` field; records where
    either is empty are dropped. The training text is
    ``<tagged prompt> <completion> <eos>``, padded/truncated to ``max_length``.

    Labels are a copy of ``input_ids`` with ``-100`` on
    * every token that starts inside the tagged prompt, and
    * every padding position (``attention_mask == 0``).

    The prompt boundary is located with the tokenizer's character offsets of
    the text that is actually tokenized (the tagged prompt). Measuring the
    untagged prompt instead gives a different token count, which misplaces
    the mask.

    Note: these labels only take effect when the training collator keeps the
    ``labels`` column. ``DataCollatorForLanguageModeling(mlm=False)`` rebuilds
    labels from ``input_ids`` and would discard this masking.

    Args:
        data_path: Path (or paths) passed to ``load_dataset("json", data_files=...)``.
        max_length: Sequence length used for truncation and padding.

    Returns:
        The mapped dataset with ``input_ids``, ``attention_mask`` and ``labels``
        added to the original columns.
    """
    dataset = load_dataset("json", data_files=data_path)
    dataset = dataset.filter(lambda x: len(x["prompt"]) > 0 and len(x["complete"]) > 0)

    def tokenize_function(examples):
        completions = [str(c) for c in examples["complete"]]
        tagged_prompts = [
            _tag_prompt(doc)
            for doc in ner_pos_core.pipe(examples["prompt"], batch_size=32)
        ]
        full_texts = [
            tagged + " " + completion + " " + tokenizer.eos_token
            for tagged, completion in zip(tagged_prompts, completions)
        ]

        tokenized = tokenizer(
            full_texts,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,  # requires a fast tokenizer
            return_tensors="pt",
        )
        # Offsets are only needed to build the mask; do not store them.
        offsets = tokenized.pop("offset_mapping")
        token_starts = offsets[..., 0]
        prompt_chars = torch.tensor([len(t) for t in tagged_prompts]).unsqueeze(1)

        is_padding = tokenized["attention_mask"] == 0
        ignore = (token_starts < prompt_chars) | is_padding

        labels = tokenized["input_ids"].clone()
        labels[ignore] = -100

        # A row can only be fully masked when the prompt alone fills the whole
        # window (truncation, hence no padding). Keep the final token as a
        # label in that case, as the original length clamp did, so that no
        # example ends up without any supervised position.
        fully_masked = ignore.all(dim=1)
        labels[fully_masked, -1] = tokenized["input_ids"][fully_masked, -1]

        tokenized["labels"] = labels
        return tokenized

    return dataset.map(tokenize_function, batched=True)


In [None]:
def _collate_keep_labels(features):
    """Batch pre-tokenized features while preserving their ``labels``.

    ``DataCollatorForLanguageModeling(mlm=False)`` overwrites ``labels`` with a
    copy of ``input_ids``, which throws away the prompt masking produced during
    preprocessing. This collator stacks the stored columns as they are and
    additionally ignores padding positions in the loss.
    """
    batch = {
        key: torch.stack([torch.as_tensor(feature[key]) for feature in features])
        for key in ("input_ids", "attention_mask", "labels")
    }
    batch["labels"][batch["attention_mask"] == 0] = -100
    return batch


def formal_train(json_path="salary_47_allin.json", target_model_path="./qwen_prompt_47",
                 num_train_epochs=6, learning_rate=3e-2):
    """Prompt-tune the global ``model`` on a JSON dataset and save the adapter.

    Args:
        json_path: JSON file handed to ``load_and_preprocess``.
        target_model_path: Directory passed to ``model.save_pretrained``.
        num_train_epochs: Number of passes over the training split.
        learning_rate: Optimizer learning rate.

    Side effects:
        Trains the module-level ``model`` in place, writes Trainer checkpoints
        to ``./qwen_prompt_tuning_output_pos_ner`` once per epoch, and saves
        the final model to ``target_model_path``.
    """
    dataset = load_and_preprocess(json_path)

    training_args = TrainingArguments(
        output_dir="./qwen_prompt_tuning_output_pos_ner",
        per_device_train_batch_size=6,
        gradient_accumulation_steps=2,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        # fp16 mixed precision needs a GPU; enable it only when CUDA is
        # present so the CPU path does not fail at argument validation.
        fp16=torch.cuda.is_available(),
        optim="adamw_torch",
        dataloader_num_workers=4,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        # Keeps the prompt-masked labels from preprocessing (see helper above).
        data_collator=_collate_keep_labels,
    )

    print("开始训练...")
    start_time = time.time()
    trainer.train()
    elapsed = time.time() - start_time
    print(f"训练完成，耗时: {elapsed:.2f}秒")

    model.save_pretrained(target_model_path)


In [None]:
  # Train on the 80% split; the tuned model is saved to ./qwen_pt_pos_ner_6epochs.
  formal_train(train_80_json_data_path, "./qwen_pt_pos_ner_6epochs")

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/4093 [00:00<?, ? examples/s]

Map:   0%|          | 0/4093 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


开始训练...


Step,Training Loss
100,3.0786
200,2.7771
300,2.7236
400,2.676
500,2.667
600,2.6238
700,2.6265
800,2.6164
900,2.5942
1000,2.6061


训练完成，耗时: 1077.00秒


In [None]:

def generate_response(prompt, max_new_tokens=100, do_sample=True, temperature=0.7,
                      tag_prompt=True):
    """Generate a completion for ``prompt`` with the global prompt-tuned model.

    Training text is built from a spaCy-tagged version of the prompt (entity
    tokens become ``text_ENTTYPE``, other adjectives ``text_ADJ``). Feeding the
    raw prompt at inference time does not match what the model was trained
    on, so the same tagging is applied here by default.

    Args:
        prompt: Raw prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Sample when True; decode greedily when False.
        temperature: Sampling temperature; only used when ``do_sample`` is True.
        tag_prompt: Apply the training-time tagging to ``prompt`` first.
            Pass False to send the prompt through unchanged.

    Returns:
        The decoded newly generated text, without the prompt and without
        special tokens.
    """
    if tag_prompt:
        # Same tagging rule as used when the training text is built.
        prompt = " ".join(
            f"{token.text}_{token.ent_type_}" if token.ent_type_
            else f"{token.text}_{token.pos_}" if token.pos_ == "ADJ"
            else token.text
            for token in ner_pos_core(prompt)
        )

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        generation_kwargs["temperature"] = temperature

    # Training leaves the model in train mode; switch to eval for inference.
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **generation_kwargs,
        )

    # The returned sequence starts with the prompt ids; keep only what follows.
    prompt_token_count = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)


In [None]:

# Load the validation records from the JSON file at val_20_json_data_path.
import json

# The encoding is given explicitly so that reading does not depend on the
# platform's default locale encoding.
with open(val_20_json_data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)



In [None]:
# Number of records loaded from the validation JSON file.
len(data)

1025

In [None]:
# Find the index of the first validation record for each extraction task.
# A record belongs to a task when its prompt contains that task's instruction text.
prompt_dict = {
    'salary': 'Extract salary information from the job ad. \n\n',
    'seniority': 'Extract seniority level from the job ad. \n\n',
    'arrangement': 'Extract work arrangement from the job ad.\n\n'
}

# task name -> index of the first record whose prompt contains the instruction
task_start_indices = {}

for i, item in enumerate(data):
    for task_name, prompt in prompt_dict.items():
        if task_name not in task_start_indices and prompt in item['prompt']:
            task_start_indices[task_name] = i
    # Stop scanning once every task has been located. The check lives in the
    # outer loop so that `break` actually ends the scan over the records.
    if len(task_start_indices) == len(prompt_dict):
        break
print("First index for each task:")
print(task_start_indices)

First index for each task:
{'salary': 0, 'seniority': 454, 'arrangement': 1005}


In [None]:
# Run inference on the validation data.
# Preview only the first PREVIEW_PER_TASK examples of each task; the task
# boundaries come from task_start_indices instead of hard-coded offsets.
PREVIEW_PER_TASK = 20
for i, item in enumerate(data):
    in_preview = any(
        start <= i < start + PREVIEW_PER_TASK
        for start in task_start_indices.values()
    )
    if in_preview:
        print("=" * 20, 'Round:', i, "=" * 20)
        p = item['prompt']
        y = item['complete']
        print("Target:", y)
        print("\n Prompt tuning Answer:", generate_response(p))


Target: 48-48-AUD-HOURLY





 Prompt tuning Answer: _ORG Project_ORG Support_ORG Officer 
Target: 500-667-HKD-DAILY

 Prompt tuning Answer:  533-HK-HOURLY 
Target: 0-0-None-None

 Prompt tuning Answer: _ORG 0-0-MONTHLY 
Target: 0-0-None-None

 Prompt tuning Answer: head 
Target: 0-0-None-None

 Prompt tuning Answer:  QuickFood_ORG &_ORG Casual_ORG Catering_ORG -_ORG Caterer_ORG &_ORG Bar_ORG 

 QuickFood_ORG &_ORG Casual_ORG Catering_ORG -_ORG Caterer_ORG &_ORG Bar_ORG 

 QuickFood_ORG &_ORG Casual_ORG Catering_ORG -_ORG Caterer_ORG &_ORG Bar_ORG 

 QuickFood_ORG &_ORG Casual_ORG Catering_ORG -_ORG Caterer
Target: 0-0-None-None

 Prompt tuning Answer: None None 
Target: 500-667-HKD-DAILY

 Prompt tuning Answer:  600,000,000,000,000,000,000,000-HKD-SECURE-ORIG 

 600,000,000,000,000,000,000,000-HKD-SECURE-ORG 

 600KHDK SECURE-ORG 

 600KHKD
Target: 67357-105895-AUD-ANNUAL

 Prompt tuning Answer:   67357-105895-ANU 
Target: 0-0-None-None

 Prompt tuning Answer: SG_ORG 0-0-0-SGD-AGG-0-0-0-0-SGD_ORG -0-0-0-0-SGD_ORG