In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
import csv
from bs4 import BeautifulSoup





In [2]:
# load model from local
model_name = "/root/autodl-tmp/llama-3.2-1b-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
def clean_html(raw_html):
    if pd.isna(raw_html):
        return ""
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text(separator=" ", strip=True)

In [4]:
train_df = pd.read_csv("/root/autodl-tmp/data/salary_labelled_development_set.csv")  

def row_to_text(row, include_label=True):
    job_title = row["job_title"]
    job_details = clean_html(row["job_ad_details"])
    # job_details = row["job_ad_details"]
    location = row["nation_short_desc"]

    base_text = f"Job Title: {job_title}\nJob Description: {job_details}\nLocation: {location}"
    if include_label:
        base_text += f"\ny_true: {row['y_true']}"
    return base_text

In [5]:
# use first 10 data in train set as few-shot
shots = "\n\n".join([f"example{i+1}：\n{row_to_text(train_df.iloc[i])}" for i in range(10)])

In [11]:
test_df = pd.read_csv("/root/autodl-tmp/data/salary_labelled_test_set.csv")


test_df["predicted"] = ""
test_df["true_label"] = test_df["y_true"].astype(str)
results = []
correct=0

for i, row in test_df.iterrows():
    query = f"query：\n{row_to_text(row, include_label=False)}\nyour_predict：y_true ="

    user_prompt = (
        "You are a classification assistant. Below are some job descriptions along with their corresponding salaries (y_true)：\n\n"
        "Please only give me answer, no more other things.The stuctrue shold be like 100-200-AUS-MONTHLY.(No commas are required between numbers)"
        "If you think there is no mention of salary, please give me 0-0-None-None."
        "Do not give me things like y_true = 0-0-None-None, only give me answer, like 0-0-None-None"
        # f"{shots}\n\n{query}"
        f"{query}"
    )

    messages = [
        # {"role": "system", "content": ""},
        {"role": "user", "content": user_prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **model_inputs,
        max_new_tokens=30,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    gen_ids = outputs[0][model_inputs["input_ids"].shape[-1]:]
    prediction = tokenizer.decode(gen_ids, skip_special_tokens=True).strip().split("\n")[0]

    results.append({
        "index": i,
        "predicted": prediction,
        "true_label": str(row["y_true"])
    })
    if prediction == str(row["y_true"]):
        correct+=1
    # save
    if (i + 1) % 100 == 0:
        checkpoint_df = pd.DataFrame(results)
        filename = f"llama/llama-1b-salary_predictions_nohtml_zeroshot.csv"
        checkpoint_df.to_csv(filename, index=False)
        print(f"{i+1},save result to：{filename}")
    number=i
    # if i ==220:
    #     break


filename = f"llama/llama-1b-salary_predictions_nohtml_zeroshot.csv"
final_df = pd.DataFrame(results)
final_df.to_csv(filename, index=False)

100,save result to：llama/llama-1b-salary_predictions_nohtml_zeroshot.csv
200,save result to：llama/llama-1b-salary_predictions_nohtml_zeroshot.csv
300,save result to：llama/llama-1b-salary_predictions_nohtml_zeroshot.csv
400,save result to：llama/llama-1b-salary_predictions_nohtml_zeroshot.csv
500,save result to：llama/llama-1b-salary_predictions_nohtml_zeroshot.csv


In [12]:

correct

95

In [13]:
number

566

In [14]:
accuracy=correct/number


In [2]:
print(f"accuracy：{accuracy:.2%}")
print(f"save result to：{filename}")

accuracy：16.78%
save result to：llama/llama-1b-salary_predictions_nohtml_zeroshot.csv
