In [1]:
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
from transformers import AutoTokenizer, BitsAndBytesConfig, Gemma3ForCausalLM
from bs4 import BeautifulSoup





In [2]:
# load model from local
model_name = "/root/autodl-tmp/gemma-3-1b-it"

model = Gemma3ForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  
    device_map="auto",          
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
def clean_html(raw_html):
    if pd.isna(raw_html):
        return ""
    soup = BeautifulSoup(raw_html, "html.parser")
    return soup.get_text(separator=" ", strip=True)

In [4]:
train_df = pd.read_csv("/root/autodl-tmp/data/seniority_labelled_development_set.csv")  

def row_to_text(row, include_label=True):
    base = (
        f"Job Title: {row['job_title']}\n"
        f"Summary: {row['job_summary']}\n"
        f"Details: {row['job_ad_details']}\n"
        f"classification_name: {row['classification_name']}\n"
        f"subclassification_name: {row['subclassification_name']}\n"
    )
    return base + (f"\nLabel: y_true = {row['y_true']}" if include_label else "")

In [5]:
# use first 10 data in train set as few-shot
shots = "\n\n".join([f"example{i+1}：\n{row_to_text(train_df.iloc[i])}" for i in range(10)])

In [6]:
test_df = pd.read_csv("/root/autodl-tmp/data/seniority_labelled_test_set.csv")


test_df["predicted"] = ""
test_df["true_label"] = test_df["y_true"].astype(str)
results = []
correct=0

for i, row in test_df.iterrows():
    query = f"query：\n{row_to_text(row, include_label=False)}\nyour_predict：y_true ="

    user_prompt = (
        "You are a classification assistant. Below are some job descriptions along with their corresponding seniorities (y_true)：\n\n"
        "Please only give me answer, no more other things,like experienced, intermediate, senior, entry level, assistant, lead, head, junior and so on.)"
        # f"{shots}\n\n{query}"
        f"{query}"
    )

    messages = [
        # {"role": "system", "content": ""},
        {"role": "user", "content": user_prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **model_inputs,
        max_new_tokens=80,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    gen_ids = outputs[0][model_inputs["input_ids"].shape[-1]:]
    prediction = tokenizer.decode(gen_ids, skip_special_tokens=True).strip().split("\n")[0]

    results.append({
        "index": i,
        "predicted": prediction,
        "true_label": str(row["y_true"])
    })
    if prediction == str(row["y_true"]):
        correct+=1
    # save
    if (i + 1) % 100 == 0:
        checkpoint_df = pd.DataFrame(results)
        filename = f"gemma/gemma-1b-seniority_predictions_zeroshot.csv"
        checkpoint_df.to_csv(filename, index=False)
        print(f"{i+1},save result to：{filename}")
    number=i
    # if i ==220:
    #     break


filename = f"gemma/gemma-1b-seniority_predictions_zeroshot.csv"
final_df = pd.DataFrame(results)
final_df.to_csv(filename, index=False)

100,save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv
200,save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv
300,save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv
400,save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv
500,save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv
600,save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv


In [7]:

correct

4

In [8]:
number

688

In [9]:
accuracy=correct/number


In [None]:
print(f"accuracy：{accuracy:.2%}")
print(f"save result to：{filename}")

accuracy：0.58%
save result to：gemma/gemma-1b-seniority_predictions_zeroshot.csv


: 