In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
import csv




In [2]:
# load model from local
model_name = "/root/autodl-tmp/phi-3-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
train_df = pd.read_csv("/root/autodl-tmp/data/work_arrangements_development_set.csv")  

def row_to_text(row, include_label=True):
    base = (
        f"job_ad: {row['job_ad']}\n"
    )
    return base + (f"\nLabel: y_true = {row['y_true']}" if include_label else "")

In [4]:
# use first 10 data in train set as few-shot
shots = "\n\n".join([f"example{i+1}：\n{row_to_text(train_df.iloc[i])}" for i in range(10)])

In [5]:
test_df = pd.read_csv("/root/autodl-tmp/data/work_arrangements_test_set.csv")


test_df["predicted"] = ""
test_df["true_label"] = test_df["y_true"].astype(str)
results = []
correct=0

for i, row in test_df.iterrows():
    query = f"query：\n{row_to_text(row, include_label=False)}\nyour_predict：y_true ="

    user_prompt = (
        "You are a classification assistant. Below are some job descriptions along with their corresponding work arrangement (y_true)：\n\n"
        "Please only give me answer, give me only one word, no more other things,do not repeat answer, you should choose a word in OnSite, Remote and Hybrid as answer.)"
        f"{shots}\n\n{query}"
        # f"{query}"
    )

    messages = [
        # {"role": "system", "content": ""},
        {"role": "user", "content": user_prompt}
    ]

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    outputs = model.generate(
        **model_inputs,
        max_new_tokens=30,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    gen_ids = outputs[0][model_inputs["input_ids"].shape[-1]:]
    prediction = tokenizer.decode(gen_ids, skip_special_tokens=True).strip().split("\n")[0]

    results.append({
        "index": i,
        "predicted": prediction,
        "true_label": str(row["y_true"])
    })
    if prediction == str(row["y_true"]):
        correct+=1
    # save
    if (i + 1) % 100 == 0:
        checkpoint_df = pd.DataFrame(results)
        filename = f"phi/phi-1b-work_predictions.csv"
        checkpoint_df.to_csv(filename, index=False)
        print(f"{i+1},save result to：{filename}")
    number=i
    # if i ==220:
    #     break


filename = f"phi/phi-1b-work_predictions.csv"
final_df = pd.DataFrame(results)
final_df.to_csv(filename, index=False)

In [6]:

correct

0

In [7]:
number

98

In [8]:
accuracy=correct/number


In [None]:
print(f"accuracy：{accuracy:.2%}")
print(f"save result to：{filename}")