In [3]:
import pandas as pd
import re
import json
from bs4 import BeautifulSoup

In [4]:
# Locations of the three labelled test sets (one CSV per task)
SENIORITY_CSV = "/root/autodl-tmp/data/seniority_labelled_test_set.csv"
WORK_CSV = "/root/autodl-tmp/data/work_arrangements_test_set.csv"
SALARY_CSV = "/root/autodl-tmp/data/salary_labelled_test_set.csv"

# Read each CSV into its own DataFrame; the later cells iterate over these row by row
seniority_df = pd.read_csv(SENIORITY_CSV)
work_df = pd.read_csv(WORK_CSV)
salary_df = pd.read_csv(SALARY_CSV)

In [5]:
# HTML cleaner for salary dataset
def clean_html(raw_html):
    """Strip HTML markup from a value and return its plain text.

    Args:
        raw_html: HTML fragment, normally a string. Any other non-missing
            value is coerced with ``str()`` before being parsed.

    Returns:
        str: The text content extracted by BeautifulSoup's ``html.parser``.
        A missing value (``None`` or a float NaN, which is what pandas yields
        for an empty CSV cell) returns ``""`` rather than the literal text
        "None" / "nan", so placeholders never leak into the generated prompt.
    """
    # NaN is the only float that compares unequal to itself.
    is_missing = raw_html is None or (
        isinstance(raw_html, float) and raw_html != raw_html
    )
    if is_missing:
        return ""
    return BeautifulSoup(str(raw_html), "html.parser").get_text()

In [6]:
# Build Alpaca-style records (instruction / input / output) for the seniority task.
# The instruction is the same for every row, so it is defined once up front.
SENIORITY_INSTRUCTION = "You are a classification assistant. Below are some job descriptions along with their corresponding seniorities (y_true):\n\nPlease only give me answer, no more other things,like experienced, intermediate, senior, entry level, assistant, lead, head, junior and so on."

seniority_data = [
    {
        "instruction": SENIORITY_INSTRUCTION,
        # Prompt body: the five descriptive columns, one labelled field per line
        "input": f"Job Title: {row['job_title']}\nSummary: {row['job_summary']}\nDetails: {row['job_ad_details']}\nClassification: {row['classification_name']}\nSubclassification: {row['subclassification_name']}",
        # Ground-truth label taken verbatim from the CSV
        "output": row['y_true'],
    }
    for _, row in seniority_df.iterrows()
]

In [7]:
# Build Alpaca-style records (instruction / input / output) for the work-arrangement task.
# The instruction is the same for every row, so it is defined once up front.
WORK_INSTRUCTION = "You are a classification assistant. Below are some job descriptions along with their corresponding work arrangement (y_true):\n\nPlease only give me answer, no more other things,you should choose a word in OnSite, Remote and Hybrid as answer."

work_data = [
    {
        "instruction": WORK_INSTRUCTION,
        # The job-ad column is passed through unchanged as the model input
        "input": row['job_ad'],
        # Ground-truth label taken verbatim from the CSV
        "output": row['y_true'],
    }
    for _, row in work_df.iterrows()
]

In [8]:
# Build Alpaca-style records (instruction / input / output) for the salary task.
SALARY_INSTRUCTION = "You are a classification assistant. Below are some job descriptions along with their corresponding salaries (y_true):\n\nPlease only give me answer, no more other things. The structure should be like 100-200-AUS-MONTHLY. (No commas are required between numbers) If you think there is no mention of salary, please give me 0-0-None-None."


def _salary_record(row):
    """Turn one salary-dataset row into an Alpaca-style dict."""
    # The details column holds HTML; reduce it to plain text before building the prompt
    details_clean = clean_html(row['job_ad_details'])
    return {
        "instruction": SALARY_INSTRUCTION,
        "input": f"Job Title: {row['job_title']}\nDetails: {details_clean}\nCountry: {row['nation_short_desc']}\nAdditional Info: {row['salary_additional_text']}",
        # Ground-truth label taken verbatim from the CSV
        "output": row['y_true'],
    }


salary_data = [_salary_record(row) for _, row in salary_df.iterrows()]

In [9]:
# Save each task's records to its own JSONL file (one JSON object per line).
# The encoding is pinned to UTF-8 because ensure_ascii=False writes non-ASCII
# characters as-is; with the platform-default encoding the write could raise
# UnicodeEncodeError or produce a file that is not valid UTF-8.
for jsonl_path, records in (
    ("/root/autodl-tmp/data/seniority_alpaca_test.jsonl", seniority_data),
    ("/root/autodl-tmp/data/work_alpaca_test.jsonl", work_data),
    ("/root/autodl-tmp/data/salary_alpaca_test.jsonl", salary_data),
):
    with open(jsonl_path, "w", encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [10]:
# Merge the three task datasets, keeping the order seniority -> work -> salary
all_data = [*seniority_data, *work_data, *salary_data]

# Write the merged records to one UTF-8 JSONL file, one JSON object per line
output_path = "/root/autodl-tmp/data/combined_data_test.jsonl"
with open(output_path, mode="w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in all_data
    )