In [None]:
import requests
import math
import os
from dotenv import load_dotenv
import json
import re

# Read the Adzuna credentials from the environment (a local .env file is
# loaded first, if present) so they never have to be hardcoded here.
load_dotenv()
APP_ID = os.getenv("Adzuna_ID")
APP_KEY = os.getenv("Adzuna_KEY")
if not APP_ID or not APP_KEY:
    # Fail early with a clear message instead of sending an
    # unauthenticated request and failing later on the response body.
    raise RuntimeError(
        "Missing Adzuna credentials: set Adzuna_ID and Adzuna_KEY "
        "in the environment or in a .env file."
    )

results_per_page = 50   # number of results requested per API call
max_records = 6000      # upper bound on how many records to collect
records = []

page = 1

# Page through the search endpoint until the cap is reached, the API reports
# that everything has been collected, or a page comes back empty.
while len(records) < max_records:
    url = f"https://api.adzuna.com/v1/api/jobs/gb/search/{page}"
    params = {
        "app_id": APP_ID,
        "app_key": APP_KEY,
        "results_per_page": results_per_page,
        "what": "developer",
        "content-type": "application/json"
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP errors directly rather than as a KeyError further down.
    response.raise_for_status()
    data = response.json()

    page_results = data.get("results") or []
    if not page_results:
        # Nothing more to fetch. Without this guard an empty page would
        # leave len(records) unchanged and the loop would never terminate.
        break
    records.extend(page_results)

    total_available = data.get("count")
    if total_available is not None and len(records) >= total_available:
        break

    page += 1

# Enforce the cap even when max_records is not a multiple of results_per_page.
del records[max_records:]

print("Total records collected:", len(records))

Total records collected: 6000


In [None]:


def clean_html(text):
    """Return ``text`` with anything that looks like an HTML tag removed.

    A tag is any ``<...>`` span up to the first closing ``>``. The pattern
    uses ``[^>]`` rather than ``.`` so that tags broken across several lines
    are stripped as well (``.`` does not match a newline by default).

    Args:
        text: The string to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The input with every ``<...>`` span deleted; surrounding whitespace
        and HTML entities are left untouched.
    """
    if not text:
        return ""
    return re.sub(r"<[^>]*>", "", text)

# Build a slim record per job: identifier, title, company name and the
# tag-free description. Jobs without a description are skipped.
job_descriptions = [
    {
        "id": job.get("id"),
        "title": job.get("title"),
        # `or {}` also covers a "company" key that is present but null, which
        # a plain .get("company", {}) would turn into an AttributeError.
        "company": (job.get("company") or {}).get("display_name"),
        "description": clean_html(job.get("description", "")).strip()
    }
    for job in records
    if job.get("description")
]

print("Total job descriptions:", len(job_descriptions))


Total job descriptions: 6000


In [None]:
# Destination folder: step back from src/adzuna... up to raw_data.
# Note: adjust the number of ../ levels to match the folder depth.
output_dir = '../../raw_data'

# Output file name and its full path inside the destination folder.
filename = "adzuna_job_descriptions_6000.json"
file_path = os.path.join(output_dir, filename)

# Create the destination folder if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Write the records as indented UTF-8 JSON, keeping non-ASCII characters as-is.
with open(file_path, "w", encoding="utf-8") as out_file:
    json.dump(job_descriptions, out_file, ensure_ascii=False, indent=2)

print(f"File saved successfully at {file_path}")