Setup – Program

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import random

# Shared scrape state; the cells further down fill these in place.
company_names = dict()       # company name -> company profile URL
all_jobs = list()            # job links returned by get_company_job_links
jobs_details_list = list()   # dicts returned by fetch_job_details

def fetch_job_listings(page, timeout=30):
    """Download one Dice search-results page and return its listing header blocks.

    Args:
        page: Page number substituted into the ``page`` query parameter.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error. ``requests`` applies no timeout by default, so
            without this a stalled connection would block the notebook
            indefinitely.

    Returns:
        The BeautifulSoup result set of ``<div>`` tags matched by the class
        string ``header -mb-2 flex flex-row justify-between`` (empty when the
        page contains none).
    """
    url = f"https://www.dice.com/jobs?page={page}"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find_all('div', class_='header -mb-2 flex flex-row justify-between')

def get_company_links(companies, registry=None):
    """Record each company's profile URL, keyed by the company name.

    Args:
        companies: Iterable of parsed listing elements. Each one is searched
            for its first ``<p>`` (used as the company name) and its first
            ``<a>`` (whose ``href`` is used as the profile link).
        registry: Optional dict to fill. When left as ``None`` the
            module-level ``company_names`` dict is updated, which is the
            original behaviour.

    Elements that lack the ``<p>``, the ``<a>``, or a non-empty ``href`` are
    skipped instead of raising, so one malformed card cannot abort a page.
    """
    from urllib.parse import urljoin

    target = company_names if registry is None else registry
    for company in companies:
        name_tag = company.find('p')
        link_tag = company.find('a')
        href = link_tag.get('href') if link_tag is not None else None
        if name_tag is None or not href:
            continue
        # urljoin resolves site-relative hrefs against the Dice origin and
        # leaves already-absolute hrefs intact.
        target[name_tag.text.strip()] = urljoin('https://www.dice.com', href)

In [3]:
def fetch_job_details(url, timeout=30):
    """Scrape a single Dice job-detail page into a flat dict.

    Args:
        url: Address of the job-detail page.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error, so a stalled connection cannot hang the crawl.

    Returns:
        A dict with the keys ``post_id``, ``company_name``, ``job_name``,
        ``skills``, ``date_scraped`` and ``date_posted``. Every field whose
        element is missing from the page is ``None`` (``skills`` is ``[]``);
        ``date_scraped`` is the local date formatted as ``YYYY-MM-DD``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    dice_id = soup.find('li', attrs={'data-testid': 'legalInfo-companyName'})
    company_name = soup.find('a', attrs={'data-wa-click': 'djv-job-company-profile-click'})
    skills = soup.find_all('li', class_='text-sm text-gray-700')
    job_name = soup.find('h1', class_='text-pretty font-bold text-2xl sm:text-2xl md:text-2xl lg:text-3xl xl:text-3xl 2xl:text-3xl')

    # The post id is taken as the third whitespace-separated token of the
    # legal-info item; tolerate items that carry fewer tokens.
    post_id = None
    if dice_id is not None:
        id_tokens = dice_id.text.split()
        if len(id_tokens) > 2:
            post_id = id_tokens[2]

    # Default to None so the return below never touches an unbound name when
    # the page has no matching date span or the span has too few children.
    clean_date = None
    start_li = soup.find_all('span', {'class': 'text-sm font-normal text-font-light'})
    if start_li:
        inner_spans = start_li[0].find_all('span')
        if len(inner_spans) > 1:
            posted_date = inner_spans[1].get_text(strip=True)
            clean_date = posted_date.replace('• Posted', '').strip()

    return {
        "post_id": post_id,
        "company_name": company_name.get_text(strip=True) if company_name else None,
        "job_name": job_name.get_text(strip=True) if job_name else None,
        "skills": [skill.get_text(strip=True) for skill in skills] if skills else [],
        "date_scraped": time.strftime("%Y-%m-%d"),
        "date_posted": clean_date
    }


In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

In [5]:
def get_company_job_links(company_url):
    """Collect the job links listed on a Dice company profile page.

    A headless Chrome session opens ``company_url``, clicks the "Jobs" tab,
    reads the page count from the pagination widget, then loads
    ``company_url`` once per page (with a ``page=N`` query parameter) and
    gathers the ``href`` of every ``a.job-title-link`` element, descending
    into shadow roots.

    Args:
        company_url: URL of the company profile page.

    Returns:
        List of unique link URLs in first-seen order.

    Raises:
        Whatever Selenium raises (for example a timeout when the "Jobs" tab
        never becomes clickable). The browser is always closed first.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    all_job_links = []
    # Companion set for O(1) duplicate checks; the list keeps the order.
    seen_links = set()

    try:
        # Start on the first page to find out how many pages there are.
        driver.get(company_url)
        wait = WebDriverWait(driver, 15)

        # 1. Open the Jobs tab so the job list's Shadow DOM gets loaded.
        jobs_tab = wait.until(EC.element_to_be_clickable((By.XPATH, "//li[contains(text(), 'Jobs')]")))
        jobs_tab.click()

        # Wait for the job list element to appear.
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "dhi-job-search-job-list")))
        time.sleep(4)

        def get_total_pages():
            """Return the page count reported by the pagination widget (1 if unknown)."""
            # NOTE(review): the script strips every non-digit from the first
            # span containing 'of'. If that span holds both numbers (e.g.
            # "1 of 12") the digits would be concatenated -- confirm against
            # the live markup.
            script = """
            const pagination = document.querySelector('dhi-job-search-job-list')?.shadowRoot
                               ?.querySelector('seds-pagination')?.shadowRoot.querySelector('.page');
            if (!pagination) return 1;
            const spans = pagination.querySelectorAll('span');
            for (let s of spans) {
                if (s.innerText.includes('of')) {
                    return parseInt(s.innerText.replace(/[^0-9]/g, ''));
                }
            }
            return 1;
            """
            raw_count = driver.execute_script(script)
            try:
                return int(raw_count)
            except (TypeError, ValueError):
                # The script produced no usable number (e.g. parseInt found
                # no digits); fall back to a single page instead of crashing.
                return 1

        total_pages = get_total_pages()

        # Link-extraction script (pierces the Shadow DOM).
        pierce_links_script = """
        function findAllLinksDeep(selector, root = document, links = []) {
            const foundInRoot = root.querySelectorAll(selector);
            foundInRoot.forEach(el => { if (el.href) links.push(el.href); });
            const hosts = root.querySelectorAll('*');
            for (const host of hosts) {
                if (host.shadowRoot) findAllLinksDeep(selector, host.shadowRoot, links);
            }
            return links;
        }
        return findAllLinksDeep('a.job-title-link');
        """

        # Use '&' when the URL already has a query string, '?' otherwise.
        # The choice does not depend on the page number, so compute it once.
        separator = "&" if "?" in company_url else "?"

        for page_num in range(1, total_pages + 1):
            # Build the URL for this page.
            page_url = f"{company_url}{separator}page={page_num}"
            driver.get(page_url)

            # After changing the URL, wait for the Shadow DOM to reload.
            time.sleep(5)

            # Pull the links out of the page.
            current_links = driver.execute_script(pierce_links_script) or []
            for link in current_links:
                if link not in seen_links:
                    seen_links.add(link)
                    all_job_links.append(link)

    finally:
        driver.quit()
    return all_job_links

In [6]:
import pickle
def save_fetched_data(data, filename='jobs_data.pkl'):
    """Pickle ``data`` into ``filename`` (binary write) and print the destination."""
    with open(filename, 'wb') as out_file:
        pickle.Pickler(out_file).dump(data)
    print(f"Data saved to {filename}")

In [14]:
import json
import os

def save_all_to_json(all_scraped_data, filename='jobs_data.json', output_dir=None):
    """Append scraped job records to a JSON file.

    By default the file lives in the ``Raw_data`` folder one level above the
    current working directory (i.e. stepping out of ``src``).

    Args:
        all_scraped_data: Iterable of dicts as produced by
            ``fetch_job_details``. When empty or ``None`` nothing is written.
        filename: Name of the JSON file inside the output folder.
        output_dir: Folder to write into. ``None`` (the default) keeps the
            original ``../Raw_data`` location.

    Every record is normalised to strings (missing values become ``""``,
    missing or ``None`` skills become ``[]``) and appended to whatever list
    the file already holds. A missing or unparsable file is treated as empty.
    """
    if not all_scraped_data:
        print("ไม่มีข้อมูลให้บันทึก")
        return

    # 1. Resolve the destination; '..' steps one level out of src.
    target_folder = os.path.join('..', 'Raw_data') if output_dir is None else output_dir
    full_path = os.path.join(target_folder, filename)

    # 2. Create the folder when it does not exist yet. exist_ok covers the
    #    case where it appears between the check and the call.
    if not os.path.exists(target_folder):
        os.makedirs(target_folder, exist_ok=True)
        print(f"สร้างโฟลเดอร์ใหม่ที่: {target_folder}")

    # 3. Load what the file already holds, if anything. Opening inside the
    #    try avoids a separate exists() check that could go stale.
    try:
        with open(full_path, 'r', encoding='utf-8') as f:
            master_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        master_data = []

    # 4. Normalise the new records.
    new_entries = []
    for entry in all_scraped_data:
        clean_entry = {
            "post_id": str(entry.get('post_id') or ""),
            "company_name": str(entry.get('company_name') or ""),
            "job_name": str(entry.get('job_name') or ""),
            # `or []` also covers a key that is present but holds None.
            "skills": [str(s) for s in (entry.get('skills') or [])],
            "date_scraped": str(entry.get('date_scraped') or ""),
            "date_posted": str(entry.get('date_posted') or "").replace('•', '').replace('Posted', '').strip()
        }
        new_entries.append(clean_entry)

    # 5. Merge and write everything back.
    master_data.extend(new_entries)
    with open(full_path, 'w', encoding='utf-8') as f:
        json.dump(master_data, f, ensure_ascii=False, indent=4)

    print("--- บันทึกสำเร็จ! ---")
    print(f"ตำแหน่งไฟล์: {os.path.abspath(full_path)}")  # show the full path for confirmation
    print(f"เพิ่ม {len(new_entries)} รายการ (รวมทั้งหมด {len(master_data)})")

In [8]:
# Walk the search-result pages 1..LISTING_PAGES, pausing a random
# 1-3 seconds after each fetch before recording its company links.
LISTING_PAGES = 25
LISTING_DELAY_RANGE = (1, 3)

for page in range(1, LISTING_PAGES + 1):
    companies = fetch_job_listings(page)
    print(f"Page {page} companies successfully fetched.")
    time.sleep(random.uniform(*LISTING_DELAY_RANGE))
    get_company_links(companies)

print(f'Number of company links = {len(company_names)}')

Page 1 companies successfully fetched.
Page 2 companies successfully fetched.
Page 3 companies successfully fetched.
Page 4 companies successfully fetched.
Page 5 companies successfully fetched.
Page 6 companies successfully fetched.
Page 7 companies successfully fetched.
Page 8 companies successfully fetched.
Page 9 companies successfully fetched.
Page 10 companies successfully fetched.
Page 11 companies successfully fetched.
Page 12 companies successfully fetched.
Page 13 companies successfully fetched.
Page 14 companies successfully fetched.
Page 15 companies successfully fetched.
Page 16 companies successfully fetched.
Page 17 companies successfully fetched.
Page 18 companies successfully fetched.
Page 19 companies successfully fetched.
Page 20 companies successfully fetched.
Page 21 companies successfully fetched.
Page 22 companies successfully fetched.
Page 23 companies successfully fetched.
Page 24 companies successfully fetched.
Page 25 companies successfully fetched.
Number of

Selenium

In [9]:
# Slice bounds over the collected company URLs; narrow them to resume or
# to crawl only part of the list.
selenium_start = 0
selenium_end = len(company_names)

for company_url in list(company_names.values())[selenium_start:selenium_end]:
    # Isolate failures per company (same policy as the job-details loop):
    # one profile that fails to load must not abort the whole crawl.
    try:
        jobs_link = get_company_job_links(company_url)
    except Exception as e:
        print(f'Error fetching job links from {company_url}: {e}')
    else:
        all_jobs.extend(jobs_link)
        print(f"Fetched job links from {len(jobs_link)} jobs at {company_url} Successfully.")
    # Pause between companies whether or not the fetch succeeded.
    time.sleep(random.uniform(1, 3))
print(f'Total job links fetched: {len(all_jobs)}')

Fetched job links from 7 jobs at https://www.dice.com/company-profile/c7af1038-e10f-5c6c-adf6-4debf5924a68?companyname=Global%20Infotek%2C%20Inc. Successfully.
Fetched job links from 3 jobs at https://www.dice.com/company-profile/9200e0ea-f6aa-58a6-a2ff-f19030b62ab8?companyname=BlueCross%20BlueShield%20Of%20South%20Carolina Successfully.
Fetched job links from 1 jobs at https://www.dice.com/company-profile/ca8506bd-5cb6-4ae6-bf2d-067b3cd17bc4?companyname=University%20of%20Michigan Successfully.
Fetched job links from 1 jobs at https://www.dice.com/company-profile/7d36a679-f780-551a-bbb0-b0138d8d4acb?companyname=F2ONSITE Successfully.
Fetched job links from 17 jobs at https://www.dice.com/company-profile/851eb002-5af9-5ba2-920e-07f769f6d828?companyname=BURGEON%20IT%20SERVICES%20LLC Successfully.
Fetched job links from 11 jobs at https://www.dice.com/company-profile/c0115b1f-65e5-5326-982c-6095381fed03?companyname=REDLEO%20SOFTWARE%20INC. Successfully.
Fetched job links from 7 jobs at ht

In [11]:
# Pickle the collected job links into all_jobs_links.pkl.
save_fetched_data(all_jobs, filename='all_jobs_links.pkl')

Data saved to all_jobs_links.pkl


Jobs Details

In [12]:
# Slice bounds over all_jobs; by default every collected link is processed.
start = 0
end = len(all_jobs)

for i, job_url in enumerate(all_jobs[start:end]):
    try:
        print(f"Fetching details for URL: {job_url}")
        job_details = fetch_job_details(job_url)
        # Skip records identical to one already collected.
        if job_details not in jobs_details_list:
            jobs_details_list.append(job_details)
        print(f'{i+1}. fetched successfully.')
    except Exception as e:
        # Best effort: report the failure and move on to the next link.
        # There is deliberately no `finally: continue` here -- that form
        # discards every in-flight exception, KeyboardInterrupt included,
        # which made the loop impossible to interrupt.
        print(f'Error fetching job details from {job_url}: {e}')
    # Throttle after every attempt, including failed ones.
    time.sleep(random.uniform(1, 3))
print(f'Total job details fetched: {len(jobs_details_list)}')

Fetching details for URL: https://www.dice.com/job-detail/ee521b15-b2a6-4d6d-a85a-62f587ef8652
1. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/4852337b-d707-44e4-ae22-68028b2e9a54
2. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/0d931f92-67c4-4df7-b93c-1eb190ea97a2
3. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/eee669b9-c0f7-43d5-95e1-08728ff664c0
4. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/095e3f94-e070-4e90-b6a4-d94470dc608a
5. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/1dcc837e-fd77-4e7c-9f66-94e4ac412edf
6. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/a358b331-edfa-47a6-97d8-92417a1668f2
7. fetched successfully.
Fetching details for URL: https://www.dice.com/job-detail/853c8fc5-87a0-40c5-b6c0-5b1263b504e8
8. fetched successfully.
Fetching details for URL: https://www.di

เก็บไว้ใน JSON

In [15]:
# Append the scraped job details to jobs_data.json.
save_all_to_json(jobs_details_list, filename='jobs_data.json')

สร้างโฟลเดอร์ใหม่ที่: ..\Raw_data
--- บันทึกสำเร็จ! ---
ตำแหน่งไฟล์: c:\GitHub\Clone\programming-language-trends-analysis\Raw_data\jobs_data.json
เพิ่ม 2525 รายการ (รวมทั้งหมด 2525)
