In [4]:
!pip install pandas requests beautifulsoup4 nest_asyncio playwright
!playwright install

Collecting playwright
  Downloading playwright-1.54.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.54.0-py3-none-manylinux1_x86_64.whl (45.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.9/45.9 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.54.0 pyee-13.0.0
Downloading Chromium 139.0.7258.5 (playwright build v1181)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1181/chromium-linux.zip[22m
Chromium 139.0.7258.5 (playwright build v1181) downloaded to /root/.cache/ms-playwright/chromium-1181
Downloading Chromium Headless Shell 139.0.7258.5 (playwright build v1181)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1181/

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, parse_qs
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

nest_asyncio.apply()

In [7]:
# Step 1: Fetch Static Job Listing Info
def fetch_jobs_from_page(page_number, fcat_id):
    url = f'https://jobs.bdjobs.com/jobsearch.asp?fcatId={fcat_id}&icatId=&pg={page_number}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

In [8]:
# Step 2: Scrape Static Data
def scrape_static_data(num_pages=8, category_ids=range(61, 100)):
    job_details = []
    for fcat_id in category_ids:
        for page in range(1, num_pages + 1):
            soup = fetch_jobs_from_page(page, fcat_id)
            jobs = soup.find_all('div', class_=['norm-jobs-wrapper', 'sout-jobs-wrapper'])
            print(f"fcat_id {fcat_id}, Page {page}: Found {len(jobs)} jobs")

            for job in jobs:
                title_div = job.find('div', class_='job-title-text')
                title_a = title_div.find('a') if title_div else None
                job_link = title_a['href'] if title_a and title_a.has_attr('href') else 'N/A'
                parsed = urlparse(job_link)
                q = parse_qs(parsed.query)

                job_details.append({
                    "Title": title_a.get_text(strip=True) if title_a else 'N/A',
                    "Job Link": job_link,
                    "Job ID": q.get('id', ['N/A'])[0],
                    "Job Category ID": q.get('fcatId', ['N/A'])[0],
                    "Company Name": job.find('div', class_='comp-name-text').get_text(strip=True) if job.find('div', class_='comp-name-text') else 'N/A',
                    "Promotion Text": job.find('div', class_='promo-text').get_text(strip=True) if job.find('div', class_='promo-text') else 'N/A',
                    "Location": job.find('div', class_='locon-text-d').get_text(strip=True) if job.find('div', class_='locon-text-d') else 'N/A',
                    "Experience Required": job.find('div', class_='exp-text-d').get_text(strip=True) if job.find('div', class_='exp-text-d') else 'N/A',
                    "Deadline": job.find('div', class_='dead-text-d').get_text(strip=True) if job.find('div', class_='dead-text-d') else 'N/A'
                })
    return pd.DataFrame(job_details)

In [10]:
# Step 3: Scrap Dynamic Data
async def fetch_dynamic_data(page, link):
    job_data = {
        "Job Link": link,
        "Vacancy": "N/A",
        "Age": "N/A",
        "Job Location": "N/A",
        "Salary": "N/A",
        "Experience": "N/A",
        "Published": "N/A",
        "Additional Requirements": "N/A",
        "Education": "N/A",
        "Remuneration Package": "N/A",
        "Employment Status": "N/A",
        "Gender": "N/A"
    }

    try:
        await page.goto(link, timeout=60000)
        await page.wait_for_selector('#allSection', timeout=5000)
        html = await page.content()
        soup = BeautifulSoup(html, 'html.parser')

        # Section: #allSection
        section = soup.find('div', id='allSection')
        if section:
            for div in section.find_all('div', recursive=True):
                text = div.get_text(separator=' ', strip=True)
                if 'Vacancy:' in text:
                    job_data["Vacancy"] = text.replace('Vacancy:', '').strip()
                elif 'Age:' in text:
                    job_data["Age"] = text.replace('Age:', '').strip()
                elif 'Location:' in text:
                    job_data["Job Location (Dynamic)"] = text.replace('Location:', '').strip()
                elif 'Salary:' in text:
                    job_data["Salary"] = text.replace('Salary:', '').strip()
                elif 'Experience:' in text:
                    job_data["Experience (Dynamic)"] = text.replace('Experience:', '').strip()
                elif 'Published:' in text:
                    job_data["Published"] = text.replace('Published:', '').strip()

        # Section: Education and Additional Requirements
        requirements_div = soup.find('div', id='requirements')
        if requirements_div:
            # Education
            edu_block = requirements_div.find('p', string='Education')
            if edu_block:
                ul = edu_block.find_next('ul')
                if ul:
                    items = [li.get_text(strip=True) for li in ul.find_all('li')]
                    job_data["Education"] = "; ".join(items)

            # Additional Requirements
            add_block = requirements_div.find('p', string='Additional Requirements')
            if add_block:
                sibling_divs = add_block.find_next_siblings('div')
                if sibling_divs:
                    all_lis = []
                    for div in sibling_divs:
                        all_lis += [li.get_text(strip=True) for li in div.find_all('li')]
                    job_data["Additional Requirements"] = "; ".join(all_lis)

        # Remuneration Package
        salary_section = soup.find('div', id='salary')
        if salary_section:
            ul = salary_section.find('ul')
            if ul:
                rem_texts = [li.get_text(strip=True) for li in ul.find_all('li')]
                job_data["Remuneration Package"] = "; ".join(rem_texts)

        # Employment Status
        emp_status_label = soup.find('p', string='Employment Status')
        if emp_status_label:
            emp_status_value = emp_status_label.find_next('p')
            if emp_status_value:
                job_data["Employment Status"] = emp_status_value.get_text(strip=True)

        # Gender
        gender_label = soup.find('p', string='Gender')
        if gender_label:
            gender_value = gender_label.find_next('p')
            if gender_value:
                job_data["Gender"] = gender_value.get_text(strip=True)

    except Exception as e:
        print(f"Error on {link}: {e}")
    except PlaywrightTimeout:
        print(f"Timeout: {link}")

    return job_data

In [11]:
# Step 4: Use asyncio.gather for concurrency
async def scrape_dynamic_data_concurrent(job_links, concurrency=5):
    results = []
    semaphore = asyncio.Semaphore(concurrency)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()

        async def bound_scrape(link):
            async with semaphore:
                page = await context.new_page()
                data = await fetch_dynamic_data(page, link)
                await page.close()
                return data

        tasks = [bound_scrape(link) for link in job_links]
        results = await asyncio.gather(*tasks)

        await browser.close()
    return pd.DataFrame(results)

In [12]:
# Step 5: Run Entire Pipeline
async def run_full_scraper():
    static_df = scrape_static_data(num_pages=8, category_ids=range(61, 100))
    job_links = static_df['Job Link'].dropna().unique().tolist()
    dynamic_df = await scrape_dynamic_data_concurrent(job_links, concurrency=5)
    final_df = pd.merge(static_df, dynamic_df, on='Job Link', how='left')
    final_df.to_csv("bdjobs_scraped_data.csv", index=False)
    print("Scraping complete. Data saved to 'bdjobs_scraped_data.csv'")
    return final_df

In [13]:
# Run the script
await run_full_scraper()

fcat_id 61, Page 1: Found 30 jobs
fcat_id 61, Page 2: Found 0 jobs
fcat_id 61, Page 3: Found 0 jobs
fcat_id 61, Page 4: Found 0 jobs
fcat_id 61, Page 5: Found 0 jobs
fcat_id 61, Page 6: Found 0 jobs
fcat_id 61, Page 7: Found 0 jobs
fcat_id 61, Page 8: Found 0 jobs
fcat_id 62, Page 1: Found 50 jobs
fcat_id 62, Page 2: Found 9 jobs
fcat_id 62, Page 3: Found 0 jobs
fcat_id 62, Page 4: Found 0 jobs
fcat_id 62, Page 5: Found 0 jobs
fcat_id 62, Page 6: Found 0 jobs
fcat_id 62, Page 7: Found 0 jobs
fcat_id 62, Page 8: Found 0 jobs
fcat_id 63, Page 1: Found 48 jobs
fcat_id 63, Page 2: Found 0 jobs
fcat_id 63, Page 3: Found 0 jobs
fcat_id 63, Page 4: Found 0 jobs
fcat_id 63, Page 5: Found 0 jobs
fcat_id 63, Page 6: Found 0 jobs
fcat_id 63, Page 7: Found 0 jobs
fcat_id 63, Page 8: Found 0 jobs
fcat_id 64, Page 1: Found 17 jobs
fcat_id 64, Page 2: Found 0 jobs
fcat_id 64, Page 3: Found 0 jobs
fcat_id 64, Page 4: Found 0 jobs
fcat_id 64, Page 5: Found 0 jobs
fcat_id 64, Page 6: Found 0 jobs
fcat_i

Unnamed: 0,Title,Job Link,Job ID,Job Category ID,Company Name,Promotion Text,Location,Experience Required,Deadline,Vacancy,...,Salary,Experience,Published,Additional Requirements,Education,Remuneration Package,Employment Status,Gender,Job Location (Dynamic),Experience (Dynamic)
0,USG Operator/Medical Transcriptionist,https://jobs.bdjobs.com/jobdetails/?id=1394103...,1394103,61,Dhaka Central International Medical College & ...,DCIMCH is a leading healthcare institution com...,Shyamoli,1 to 3 year(s),30 Aug2025,2,...,Negotiable,,09 Aug 2025,Age 25 to 35 years,HSC; Bachelor/Honors,"Provident fund,Over time allowance; Salary Rev...",Full Time,Only Female,Dhaka (Shyamoli),1 to 3 years
1,Lead Generation & Excel Expert,https://jobs.bdjobs.com/jobdetails/?id=1393321...,1393321,61,M.K.I Outsourcing,,Sylhet Sadar,3 to 5 year(s),16 Aug2025,--,...,Negotiable,,07 Aug 2025,Age 25 to 35 years; Proficiency in lead genera...,Bachelor/Honors,Salary Review: Half Yearly; Festival Bonus: 2,Full Time,Only Male,Sylhet (Sylhet Sadar),3 to 5 years
2,Computer Operator (Female),https://jobs.bdjobs.com/jobdetails/?id=1393595...,1393595,61,BAJAJMART,,Chirirbandar,Na,17 Aug2025,2,...,Tk. 12000 - 15000 (Monthly),,07 Aug 2025,অভিজ্ঞদের অগ্রাধীকার দেওয়া হবে।; ই-মেইল এবং ইন...,HSC,,Full Time,Only Female,Dinajpur (Chirirbandar),
3,কম্পিউটার অপারেটর,https://jobs.bdjobs.com/jobdetails/?id=1393434...,1393434,61,Shaheed Lieutenant Tanzim Cantonment Public Sc...,,Ramu,2 to 3 year(s),21 Aug2025,01,...,Negotiable,,07 Aug 2025,Age 18 to 35 years,HSC; যে কোন বোর্ড হতে এইচএসসি/সমমানসহ সরকার অন...,গ্রেড - ১৬ (৯৩০০-২২৪৯০),Full Time,,Cox`s Bazar (Ramu),2 to 3 years
4,Computer Operator,https://jobs.bdjobs.com/jobdetails/?id=1393399...,1393399,61,Bazlul Huq Khan School & College,,Dhaka,2 to 4 year(s),25 Aug2025,2,...,Negotiable,,07 Aug 2025,Age 25 to 32 years; Good typing knowledge in b...,Bachelor/Honors,Provident fund; Salary Review: Yearly; Festiva...,Full Time,Only Male,Dhaka,2 to 4 years
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,Medical Technologist / Physiotherapist,https://jobs.bdjobs.com/jobdetails/?id=1393356...,1393356,92,Amin Physiotherapy & Fitness Center,,"Sylhet, Sylhet Sadar",Na,16 Aug2025,4,...,Tk. 15000 - 25000 (Monthly),,06 Aug 2025,Age 22 to 33 years,Bachelor of Science (BSc) in Physiotherapy; Di...,,Full Time,,"Sylhet, Sylhet (Sylhet Sa... Sylhet, Sylhet (S...",
719,অফিস সহায়ক (পুরুষ),https://jobs.bdjobs.com/jobdetails/?id=1392652...,1392652,92,E-Learning And Earning Ltd (Barishal Branch),,Barishal Sadar,At least 1 year(s),14 Aug2025,1,...,Tk. 7000 (Monthly),,04 Aug 2025,Age At least 18 years; ২৪ ঘন্টা অফিসে থাকা বাধ...,JSC / JDC / 8 pass,Salary Review: Yearly; Festival Bonus: 2,Full Time,Only Male,Barishal (Barishal Sadar),At least 1 years
720,Clinical Physiotherapist,https://jobs.bdjobs.com/jobdetails/?id=1390739...,1390739,92,Techno Health Bangladesh,,"Uttara, Banani",1 to 3 year(s),28 Aug2025,10,...,Tk. 20000 - 30000 (Monthly),,29 Jul 2025,Age 25 to 35 years; Excellent communication an...,Bachelor of Physiotherapy (BPT),,Full Time,Only Female,"Dhaka (Banani, Uttara)",1 to 3 years
721,Medical Technologist (Physiotherapy),https://jobs.bdjobs.com/jobdetails/?id=1387833...,1387833,92,ASPC ManipulationTherapy centre,,"Mohammadpur, Tangail Sadar",Na,20 Aug2025,5,...,Tk. 16000 - 20000 (Monthly),,24 Jul 2025,,Diploma in Medical Technology (Physiotherapy),Performance bonus; Salary Review: Yearly; Fest...,Full Time,,"Dhaka (Mohammadpur), Tang... Dhaka (Mohammadpu...",


In [15]:
from IPython.display import FileLink

FileLink(r'bdjobs_scraped_data.csv')