In [11]:
import asyncio
import json
import os
import re
from html import unescape
from urllib.parse import urlencode, quote_plus

from loguru import logger as log
from lxml import html
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

# Initialize the Scrapfly client. The API key is read from the SCRAPFLY_API_KEY
# environment variable so that no credential is stored in the notebook.
# NOTE(review): a live key was previously committed in this cell; it should be
# revoked/rotated in the Scrapfly dashboard.
_scrapfly_key = os.environ.get("SCRAPFLY_API_KEY")
if not _scrapfly_key:
    raise RuntimeError(
        "Set the SCRAPFLY_API_KEY environment variable to your Scrapfly API key"
    )
SCRAPFLY = ScrapflyClient(key=_scrapfly_key)

# Base configuration that is unpacked into every ScrapeConfig in this notebook
BASE_CONFIG = {
    "asp": True,
    "country": "US",
    "headers": {
        "Accept-Language": "en-US,en;q=0.5"
    }
}

## 1. Scrape URLs of job listings from the job search page

In [None]:
# Parses job URLs and the total number of job listings from a job search page
async def parse_job_search(response: ScrapeApiResponse):
    """Extract job links and the reported result count from a search page.

    The function contains no ``await``; it stays ``async`` so that existing
    callers, which await it, keep working.

    Args:
        response: Scrapfly response whose ``scrape_result['content']`` holds
            the page HTML.

    Returns:
        dict with two keys:
            ``"urls"``: list of ``href`` values taken from ``<a>`` elements
                whose class contains ``base-card__full-link``.
            ``"total_results"``: int parsed from the
                ``results-context-header__job-count`` span, or 0 when that
                span is missing or contains no digits.
    """
    # The HTML is stored under the "content" key of the scrape result
    content = response.scrape_result['content']
    tree = html.fromstring(content)
    log.info(f"tree detail: {tree}")

    # Job links are the href attributes of the card anchors
    urls = list(tree.xpath('//a[contains(@class, "base-card__full-link")]/@href'))

    # The count can be formatted for display (e.g. "1,000+"), which int()
    # rejects, so only the digit characters are kept before converting
    count_text = tree.xpath("//span[@class='results-context-header__job-count']/text()")
    digits = "".join(ch for ch in count_text[0] if ch.isdigit()) if count_text else ""
    total_results = int(digits) if digits else 0

    return {"urls": urls, "total_results": total_results}

In [12]:
async def scrape_job_search(keyword: str, max_pages: int = None):
    """Collect job listing URLs for a keyword from LinkedIn job search.

    The first page is fetched from the regular search URL; it supplies both
    the first batch of URLs and the total result count. The remaining pages
    are fetched concurrently from the ``seeMoreJobPostings`` endpoint using a
    ``start`` offset that advances in steps of 25.

    Args:
        keyword: Search phrase, passed unencoded (it is URL-encoded here).
        max_pages: Optional cap on the number of pages, counting the first
            page. ``None`` (or 0) means no cap.

    Returns:
        list of job URL strings, or ``None`` if any follow-up page responds
        with a status code other than 200.
    """
    page_size = 25  # step of the `start` offset between consecutive pages

    # urlencode() percent-encodes the value itself; quoting it beforehand
    # would double-encode it (spaces would end up as "%2B")
    query = urlencode({'keywords': keyword})

    # Get the response of the first page
    first_page_url = "https://www.linkedin.com/jobs/search?f_E=1%2C2&geoId=104195383&" + query
    first_page_response = await SCRAPFLY.async_scrape(ScrapeConfig(first_page_url, **BASE_CONFIG, render_js=True))

    # Scrape URLs of job listings in the first page and total results
    first_page_data = await parse_job_search(first_page_response)
    urls = first_page_data['urls']
    total_results = first_page_data['total_results']

    # Apply the page cap by limiting the number of results to walk through
    if max_pages and max_pages * page_size < total_results:
        total_results = max_pages * page_size

    # Offsets start at page_size because offset 0 is the first page, and stop
    # before total_results so no page beyond the last result is requested
    other_pages_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?f_E=1%2C2&geoId=104195383&"
    to_scrape = [
        ScrapeConfig(other_pages_url + query + f"&start={offset}", **BASE_CONFIG, render_js=True)
        for offset in range(page_size, total_results, page_size)
    ]
    log.info(f'Scraped the first job page, {len(to_scrape)} more pages')

    # Scrape the remaining pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        if response.status_code == 200:
            result = await parse_job_search(response)
            page_urls = result['urls']
            urls.extend(page_urls)
            log.debug(f"Scraped {len(page_urls)} jobs from this page. Total jobs collected: {len(urls)}")
        else:
            log.error(f"Failed to scrape: Status code {response.status_code}")
            return None

    log.success(f'Scraped {len(urls)} jobs from LinkedIn job search')
    return urls

In [21]:
async def run():
    job_search_data = await scrape_job_search(
        keyword="financial risk management", max_pages = 10
    )
    print(job_search_data)
    # Save the data to a JSON file
    with open("u2_financial_risk_management.json", "w", encoding="utf-8") as file:
        json.dump(job_search_data, file, indent=2, ensure_ascii=False)

# Directly use `await` in the interactive shell
await run()

[32m2025-02-17 22:32:36.572[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_job_search[0m:[36m29[0m - [1mtree detail: <Element html at 0x1090b77f0>[0m
[32m2025-02-17 22:32:36.573[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrape_job_search[0m:[36m21[0m - [1mScraped the first job page, 3 more pages[0m
[32m2025-02-17 22:32:44.594[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_job_search[0m:[36m29[0m - [1mtree detail: <Element html at 0x1090b4e10>[0m
[32m2025-02-17 22:32:44.596[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mscrape_job_search[0m:[36m34[0m - [34m[1mScraped 2 jobs from this page. Total jobs collected: 56[0m
[32m2025-02-17 22:32:54.119[0m | [1mINFO    [0m | [36m__main__[0m:[36mparse_job_search[0m:[36m29[0m - [1mtree detail: <Element html at 0x1090b7430>[0m
[32m2025-02-17 22:32:54.121[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mscrape_job_search[0m:[36m34[0m - [34m[1mScraped 2 jobs from this page. Tota

['https://vn.linkedin.com/jobs/view/associate-executive-%E2%80%93-credit-risk-management-financial-services-at-momo-m-service-4123597192?position=1&pageNum=0&refId=O0i1L5PHsGdcwZVTYClO4w%3D%3D&trackingId=xk6AfYedkcamZu6ouMyYdg%3D%3D', 'https://vn.linkedin.com/jobs/view/relationship-manager-at-orient-commercial-joint-stock-bank-ocb-4137448765?position=2&pageNum=0&refId=O0i1L5PHsGdcwZVTYClO4w%3D%3D&trackingId=mdI5JeB0n4YTt%2FIzp70V5g%3D%3D', 'https://vn.linkedin.com/jobs/view/internal-inspection-staff-at-ctbc-bank-vietnam-4148869164?position=3&pageNum=0&refId=O0i1L5PHsGdcwZVTYClO4w%3D%3D&trackingId=mca9wm2oqThDH%2FYBMtRO%2BQ%3D%3D', 'https://vn.linkedin.com/jobs/view/kyc-officer-vietnam-morning-day-shifts-at-cybertexex-4074882596?position=4&pageNum=0&refId=O0i1L5PHsGdcwZVTYClO4w%3D%3D&trackingId=DiAaaGR9Ih7hpHFtnaXHMw%3D%3D', 'https://vn.linkedin.com/jobs/view/overseas-remittance-staff-at-ctbc-bank-vietnam-4152673294?position=5&pageNum=0&refId=O0i1L5PHsGdcwZVTYClO4w%3D%3D&trackingId=4DmY

## 2. Scrape job details

In [None]:
# Load the job URLs written by run(). The path is relative, like the one run()
# writes to, so both cells refer to the same file regardless of the machine
file_path = 'u2_financial_risk_management.json'
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)
# The file may contain `null` if a failed search was saved; treat it as no URLs
urls = list(json_data or [])

In [None]:
# Save the scraped descriptions to a JSON file
def save_descriptions_to_file(descriptions, file_path_json='/Users/nhuyenhuynh/d2_financial_risk_management.json'):
    """Write ``descriptions`` to ``file_path_json`` as indented UTF-8 JSON.

    Saving is best-effort: any error is printed rather than raised.

    Args:
        descriptions: JSON-serialisable object (the notebook passes a list of
            description strings).
        file_path_json: Destination path. Defaults to the location this
            notebook has always written to; pass another path to save
            elsewhere.

    Returns:
        None.
    """
    try:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        with open(file_path_json, 'w', encoding='utf-8') as file:
            json.dump(descriptions, file, ensure_ascii=False, indent=4)
        print(f"Descriptions saved to {file_path_json}")

    except Exception as e:
        print(f"Error saving descriptions to file: {e}")

In [None]:
async def scrape_urls():
    """Scrape every job page in the global ``urls`` and save the descriptions.

    For each successful response, the first ``application/ld+json`` script is
    parsed, its ``"description"`` field is HTML-unescaped and stripped of
    tags, and the resulting text is collected. A response that fails (status
    other than 200, or an error while processing it) is logged and skipped,
    so one bad page does not discard the rest. Whatever was collected is
    passed to ``save_descriptions_to_file`` once all responses are handled.

    Returns:
        ``'finish the scraping'`` when every response was handled without a
        failure, otherwise ``None``.
    """
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]

    # Cleaned description texts, in the order responses arrive
    descriptions = []
    failures = 0

    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        try:
            if response.status_code != 200:
                log.error(f"Failed to scrape: Status code {response.status_code}")
                failures += 1
                continue

            # Parse the HTML and locate the embedded JSON-LD script
            tree = html.fromstring(response.scrape_result['content'])
            json_script = tree.xpath('//script[@type="application/ld+json"]/text()')
            if not json_script:
                print("No <script> tag with type 'application/ld+json' found")
                continue

            try:
                # Only the first matching script tag is used
                job_posting_data = json.loads(json_script[0])
            except json.JSONDecodeError as e:
                print(f"Failed to decode JSON: {e}")
                continue

            description = job_posting_data.get("description", "")

            # Decode HTML entities (e.g., &lt;br&gt; to <br>)
            description_cleaned = unescape(description)

            # Replace any remaining HTML tags with a space
            description_no_tags = re.sub(r'<.*?>', ' ', description_cleaned)

            descriptions.append(description_no_tags)

        except Exception as e:
            # Keep going: a single malformed page should not lose the others
            log.error(f"Error processing response: {str(e)}")
            failures += 1

    # Save the descriptions to a file after scraping is complete
    save_descriptions_to_file(descriptions)

    if failures:
        log.warning(f"{failures} job pages failed; saved {len(descriptions)} descriptions")
        return None
    return 'finish the scraping'

In [None]:
if __name__ == "__main__":
    if not asyncio.get_event_loop().is_running():
        result = asyncio.run(scrape_urls())
    else:
        result = await scrape_urls()