# Requirements:
langchain <br>
langchain_ollama <br>
selenium <br>
beaufifulsoup4 <br>
lxml <br>
html5lib <br>
python-dotenv

## Scraper

In [23]:
import selenium.webdriver as webdriver
from selenium.webdriver.chrome.service import Service

from bs4 import BeautifulSoup

In [54]:
def scrape_website(website):
    print("Launching chrome browser...")

    chrome_driver_path = '../utils/chromedriver'
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page loaded...")
        html = driver.page_source

        return html
    
    finally:
        driver.quit()

def extract_body_content(html_content):
    soup = BeautifulSoup(html_content,"html.parser")
    body_content = soup.body

    if body_content:
        print("Content extracted...")
        return str(body_content)
    
    print("Unable to collect content")
    
def clean_body_content(body_content):
    soup = BeautifulSoup(body_content,"html.parser")

    for script_or_style in soup(["script","style"]):
        script_or_style.extract()

    cleaned_content = soup.get_text(separator='\n')
    cleaned_content = "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())

    print("Content cleaned...")
    return cleaned_content

def split_dom_content(dom_content,max_length=6000):
    return [
        dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
    ]

# Parser

Once Ollama is downloaded just do a 'ollama pull 'model-name'' to pull and be able to utilize the model

In [43]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

model = OllamaLLM(model='llama3')

template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {parse_description}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string ('')."
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

In [41]:
def parse_with_ollama(dom_chunks,parse_description):
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i,chunk in enumerate(dom_chunks,start=1):
        response = chain.invoke({"dom_content":chunk, "parse_description":parse_description})
        print(f"Parsed batch {i} of {len(dom_chunks)}")

        parsed_results.append(response)

    return "\n".join(parsed_results)

In [50]:
url = 'https://jobs.apple.com/en-us/search?key=data%252520engineer&location=united-states-USA&sort=relevance&page=2'

In order to make this work you have to download the chrome driver and also run it in terminal 'pwd'/chromedriver and then when it pops up with a warning you open it

In [55]:
result = scrape_website(url)
body_content = extract_body_content(result)
cleaned_content = clean_body_content(body_content)
dom_chunks = split_dom_content(cleaned_content)

Launching chrome browser...
Page loaded...
Unable to collect content


TypeError: object of type 'NoneType' has no len()

In [44]:
res = parse_with_ollama(dom_chunks,'Extract all relevent job data including job title, job link, salary range, location, team')

Parsed batch 1 of 6
Parsed batch 2 of 6
Parsed batch 3 of 6
Parsed batch 4 of 6
Parsed batch 5 of 6
Parsed batch 6 of 6


In [45]:
res

'US-Business Expert\nSales and Business Development\nDec 10, 2024\nVarious Locations within United States\nHere is the extracted information:\n\n**US-Business Pro:**\n* Role Number: 200125453\n* Weekly Hours: 40 Hours\n\n**US-Genius:**\n* Role Number: 114438151\n* Weekly Hours: 40 Hours\n\n**US-Creative:**\n* Role Number: 114438149\n* Weekly Hours: 40 Hours\n\n**US-Operations Expert:**\n* Role Number: 114438152\n* Weekly Hours: 40 Hours\n\n**US-Expert:**\n* Role Number: 114438150\n* Weekly Hours: 40 Hours\nHere are the extracted job details:\n\n**US-Expert**\n\n* Role Number: N/A\n* Weekly Hours: N/A\n\n**US-Technical Specialist**\n\n* Role Number: 114438201\n* Weekly Hours: 40 Hours\n\n**US - Specialist: Full-Time, Part-Time, and Part-Time Temporary**\n\n* Role Number: 114438158\n* Weekly Hours: 40 Hours\n\n**WatchOS Software QA Engineer**\n\n* Role Number: 200576662\n* Weekly Hours: 40 Hours\n\n**Core Bringup Engineering Manager**\n\n* Role Number: 200582675\n* Weekly Hours: 40 Hours