# Requirements:
langchain <br>
langchain_ollama <br>
selenium <br>
beaufifulsoup4 <br>
lxml <br>
html5lib <br>
python-dotenv

## Scraper

In [23]:
import selenium.webdriver as webdriver
from selenium.webdriver.chrome.service import Service

from bs4 import BeautifulSoup

In [64]:
def scrape_website(website):
    print("Launching chrome browser...")

    chrome_driver_path = '../utils/chromedriver'
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print("Page loaded...")
        html = driver.page_source

        return html
    
    finally:
        driver.quit()

def extract_body_content(html_content):
    soup = BeautifulSoup(html_content,"html.parser")
    body_content = soup.body

    if body_content:
        print("Content extracted...")
        return str(body_content)
    
    print("Unable to collect content")
    
def clean_body_content(body_content):
    soup = BeautifulSoup(body_content,"html.parser")

    for script_or_style in soup(["script","style"]):
        script_or_style.extract()

    cleaned_content = soup.get_text(separator='\n')
    cleaned_content = "\n".join(line.strip() for line in cleaned_content.splitlines() if line.strip())

    print("Content cleaned...")
    return cleaned_content

def split_dom_content(dom_content,max_length=6000):
    return [
        dom_content[i:i+max_length] for i in range(0,len(dom_content),max_length)
    ]

# Parser

Once Ollama is downloaded just do a 'ollama pull 'model-name'' to pull and be able to utilize the model

In [127]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from dataclasses import dataclass
import json
import re

model = OllamaLLM(model='llama3')

@dataclass
class JobData:
    title: str
    company: str
    salary: str
    location: str
    description: str

# Updated template
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Extract Information:** Only extract the information that directly matches the parse description: {parse_description}.\n"
    "2. **Output Format:** Use the following JSON format to present your response:\n"
    "   [\n"
    "   {{\n"
    "       \"title\": \"<job title>\",\n"
    "       \"company\": \"<company name>\",\n"
    "       \"joblink\": \"<job link>\",\n"
    "       \"location\": \"<job location>\",\n"
    "       \"description\": \"<job description>\",\n"
    "   }}\n"
    "   ]\n"
    "4. **Misssing Information:** If there is any information for each job that is not mentioned fill that section with 'na'.\n"
    "5. **No Extra Content:** Do not include any additional text, notes, comments, or explanations in your response.\n"
    "6. **No content leading or following data:** Do not include any reponses before and after the data in its outputted format.\n"
    "6. **Empty Response:** If no information matches the description, return an empty JSON array: `[]`.\n"
)

In [121]:
def parse_with_ollama(dom_chunks,parse_description):
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    for i,chunk in enumerate(dom_chunks,start=1):
        response = chain.invoke({"dom_content":chunk, "parse_description":parse_description})
        print(f"Parsed batch {i} of {len(dom_chunks)}")

        json_match = re.search(r"(\[.*\])", response, re.DOTALL)

        try:
            # Attempt to parse the response as JSON
            if json_match:
                for job in json.loads(json_match.group(1)):
                    parsed_results.append(
                        JobData(
                            title=job.get("title", ""),
                            company=job.get("company", ""),
                            salary=job.get("salary", "Not mentioned"),
                            location=job.get("location", ""),
                            description=job.get("description", "")
                        )
                    )
        except json.JSONDecodeError as e:
            print(f"Error parsing response: {e}")

    return parsed_results

In [128]:
url = 'https://careers.smartrecruiters.com/WesternDigital'

In order to make this work you have to download the chrome driver and also run it in terminal 'pwd'/chromedriver and then when it pops up with a warning you open it

In [129]:
result = scrape_website(url)
body_content = extract_body_content(result)
cleaned_content = clean_body_content(body_content)
dom_chunks = split_dom_content(cleaned_content)

Launching chrome browser...
Page loaded...
Content extracted...
Content cleaned...


In [130]:
res = parse_with_ollama(dom_chunks,'Extract job information from all posted jobs.')

Parsed batch 1 of 2
Error parsing response: Expecting property name enclosed in double quotes: line 5 column 45 (char 139)
Parsed batch 2 of 2


In [131]:
res

[JobData(title='Quality Assurance Engineering', company='', salary='Not mentioned', location='na', description='na')]