# Requirements:
langchain <br>
langchain_ollama <br>
selenium <br>
beautifulsoup4 <br>
lxml <br>
html5lib <br>
python-dotenv

## Scraper

In [172]:
import selenium.webdriver as webdriver
from selenium.webdriver.chrome.service import Service

from bs4 import BeautifulSoup

In [173]:
def scrape_website(website, chrome_driver_path='../utils/chromedriver'):
    """Open *website* in Chrome through Selenium and return its HTML source.

    Args:
        website: URL handed to ``driver.get``.
        chrome_driver_path: Path to the ChromeDriver executable. Defaults to
            the location this notebook previously hard-coded, so existing
            single-argument calls behave as before.

    Returns:
        The value of ``driver.page_source`` read right after ``driver.get``
        returns.
    """
    print("Launching chrome browser...")

    service = Service(chrome_driver_path)
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(website)
        print("Page loaded...")
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page raises.
        driver.quit()

def extract_body_content(html_content):
    """Return the ``<body>`` element of *html_content* as an HTML string.

    A status line is printed in both cases. When the parsed document exposes
    no usable ``body``, the function returns ``None``.
    """
    body = BeautifulSoup(html_content, "html.parser").body

    if not body:
        print("Unable to collect content")
        return None

    print("Content extracted...")
    return str(body)

def clean_body_content(body_content):
    """Return the text of *body_content* with scripts, styles and blank lines removed.

    Every remaining line is stripped of surrounding whitespace and the lines
    are joined with ``"\\n"``.
    """
    soup = BeautifulSoup(body_content, "html.parser")

    # Drop <script>/<style> elements so their contents do not end up in the text.
    for tag in soup.find_all(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text(separator='\n')
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    cleaned_content = "\n".join(line for line in stripped_lines if line)

    print("Content cleaned...")
    return cleaned_content

def extract_links_and_titles_from_sections(body_content):
    """Collect ``href``/``title`` pairs for the links of every ``<section>``.

    Returns:
        A list with one entry per ``<section>`` that contains at least one
        ``<a href=...>``. Each entry is a list of
        ``{"href": ..., "title": ...}`` dicts, where ``title`` is the text of
        the next ``<h4>`` found after the link, or ``None`` if there is none.

    Note:
        ``find_next`` is called on the link itself, so the ``<h4>`` it returns
        is simply the next one later in the document; it is not restricted to
        the link's own section.
    """
    soup = BeautifulSoup(body_content, "html.parser")

    def describe(anchor):
        heading = anchor.find_next("h4")
        if heading:
            title = heading.get_text(strip=True)
        else:
            title = None
        # Will need to add a check beforehand to confirm this is actually a job link
        return {"href": anchor["href"], "title": title}

    per_section = (
        [describe(anchor) for anchor in section.find_all("a", href=True)]
        for section in soup.find_all("section")
    )
    # Sections without any link are left out of the result.
    sections_data = [entries for entries in per_section if entries]

    print("Links and titles extracted from sections...")
    return sections_data

def split_dom_content(dom_content, max_length=6000):
    """Cut *dom_content* into consecutive slices of at most *max_length* items.

    Works on anything that supports ``len`` and slicing (a string yields
    string chunks, a list yields list chunks). The last chunk may be shorter;
    empty input gives an empty list.
    """
    chunks = []
    for start in range(0, len(dom_content), max_length):
        stop = start + max_length
        chunks.append(dom_content[start:stop])
    return chunks

## Parser

Once Ollama is installed, run `ollama pull <model-name>` (for example `ollama pull llama3`) to download the model so it can be used below.

In [150]:
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from dataclasses import dataclass
import json
import re

# Module-level Ollama LLM handle for the 'llama3' model; get_links and
# parse_with_ollama pipe their prompts into it.
model = OllamaLLM(model='llama3')

@dataclass
class JobData:
    """Details of one job posting, as assembled by ``parse_with_ollama``."""

    # Each field is filled from the same-named key of the LLM's JSON reply.
    title: str
    company: str
    # parse_with_ollama falls back to "Not mentioned" when the key is absent.
    salary: str
    location: str
    description: str

@dataclass
class JobLinkData:
    """A job title paired with its link, as assembled by ``get_links``."""

    # Both fields are filled from the same-named key of the LLM's JSON reply.
    title: str
    link: str

# Prompt used by parse_with_ollama to extract full job details from one chunk
# of page text. {dom_content} and {parse_description} are filled in when the
# chain is invoked; doubled braces render as literal braces.
# The example object is valid JSON (no trailing comma) and its keys are the
# ones parse_with_ollama reads into JobData.
job_data_template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Extract Information:** Only extract the information that directly matches the parse description: {parse_description}.\n"
    "2. **Output Format:** Use the following JSON format to present your response:\n"
    "   [\n"
    "   {{\n"
    "       \"title\": \"<job title>\",\n"
    "       \"company\": \"<company name>\",\n"
    "       \"salary\": \"<job salary>\",\n"
    "       \"location\": \"<job location>\",\n"
    "       \"description\": \"<job description>\"\n"
    "   }}\n"
    "   ]\n"
    "3. **Missing Information:** If there is any information for each job that is not mentioned fill that section with 'na'.\n"
    "4. **No Extra Content:** Do not include any additional text, notes, comments, or explanations in your response.\n"
    "5. **No content leading or following data:** Do not include any responses before and after the data in its outputted format.\n"
    "6. **Empty Response:** If no information matches the description, return an empty JSON array: `[]`.\n"
)

# Prompt used by get_links to extract job titles and their links from one
# chunk of content. {dom_content} and {parse_description} are filled in when
# the chain is invoked; doubled braces render as literal braces.
# The example object is valid JSON (no trailing comma) and its keys are the
# ones get_links reads into JobLinkData.
job_link_template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Extract Information:** Only extract the information that directly matches the parse description: {parse_description}.\n"
    "2. **Output Format:** Use the following JSON format to present your response:\n"
    "   [\n"
    "   {{\n"
    "       \"title\": \"<job title>\",\n"
    "       \"link\": \"<href>\"\n"
    "   }}\n"
    "   ]\n"
    "3. **Missing Information:** If there is any information for each job that is not mentioned fill that section with 'na'.\n"
    "4. **No Extra Content:** Do not include any additional text, notes, comments, or explanations in your response.\n"
    "5. **No content leading or following data:** Do not include any responses before and after the data in its outputted format.\n"
    "6. **Empty Response:** If no information matches the description, return an empty JSON array: `[]`.\n"
)

In [151]:
def get_links(dom_chunks,parse_description):
    """Ask the LLM for job titles and links in each chunk and collect them.

    Args:
        dom_chunks: Sequence of content chunks; each one is sent to the model
            as ``dom_content`` in ``job_link_template``.
        parse_description: Instruction text inserted into the prompt as
            ``parse_description``.

    Returns:
        A list of ``JobLinkData`` built from every JSON object found in the
        model's replies. Missing ``title``/``link`` keys become ``""``.
        Replies without a bracketed JSON array are skipped silently; replies
        whose array is not valid JSON are reported with a printed message and
        skipped.
    """
    prompt = ChatPromptTemplate.from_template(job_link_template)
    chain = prompt | model

    total = len(dom_chunks)
    parsed_results = []

    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke({"dom_content":chunk, "parse_description":parse_description})
        print(f"Parsed batch {i} of {total}")

        # Greedy match: from the first '[' to the last ']' in the reply.
        json_match = re.search(r"(\[.*\])", response, re.DOTALL)
        if json_match is None:
            continue

        # Keep the try body down to the one call that can raise JSONDecodeError.
        try:
            jobs = json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            print(f"Error parsing response: {e}")
            continue

        for job in jobs:
            # The model may emit non-object items (strings, numbers, null);
            # skip them instead of crashing on .get and losing every result.
            if not isinstance(job, dict):
                continue
            parsed_results.append(
                JobLinkData(
                    title=job.get("title", ""),
                    link=job.get("link", ""),
                )
            )

    return parsed_results

In [174]:
def parse_with_ollama(dom_chunks,parse_description):
    """Ask the LLM for job details in each chunk and collect them.

    Args:
        dom_chunks: Sequence of content chunks; each one is sent to the model
            as ``dom_content`` in ``job_data_template``.
        parse_description: Instruction text inserted into the prompt as
            ``parse_description``.

    Returns:
        A list of ``JobData`` built from every JSON object found in the
        model's replies. Missing keys become ``""``, except ``salary`` which
        becomes ``"Not mentioned"``. Replies without a bracketed JSON array
        are skipped silently; replies whose array is not valid JSON are
        reported with a printed message and skipped.
    """
    prompt = ChatPromptTemplate.from_template(job_data_template)
    chain = prompt | model

    total = len(dom_chunks)
    parsed_results = []

    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke({"dom_content":chunk, "parse_description":parse_description})
        print(f"Parsed batch {i} of {total}")

        # Greedy match: from the first '[' to the last ']' in the reply.
        json_match = re.search(r"(\[.*\])", response, re.DOTALL)
        if json_match is None:
            continue

        # Keep the try body down to the one call that can raise JSONDecodeError.
        try:
            jobs = json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            print(f"Error parsing response: {e}")
            continue

        for job in jobs:
            # The model may emit non-object items (strings, numbers, null);
            # skip them instead of crashing on .get and losing every result.
            if not isinstance(job, dict):
                continue
            parsed_results.append(
                JobData(
                    title=job.get("title", ""),
                    company=job.get("company", ""),
                    salary=job.get("salary", "Not mentioned"),
                    location=job.get("location", ""),
                    description=job.get("description", ""),
                )
            )

    return parsed_results

In [175]:
# URL of a single job posting; scraped and passed to parse_with_ollama in the cells below.
ind_job_url = 'https://explore.jobs.netflix.net/careers/job?domain=netflix.com&pid=790299385411&query=data%20engineer&domain=netflix.com&sort_by=relevance&utm_source=Netflix%20Careersite&jobIndex=0&job_index=0'

In [134]:
# Careers page URL; scraped and passed to get_links in the cells below.
url = 'https://careers.smartrecruiters.com/WesternDigital'

To make this work, download ChromeDriver and run it once from a terminal (`$(pwd)/chromedriver`). When a warning pops up, choose to open it anyway.

In [178]:
# Single-posting pipeline: fetch the page, keep only <body>, reduce it to
# plain text, then cut that text into chunks (default 6000 characters each).
result = scrape_website(ind_job_url)
body_content = extract_body_content(result)
cleaned_content = clean_body_content(body_content)
dom_chunks = split_dom_content(cleaned_content)

Launching chrome browser...
Page loaded...
Content extracted...
Content cleaned...


In [180]:
# Send every text chunk to the model and collect the JobData results.
res = parse_with_ollama(dom_chunks,parse_description='Pull all relevant job information.')

Parsed batch 1 of 5
Parsed batch 2 of 5
Parsed batch 3 of 5
Parsed batch 4 of 5
Parsed batch 5 of 5
Error parsing response: Expecting value: line 3 column 14 (char 19)


In [169]:
# Listing-page pipeline: instead of plain text, keep the per-section
# href/title pairs so the model can pick out the job links.
result = scrape_website(url)
body_content = extract_body_content(result)
#cleaned_content = clean_body_content(body_content)
cleaned_content = extract_links_and_titles_from_sections(body_content)
# cleaned_content is a list here, so split_dom_content slices it into
# sub-lists of up to 6000 sections rather than 6000 characters.
dom_chunks = split_dom_content(cleaned_content)

Launching chrome browser...
Page loaded...
Content extracted...
Links and titles extracted from sections...


In [170]:
# Display the extracted section data.
# NOTE(review): the output captured below has 'section_html' and
# 'links_and_titles' keys, which the current
# extract_links_and_titles_from_sections does not produce. It is presumably
# from an earlier version of that function; re-run to refresh.
cleaned_content

[{'section_html': '<section aria-label="Careers at Western Digital" class="site-splash splash" id="st-heroSection"><a class="skipping-link js-skipping visibility--onFocus" href="#st-main">Skip hero section</a><div class="js-lazyloadBgImage" style=""><span class="splash-image js-parallax" id="parallax-background" style=\'background-image: url("https://c.smartrecruiters.com/sr-careersite-image-prod-aws-dc5/606f5a8a307c314d0b94c588/f7523030-6e71-4706-86df-52e7ac48bd20?r=s3-eu-central-1"); transform: translate3d(0px, 0px, 0px);\'></span></div><div class="splash-container alignment--vertical is-search_added is-splash_added"><div class="alignment-content"><div class="hero splash-hero"><div class="hero-wrapper wrapper"><div class="hero-details details"><h2 class="hero-title details-title text--pre-wrap">Careers at Western Digital</h2></div></div></div></div></div></section>',
  'links_and_titles': [{'href': '#st-main',
    'title': 'Technician 2, Test Equipment Maintenance'}]},
 {'section_htm

In [152]:
# Ask the model to turn the section data into JobLinkData(title, link) records.
links = get_links(dom_chunks, 'Extract all job information that includes job titles as title and listed job href as link.')

Parsed batch 1 of 1


In [156]:
# Display the JobLinkData records returned by get_links.
links

[JobLinkData(title='Analyst 4, IT Security', link='https://jobs.smartrecruiters.com/WesternDigital/744000031268330-analyst-4-it-security'),
 JobLinkData(title='Associate General Counsel- 20+ years -Legal', link='https://jobs.smartrecruiters.com/WesternDigital/744000021048891-associate-general-counsel-20-years-legal'),
 JobLinkData(title='Technician 3, Engineering', link='https://jobs.smartrecruiters.com/WesternDigital/744000031304792-technician-3-engineering'),
 JobLinkData(title='Program Manager, Third Party Risk Management & Responsible Sourcing (Procurement)', link='https://jobs.smartrecruiters.com/WesternDigital/744000030509280-program-manager-third-party-risk-management-responsible-sourcing-procurement-')]