In [1]:
# !/usr/bin/env python3
# a vibe coded web crawler by Claude

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import re
from typing import List, Dict

class JobCrawler:
    """Keyword-driven crawler for job boards.

    For each board URL the crawler fetches the page, checks its visible text
    for the configured keywords, then follows up to ``max_pages`` links whose
    URL looks like a job posting and checks those pages too. Every matching
    page is stored once in ``self.results`` as a dict with the keys
    ``'url'``, ``'keywords_found'`` and ``'title'``.
    """

    # Substrings that mark a URL as a likely job posting.
    JOB_INDICATORS = ('job', 'position', 'career', 'opening', 'posting')

    def __init__(self, keywords: List[str], max_pages: int = 5):
        """
        Initialize the job crawler.

        Args:
            keywords: Keywords to search for in job postings. They are
                lower-cased, whitespace-normalized and de-duplicated (first
                occurrence wins); empty entries are dropped. A single string
                is treated as one keyword.
            max_pages: Maximum number of job links to follow per URL
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = [keywords]

        # An empty keyword would match every page and a padded one would never
        # match the whitespace-collapsed page text, so normalize up front.
        normalized = (' '.join(kw.lower().split()) for kw in keywords)
        self.keywords = list(dict.fromkeys(kw for kw in normalized if kw))
        self.max_pages = max_pages
        self.results = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def fetch_page(self, url: str) -> str:
        """Fetch HTML content from a URL.

        Returns:
            The response body as text, or an empty string when the request
            fails or the server answers with an error status (the error is
            printed, not raised).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return ""

    def extract_text(self, html: str) -> str:
        """Extract readable text from HTML.

        Returns:
            The page text, lower-cased, with blank fragments removed and the
            remaining fragments joined by single spaces.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Drop non-content elements: scripts, styles and page chrome
        # (navigation, header, footer).
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        # Also break each line on double spaces so padded fragments are
        # stripped individually before re-joining.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)

        return text.lower()

    @staticmethod
    def _keyword_pattern(keyword: str):
        """Build the compiled whole-word, case-insensitive regex for a keyword."""
        # Lookarounds are used instead of \b so keywords that start or end
        # with punctuation ("$130k", "c++") still match. Words of a phrase may
        # be separated by any whitespace, and a trailing plural "s" is
        # accepted ("product managers" matches "product manager").
        body = r'\s+'.join(re.escape(part) for part in keyword.split())
        return re.compile(r'(?<!\w)' + body + r's?(?!\w)', re.IGNORECASE)

    def contains_keywords(self, text: str) -> List[str]:
        """Return the configured keywords that occur in ``text``.

        Keywords match as whole words or phrases, case-insensitively, so
        "agile" does not match "fragile" and "it" does not match "with".

        Returns:
            The matching keywords in the order of ``self.keywords``; an empty
            list when nothing matches.
        """
        return [
            keyword
            for keyword in self.keywords
            # Guard against blank entries assigned to self.keywords directly.
            if keyword.strip() and self._keyword_pattern(keyword).search(text)
        ]

    def extract_job_links(self, html: str, base_url: str) -> List[str]:
        """Extract potential job posting links from a page.

        Returns:
            Up to ``max_pages`` absolute http(s) URLs, de-duplicated and in
            the order they appear in the document.
        """
        soup = BeautifulSoup(html, 'html.parser')
        hrefs = (anchor['href'] for anchor in soup.find_all('a', href=True))
        return self._select_job_links(hrefs, base_url)

    def _select_job_links(self, hrefs, base_url: str) -> List[str]:
        """Resolve, filter and cap raw href values into job-posting URLs."""
        limit = max(self.max_pages, 0)
        page_url = urlparse(base_url)._replace(fragment='').geturl()

        # A dict keeps insertion order, so the result is deterministic and
        # follows document order (slicing a set picked arbitrary links).
        selected = {}
        for href in hrefs:
            if len(selected) >= limit:
                break
            try:
                parts = urlparse(urljoin(base_url, href.strip()))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip it.
                continue

            # mailto:, javascript:, tel: ... cannot be fetched as pages.
            if parts.scheme not in ('http', 'https'):
                continue

            # "#section" variants point at the same document.
            candidate = parts._replace(fragment='').geturl()
            if candidate == page_url:
                # The board page itself has already been checked.
                continue

            lowered = candidate.lower()
            if any(indicator in lowered for indicator in self.JOB_INDICATORS):
                selected[candidate] = None

        return list(selected)

    def _record_match(self, url: str, keywords_found: List[str], html: str) -> None:
        """Store a matching page in ``self.results`` unless its URL is already there."""
        if any(entry['url'] == url for entry in self.results):
            return
        self.results.append({
            'url': url,
            'keywords_found': keywords_found,
            'title': self.extract_title(html)
        })

    def crawl_url(self, url: str) -> None:
        """Crawl a single job board URL.

        Checks the page at ``url`` for keywords, then fetches and checks each
        job link found on it. Matches are recorded in ``self.results`` and
        progress is printed. Returns early if the page cannot be fetched.
        """
        print(f"\n{'='*60}")
        print(f"Crawling: {url}")
        print(f"{'='*60}")

        # Fetch main page
        html = self.fetch_page(url)
        if not html:
            return

        # Check main page for keywords
        text = self.extract_text(html)
        found_keywords = self.contains_keywords(text)

        if found_keywords:
            self._record_match(url, found_keywords, html)
            print(f"✓ Found keywords on main page: {', '.join(found_keywords)}")

        # Extract and crawl job links
        job_links = self.extract_job_links(html, url)
        print(f"Found {len(job_links)} potential job postings to check...")

        for i, job_url in enumerate(job_links, 1):
            print(f"  Checking ({i}/{len(job_links)}): {job_url[:60]}...")

            job_html = self.fetch_page(job_url)
            if job_html:
                job_text = self.extract_text(job_html)
                found = self.contains_keywords(job_text)

                if found:
                    self._record_match(job_url, found, job_html)
                    print(f"    ✓ Match! Keywords: {', '.join(found)}")

            time.sleep(1)  # Be polite, don't hammer the server

    def extract_title(self, html: str) -> str:
        """Extract page title from HTML.

        Returns:
            The stripped text of the first ``<title>`` element, or
            "No title" when the element is missing or blank.
        """
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else ""
        return title or "No title"

    def crawl(self, urls: List[str]) -> None:
        """Crawl multiple job board URLs, then print the collected results."""
        print(f"\nStarting crawl for keywords: {', '.join(self.keywords)}")

        for index, url in enumerate(urls):
            if index:
                time.sleep(2)  # Delay between different sites
            self.crawl_url(url)

        self.print_results()

    @staticmethod
    def _format_result(index: int, result: Dict) -> str:
        """Render one result entry as three newline-terminated lines."""
        return (
            f"{index}. {result['title']}\n"
            f"   URL: {result['url']}\n"
            f"   Keywords: {', '.join(result['keywords_found'])}\n"
        )

    def print_results(self) -> None:
        """Print the crawl results."""
        print(f"\n{'='*60}")
        print(f"CRAWL COMPLETE - Found {len(self.results)} matching pages")
        print(f"{'='*60}\n")

        if not self.results:
            print("No matches found for the specified keywords.")
            return

        for i, result in enumerate(self.results, 1):
            # print() appends the blank line that separates entries.
            print(self._format_result(i, result))

    def save_results(self, filename: str = "job_results.txt") -> None:
        """Save results to a text file.

        Args:
            filename: Path of the UTF-8 text file to (over)write.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("Job Crawler Results\n")
            f.write(f"Keywords: {', '.join(self.keywords)}\n")
            f.write(f"Total matches: {len(self.results)}\n")
            f.write("="*60 + "\n\n")

            for i, result in enumerate(self.results, 1):
                f.write(self._format_result(i, result) + "\n")

        print(f"Results saved to {filename}")




In [None]:
# Job boards to crawl. Swap in a different list as needed.
# NOTE: flexjobs gives errors unless logged in.
top_scores_list = [
    "https://weworkremotely.com",
    "https://www.usebraintrust.com",
    "https://nodesk.co",
    "https://remoteok.com",
    "https://www.crossover.com/jobs",
    "https://remotive.com",
    "https://www.workingnomads.com",
    "https://www.flexjobs.com",
    "https://jobspresso.co",
    "https://builtin.com",
    "https://powertofly.com",
    "https://remotewoman.com",
]



In [3]:
# Keyword lists to search for during a crawl, grouped by theme.

# Core role titles and delivery methodologies
role_keywords = [
    "program manager",
    "project manager",
    "product manager",
    "technical program manager",
    "technical project manager",
    "senior program manager",
    "senior product manager",
    "delivery manager",
    "engineering program manager",
    "r&d program manager",
    "implementation manager",
    "pmo",
    "agile",
    "scrum",
    "kanban",
]

# Tech industry terms
tech_keywords = [
    "software",
    "ai",
    "machine learning",
    "data science",
    "cloud",
    "saas",
    "devops",
    "full stack",
    "backend",
    "frontend",
    "cybersecurity",
    "infrastructure",
    "platform",
    "systems",
    "digital transformation",
    "innovation",
    "startup",
    "venture",
    "technology",
    "it",
    "analytics",
    "product development",
]

# Remote / hybrid working arrangements
remote_keywords = [
    "remote",
    "hybrid",
    "distributed",
    "work from anywhere",
    "global",
    "flexible",
    "telecommute",
    "digital nomad",
    "work from home",
    "async",
    "worldwide",
]

# Compensation and seniority signals
salary_keywords = [
    "senior",
    "director",
    "lead",
    "principal",
    "head",
    "executive",
    "$130k",
    "$150k",
    "six-figure",
    "high compensation",
    "experienced",
    "strategy",
    "enterprise",
    "scale",
    "leadership",
]


In [None]:
# Example usage: crawl the selected job boards and save the matches.
if __name__ == "__main__":  # runs only when __name__ is "__main__"
    # Job board URLs to crawl; keep the variable name "urls".
    urls = top_scores_list

    # Keywords to search for. Alternatives:
    #   keywords = ["python", "django", "machine learning", "remote"]
    #   keywords = role_keywords + tech_keywords + remote_keywords + salary_keywords
    keywords = role_keywords

    # Build the crawler and run it over every board.
    crawler = JobCrawler(max_pages=10, keywords=keywords)
    crawler.crawl(urls)

    # Optionally write the results to a text file as well.
    crawler.save_results("job_results.txt")


Starting crawl for keywords: program manager, project manager, product manager, technical program manager, technical project manager, senior program manager, senior product manager, delivery manager, engineering program manager, r&d program manager, implementation manager, pmo, agile, scrum, kanban

Crawling: https://weworkremotely.com
✓ Found keywords on main page: project manager, product manager, agile, scrum
Found 10 potential job postings to check...
  Checking (1/10): https://weworkremotely.com/remote-jobs/asc-engineered-soluti...
Error fetching https://weworkremotely.com/remote-jobs/asc-engineered-solutions-crm-administrator: 403 Client Error: Forbidden for url: https://weworkremotely.com/remote-jobs/asc-engineered-solutions-crm-administrator
  Checking (2/10): https://weworkremotely.com/remote-jobs/whitebridge-ltd-junio...
Error fetching https://weworkremotely.com/remote-jobs/whitebridge-ltd-junior-crypto-analyst-trader-remote-training-included: 403 Client Error: Forbidden for