## Import required libraries

In [1]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time

## Generating a URL with a function

In [357]:
# Function to generate the URL based on position, location, and page
def generate_url(position, location, page=1):
    """Build the Careerjet search URL for one page of results.

    Args:
        position: Job title or keywords to search for.
        location: Location to search in.
        page: Results page number placed in the ``p`` query parameter
            (defaults to 1).

    Returns:
        The search URL as a string, with every query value URL-encoded.
    """
    # Imported locally so this cell does not depend on an edit to the
    # notebook's import cell.
    from urllib.parse import urlencode

    base_url = "https://www.careerjet.com/jobs"
    # urlencode turns spaces into '+' and percent-escapes reserved characters
    # ('+', '&', '#', '=', non-ASCII, ...), so inputs such as "C++ Developer"
    # or "R&D Engineer" no longer corrupt the query string.
    search_params = urlencode({"s": position, "l": location, "p": page})
    return f"{base_url}?{search_params}"

## Creating the Web Scraping Function

In [389]:
# Function to scrape job postings based on position, location, and page
def _find_chain(node, *lookups):
    """Apply successive ``find`` lookups; return None as soon as one misses.

    Each lookup is a ``(tag_name, css_class)`` pair; pass ``None`` as the
    class to match on the tag name only.
    """
    for tag_name, css_class in lookups:
        if node is None:
            return None
        node = node.find(tag_name, class_=css_class) if css_class else node.find(tag_name)
    return node


def _tag_text(tag):
    """Return the stripped text of ``tag``, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


def _parse_job(job):
    """Extract one CSV row from a listing element; None if any field is missing."""
    title_link = _find_chain(job, ('h2', None), ('a', None))
    href = title_link.get('href') if title_link is not None else None

    values = [
        _tag_text(title_link),                                                # JobTitle
        _tag_text(_find_chain(job, ('p', 'company'), ('a', None))),           # Company
        _tag_text(_find_chain(job, ('ul', 'location'), ('li', None))),        # JobLocation
        _tag_text(_find_chain(job, ('footer', None), ('ul', 'tags'),
                              ('li', None), ('span', None))),                 # PostTime
        datetime.now().strftime("%Y-%m-%d %H:%M"),                            # ExtractDate
        _tag_text(_find_chain(job, ('ul', 'salary'), ('li', None))),          # Salary
        "https://www.careerjet.com" + href if href is not None else None,     # JobURL
    ]
    # Only complete rows are kept: a posting missing any field (including
    # the salary) is dropped.
    return values if None not in values else None


def scrape_jobs(position, location, pages, delay=1, timeout=30):
    """Scrape Careerjet search results and write complete postings to a CSV file.

    The output file is created in the current working directory and named
    ``job_postings_<position>_<location>_<YYYY-MM-DD>.csv`` (spaces replaced
    by underscores). An existing file with that name is overwritten.

    Args:
        position: Job title or keywords to search for.
        location: Location to search in.
        pages: Number of result pages to fetch, starting from page 1.
        delay: Seconds to sleep after each successfully fetched page.
        timeout: Seconds to wait for the server before giving up on a page.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    current_date = datetime.now().strftime("%Y-%m-%d")  # Get current date to timestamp the CSV file
    filename = f"job_postings_{position.replace(' ', '_')}_{location.replace(' ', '_')}_{current_date}.csv"

    # Open the CSV file for writing
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['JobTitle', 'Company', 'JobLocation', 'PostTime', 'ExtractDate', 'Salary', 'JobURL'])  # Write the header row

        for page in range(1, pages + 1):
            url = generate_url(position, location, page)
            print(f'url page{page}: {url}')

            # A timeout keeps a stalled connection from hanging the cell;
            # network errors are reported like bad status codes so the pages
            # already written are not lost.
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Failed to retrieve the page {page}. Error: {exc}")
                continue

            if response.status_code != 200:
                print(f"Failed to retrieve the page {page}. Status code: {response.status_code}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            jobs_list = soup.find("ul", class_="jobs")  # Adjust selector based on actual page structure

            if jobs_list is None:
                # Layout changed or the page has no results: skip it rather
                # than crash on None.find_all.
                print(f"No job listings found on page {page}.")
            else:
                # Skip <li> elements whose class is 'cjgad-outer'. find_all
                # also returns nested <li> elements; those have no <h2>, so
                # _parse_job discards them.
                for job in jobs_list.find_all('li', class_=lambda x: x != 'cjgad-outer'):
                    row = _parse_job(job)
                    if row is not None:
                        writer.writerow(row)

            # Introduce a delay between requests (avoid overloading the server)
            time.sleep(delay)

    print(f"\nJob postings saved to {filename}")

## Testing the Code with JobTitle: `Data Analyst`, Location: `New York`, Maxpages: `3`

In [391]:
# Demo run: search inputs for the scraper
JobTitle = "Data Analyst"   # position / keywords to search for
Location = "New York"       # where to search
Maxpages = 3                # number of result pages to fetch

# Keyword arguments make the mapping to scrape_jobs' parameters explicit.
scrape_jobs(position=JobTitle, location=Location, pages=Maxpages)

url page1: https://www.careerjet.com/jobs?s=Data+Analyst&l=New+York&p=1
url page2: https://www.careerjet.com/jobs?s=Data+Analyst&l=New+York&p=2
url page3: https://www.careerjet.com/jobs?s=Data+Analyst&l=New+York&p=3

Job postings saved to job_postings_Data_Analyst_New_York_2025-02-08.csv
