FINAL PROJECT

Web Scraping- Job listings analysis python project : 
Write a python code to collect job listings and data in IT job markets from a website, 
such as jobs, salary estimates and location preferences. 

In [14]:
#import modules
import requests
import bs4
import os
import time
import re
from datetime import datetime

In [4]:
site = requests.get("https://www.jobbank.gc.ca/jobsearch/jobsearch?searchstring=software+developer&sort=M")

In [5]:
print(site.status_code)

200


In [6]:
print(site.raise_for_status())   #returns a None instead of an error if everything is okay
print(len(site.text))

None
253514


In [7]:
print(site.text[:1000])

<!DOCTYPE html>

<html class="no-js" lang="en" dir="ltr"><head id="j_id_5">
		<meta charset="utf-8" />
		
		<title>software developer  in various locations - Search - Job Bank
			      
		</title>
		<meta content="width=device-width,initial-scale=1" name="viewport" />
		
		<meta name="dcterms.language" content="eng" />
		<meta name="dcterms.creator" content="Employment and Social Development Canada" />
		<meta name="dcterms.service" content="ESDC-EDSC_JobBank-GuichetEmplois" />
		<meta name="dcterms.accessRights" content="2" /><meta name="dcterms.title" content="Search results - Search - Job Bank" />
			<meta property="og:title" content="software developer  in various locations - Search" />
			<meta property="og:description" name="description" content="View 287 job postings near for software developer; on Job Bank, Canada’s one-stop job board." />
			<meta property="og:url" content="/jobsearch/jobsearch?term=software+developer&amp;page=1&amp;sort=M&amp;fn21=21232" />
			<meta property=

In [8]:
print(type(site))

<class 'requests.models.Response'>


In [9]:
# Saving File
File = open('site.txt', 'wb')
for chunk in site.iter_content(100000):
    File.write(chunk)

In [10]:
File.close()

In [11]:
# Open the HTML file for reading
exFile= open('site.txt', 'r', encoding='utf-8')
# Parse the file content with BeautifulSoup
noSoup = bs4.BeautifulSoup(exFile.read(), 'html.parser')

# Find all job cards on the page
job_cards = noSoup.find_all(class_=re.compile('.*results*'))

In [12]:
# Lists to store our data
jobs = []

# Loop through each job card
for card in job_cards:
    # Get job details with error handling
    title_tag = card.find('span', class_="noctitle")
    title = title_tag.text.strip() if title_tag else "No job title available"

    company_tag = card.find('li', class_='business')
    company = company_tag.text.strip() if company_tag else "No company name listed"

    location_tag = card.find('li', class_='location')
    location = location_tag.text.strip() if location_tag else "No location available"

    salary_tag = card.find('li', class_='salary')
    salary = salary_tag.text.strip() if salary_tag else "No salary listed"

    date_tag = card.find('li', class_='date')
    date = date_tag.text.strip() if date_tag else "No publication date available"

    # Ignore incomplete job postings (those without a title or company)
    if title != "No job title available" and company != "No company name listed":
        # Store the job details in a dictionary and append to the list
        job = {
            "title": title,
            "company": company,
            "location": location,
            "salary": salary,
            "date": date
        }
        jobs.append(job)

# Main display starts here
print("\n=== IT Job Search Tool ===")
print("This tool searches jobbank.gc.ca for IT job postings in Canada.\n")

# Get user input once for the search term
search_term = input("Are you looking for Software-related roles (Yes or No): ").strip()

# If the user says Yes, print all job details in the list
if search_term.lower() == "yes":
    print("\nSearching the website..................................")
    print()
    time.sleep(10) # Add a 10 seconds delay
    # Display job information
    if jobs: 
        for job in jobs:
            print(f"\nTitle: {job['title']}")
            print(f"Company: {job['company']}")
            print(f"{job['location']}")
            print(f"{job['salary']}")
            print(f"Publication date: {job['date']}")
    else:
        print("No relevant jobs found.")
else:
    print("Have a great day!")



=== IT Job Search Tool ===
This tool searches jobbank.gc.ca for IT job postings in Canada.



Are you looking for Software-related roles (Yes or No):  yes



Searching the website..................................


Title: software developer
Company: APPEXOS SOFTWARE INC
Location
					
										   		Regina (SK)
Salary:
						$43.27 hourly
Publication date: March 26, 2025

Title: software developer
Company: APPEXOS SOFTWARE INC
Location
					
										   		Regina (SK)
Salary:
						$43.27 hourly
Publication date: March 26, 2025

Title: software developer
Company: APPEXOS SOFTWARE INC
Location
					
										   		Regina (SK)
Salary:
						$43.27 hourly
Publication date: March 26, 2025

Title: software developer
Company: APPEXOS SOFTWARE INC
Location
					
										   		Regina (SK)
Salary:
						$43.27 hourly
Publication date: March 26, 2025

Title: software developer
Company: APPEXOS SOFTWARE INC
Location
					
										   		Regina (SK)
Salary:
						$43.27 hourly
Publication date: March 26, 2025

Title: software developer
Company: Pearl Technologies Ltd.
Location
					
										   		Toronto (ON)
Salary:
						$48.00 hourly
Publication

In [26]:
def analyze_job_trends(x):
    # Variables to store data
    max_salary = None
    min_salary = None
    location_count = {}

    # Variables for date comparison
    most_recent_job = None
    oldest_job = None
    most_recent_date = None
    oldest_date = None

    def parse_salary(salary_str):
        # Handle hourly page
        if 'hourly' in salary_str.lower():
            numbers = [float(s.replace('$', '').replace(',', '')) 
                      for s in salary_str.split() if s.startswith('$')]
            if numbers:
                # Convert hourly to annual (assuming 40 hrs/week * 52 weeks)
                return max(n * 40 * 52 for n in numbers)
        
        # Handle annual salaries
        elif 'annually' in salary_str.lower():
            numbers = [float(s.replace('$', '').replace(',', '')) 
                      for s in salary_str.split() if s.startswith('$')]
            return max(numbers) if numbers else None
        
        # To handle biweekly pay
        elif 'biweekly' in salary_str.lower():
            numbers = [float(s.replace('$', '').replace(',', '')) 
                      for s in salary_str.split() if s.startswith('$')]
            if numbers:
                # Convert biweekly to annual (26 pay periods)
                return max(n * 26 for n in numbers)
        
        # To handle simple cases (just a dollar amount)
        else:
            try:
                return float(salary_str.replace('$', '').replace(',', ''))
            except:
                return None
        
        return None

    # Loop through the jobs to analyze the trends
    for job in x:
        # Get and parse the salary
        salary_str = job.get('salary', '').strip()
        salary = parse_salary(salary_str)

        # Update the max salary
        if salary is not None:
            if max_salary is None or salary > max_salary:
                max_salary = salary
            if min_salary is None or salary < min_salary:
                min_salary = salary

        # Track the most common locations
        location = job.get('location', 'Unknown')
        location_count[location] = location_count.get(location, 0) + 1

        # Process date
        if 'date' in job:
            try:
                job_date = datetime.strptime(job['date'], '%B %d, %Y')
                
                if most_recent_date is None or job_date > most_recent_date:
                    most_recent_date = job_date
                    most_recent_job = job
                
                if oldest_date is None or job_date < oldest_date:
                    oldest_date = job_date
                    oldest_job = job
            except (ValueError, TypeError):
                pass

    # Find the most and least common locations
    most_common_location = max(location_count, key=location_count.get)
    least_common_location = min(location_count, key=location_count.get)

    # Total number of jobs
    total_jobs = len(x)

    # Display the trend analysis results
    print("\n=== Job Market Trend Analysis ===")
    print()
    user_input = input("Would you like to perform analysis on the data (Yes or No)?: ")

    if user_input.lower() == "yes":
        print("\nAnalyzing the data...............................")
        time.sleep(10) #delay for 10 seconds
        print(f"\n1. The highest salary offered: ${max_salary:,.2f}" if max_salary is not None else "\n1. No valid salary data found")
        print(f"2. The lowest salary offered: ${min_salary:,.2f}" if min_salary is not None else "2. No valid salary data found")
        print(f"3. The location with the most amount of jobs: {most_common_location}({location_count[most_common_location]} jobs).")
        print(f"4. The location with the least amount of jobs: {least_common_location} ({location_count[least_common_location]} job).")
        print(f"5. The total number of software-related jobs on the website: {total_jobs}")

        # Display date information
        if most_recent_job:
            print(f"6. The most recently published job is:")
            print(f"   Publication Date: {most_recent_job['date']}")
        else:
            print("6. No dated jobs found for most recent publication")
            
        if oldest_job:
            print(f"7. The oldest published job is:")
            print(f"   Publication Date: {oldest_job['date']}")
        else:
            print("7. No dated jobs found for oldest publication")

# Call the function
analyze_job_trends(jobs)


=== Job Market Trend Analysis ===



Would you like to perform an analysis on the data (Yes or No)?:  yes



Analyzing the data...............................

1. The highest salary offered: $187,200.00
2. The lowest salary offered: $55,000.00
3. The location with the most amount of jobs: Location
					
										   		Regina (SK)(5 jobs).
4. The location with the least amount of jobs: Location
					
										   		Brampton (ON) (1 job).
5. The total number of software-related jobs on the website: 29
6. The most recently published job is:
   Publication Date: April 06, 2025
7. The oldest published job is:
   Publication Date: January 13, 2025


In [30]:
def analyze_salary_trends(jobs):
    # Initialize variables
    salary_data = []  # Will store tuples of (company, salary)
    location_count = {}
    most_recent_job = None
    oldest_job = None
    most_recent_date = None
    oldest_date = None

    def parse_salary(salary_str):
        # Handle hourly wages
        if 'hourly' in salary_str.lower():
            numbers = [float(s.replace('$', '').replace(',', '')) 
                     for s in salary_str.split() if s.startswith('$')]
            if numbers:
                return max(n * 40 * 52 for n in numbers)  # Convert to annual
        
        # Handle annual salaries
        elif 'annually' in salary_str.lower():
            numbers = [float(s.replace('$', '').replace(',', '')) 
                     for s in salary_str.split() if s.startswith('$')]
            return max(numbers) if numbers else None
        
        # Handle biweekly payments
        elif 'biweekly' in salary_str.lower():
            numbers = [float(s.replace('$', '').replace(',', '')) 
                     for s in salary_str.split() if s.startswith('$')]
            if numbers:
                return max(n * 26 for n in numbers)  # Convert to annual
        
        # Simple dollar amount
        else:
            try:
                return float(salary_str.replace('$', '').replace(',', ''))
            except:
                return None
        
        return None

    # Process all jobs
    for job in jobs:
        # Parse salary
        salary = parse_salary(job.get('salary', ''))
        company = job.get('company', 'Unknown')
        
        if salary is not None:
            salary_data.append((company, salary))
        
        # Track locations
        location = job.get('location', 'Unknown')
        location_count[location] = location_count.get(location, 0) + 1
        
        # Track dates
        if 'date' in job:
            try:
                job_date = datetime.strptime(job['date'], '%B %d, %Y')
                if most_recent_date is None or job_date > most_recent_date:
                    most_recent_date = job_date
                    most_recent_job = job
                if oldest_date is None or job_date < oldest_date:
                    oldest_date = job_date
                    oldest_job = job
            except (ValueError, TypeError):
                pass

    # Find highest and lowest paying companies
    if salary_data:
        highest_paying = max(salary_data, key=lambda x: x[1])
        lowest_paying = min(salary_data, key=lambda x: x[1])
    else:
        highest_paying = lowest_paying = (None, None)

    # Main analysis function
    def show_analysis():
        print("\n=== Job Market Trend Analysis ===")
        print("\n1. Salary Analysis:")
        if highest_paying[0]:
            print(f"   - Highest paying company: {highest_paying[0]} (${highest_paying[1]:,.2f})")
            print(f"   - Lowest paying company: {lowest_paying[0]} (${lowest_paying[1]:,.2f})")
        else:
            print("   No valid salary data available")
        
        # Location analysis
        if location_count:
            most_common = max(location_count.items(), key=lambda x: x[1])
            least_common = min(location_count.items(), key=lambda x: x[1])
            print("\n2. Location Analysis:")
            print(f"   - Most jobs in: {most_common[0]} ({most_common[1]} jobs)")
            print(f"   - Least jobs in: {least_common[0]} ({least_common[1]} job)")
        
        # Date analysis
        print("\n3. Job Posting Dates:")
        if most_recent_job:
            print(f"   - Most recent: {most_recent_job['date']}")
        if oldest_job:
            print(f"   - Oldest: {oldest_job['date']}")

    # User interaction
    while True:
        print("\nOptions:")
        print("1. Show full analysis")
        print("2. Show highest paying companies")
        print("3. Show lowest paying companies")
        print("4. Exit")
        
        choice = input("Enter your choice (1-4): ")  # Giving the user a choice
        print("Analyzing..........................................................")
        time.sleep(10) # 10 seconds delay added
        
        if choice == '1':
            show_analysis()
        elif choice == '2':
            if highest_paying[0]:
                print("\nHighest Paying Companies:")
                
                # Show all companies paying within 5% of the highest salary
                threshold = highest_paying[1] * 0.95
                top_companies = [(c, s) for c, s in salary_data if s >= threshold]
                for company, salary in sorted(top_companies, key=lambda x: -x[1]):
                    print(f"- {company}: ${salary:,.2f}")
            else:
                print("No salary data available")
        elif choice == '3':
            if lowest_paying[0]:
                print("\nLowest Paying Companies:")
                
                # Show all companies paying within 5% of the lowest salary
                threshold = lowest_paying[1] * 1.05
                bottom_companies = [(c, s) for c, s in salary_data if s <= threshold]
                for company, salary in sorted(bottom_companies, key=lambda x: x[1]):
                    print(f"- {company}: ${salary:,.2f}")
            else:
                print("No salary data available")
        elif choice == '4':
            break
        else:
            print("Invalid choice. Please try again.")

analyze_salary_trends(jobs)


Options:
Analyzing..........................................................
1. Show full analysis
2. Show highest paying companies
3. Show lowest paying companies
4. Exit


Enter your choice (1-4):  4


In [27]:
def save_report(jobs):
    # Save the collected job data to a created text file and folder
    folder = "job_reports"
    os.makedirs(folder, exist_ok=True)
    file_name = f"{folder}/Job_Report.txt"

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write("Job Report\n")

        for job in jobs:
            file.write(f"Title: {job['title']}\n")
            file.write(f"Company: {job['company']}\n")
            file.write(f"Location: {job['location']}\n")
            file.write(f"Salary: {job['salary']}\n")
            file.write(f"Publication Date: {job['date']}\n")
            file.write("\n")
            

    print(f"Report saved to {file_name}")
save_report(jobs)

Report saved to job_reports/Job_Report.txt
