## Importing libraries

In [None]:
import numpy as np
import pandas as pd 
import warnings
import os
import requests
import re
from bs4 import BeautifulSoup
from multiprocessing import Pool
from datetime import datetime

## Search Parameters

In [None]:
# Locations to search, stored URL-encoded (spaces become %20)
locations = [place.replace(" ", "%20") for place in ("United States",)]

# IT job titles to query, stored URL-encoded (spaces become %20)
it_job = [
    title.replace(" ", "%20")
    for title in (
        "machine learning engineer",
        "software developer",
        "data scientist",
        "network administrator",
        "database administrator",
        "cyber security specialist",
        "cloud solutions architect",
        "web developer",
        "devops engineer",
        "it support specialist",
        "business intelligence analyst",
        "systems analyst",
        "blockchain developer",
    )
]

# Recency filters: the number after "r" equals the window length in seconds
Past_month = f"f_TPR=r{30 * 24 * 60 * 60}"
Past_week = f"f_TPR=r{7 * 24 * 60 * 60}"
Past_24_hours = f"f_TPR=r{24 * 60 * 60}"

# Seniority label -> "f_E" query-string filter
level_mapping = {
    label: f"f_E={code}"
    for label, code in (("entry_level", 2), ("associate", 3), ("mid_senior", 4))
}

# Workplace label -> "f_WT" query-string filter
work_type_mapping = {
    label: f"f_WT={code}"
    for label, code in (("onsite", 1), ("remote", 2), ("hybrid", 3))
}

# HTTP headers sent with requests so they look like they come from a browser
headers = {
    'User-Agent': (
        'Google Chrome/91.0.4472.124 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36'
    ),
    'Accept-Language': 'en-US,en;q=0.9',
}

## Scraping function

In [None]:
# Seconds allowed per HTTP request, so a stalled connection cannot hang the run
_REQUEST_TIMEOUT = 230
# Step between consecutive "&start=" offsets of the search results
_PAGE_SIZE = 25
# Offer count assumed when the page title does not provide one
_FALLBACK_OFFER_COUNT = 230
# Placeholder stored when a posting's description cannot be retrieved
_NO_DESCRIPTION = "No job description available"
# Column order of the saved CSV; also gives empty results a header row
_JOB_COLUMNS = [
    "position", "date", "WorkType", "levelMapping",
    "Title", "Company", "Location", "Link", "job description",
]


def _fetch_soup(url):
    """Download *url* with the shared browser headers and parse it as HTML."""
    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def _parse_offer_count(soup):
    """Return the number found in the first word of the page <title>, or the fallback."""
    title_tag = soup.find('title')
    if title_tag is None:
        return _FALLBACK_OFFER_COUNT
    words = title_tag.get_text().split()
    digits_only = re.sub(r'\D', '', words[0]) if words else ''
    # A title that does not start with a number must not raise on int('')
    return int(digits_only) if digits_only else _FALLBACK_OFFER_COUNT


def _clean_text(tag):
    """Return the tag's text with newlines turned into spaces and outer spaces removed."""
    return tag.text.replace("\n", " ").strip(" ")


def _item_or_none(items, index):
    """Return items[index], or None when the page supplied fewer items than job links."""
    return items[index] if index < len(items) else None


def _fetch_description(link):
    """Return the description text of the posting at *link*, or the placeholder."""
    if not link:
        return _NO_DESCRIPTION
    try:
        detail_page = _fetch_soup(link)
    except requests.RequestException as error:
        # One unreachable posting should not discard the rest of the page
        print(f"Error: {error}")
        print(f"Failed URL: {link}")
        return _NO_DESCRIPTION
    description_div = detail_page.find('div', class_='description__text description__text--rich')
    if description_div is None:
        return _NO_DESCRIPTION
    return description_div.get_text()


def _extract_cards(page):
    """Return one dict per job link on a results page (Title, Company, Location, Link, description)."""
    anchors = page.find_all("a", {"class": "base-card__full-link"})
    titles = [_clean_text(tag) for tag in page.find_all("h3", {"class": "base-search-card__title"})]
    places = [_clean_text(tag) for tag in page.find_all("span", {"class": "job-search-card__location"})]
    companies = [_clean_text(tag) for tag in page.find_all("a", {"class": "hidden-nested-link"})]

    cards = []
    for index, anchor in enumerate(anchors):
        # Drop the tracking suffix so the stored link is the bare posting URL
        link = str(anchor.get("href", "")).split("?refId=")[0]
        cards.append({
            "Title": _item_or_none(titles, index),
            "Company": _item_or_none(companies, index),
            "Location": _item_or_none(places, index),
            "Link": link,
            "job description": _fetch_description(link),
        })
    return cards


def _scrape_listing_pages(url_basic, debug_tag):
    """Walk the paginated results of *url_basic* and return every extracted card."""
    cards = []
    url = url_basic  # always bound, so the error report below can never fail itself
    try:
        nbre_offers = _parse_offer_count(_fetch_soup(url))
        for start in range(0, nbre_offers, _PAGE_SIZE):
            url = f"{url_basic}&start={start}"
            page = _fetch_soup(url)

            # Save the HTML content for debugging purposes
            with open(f"soup_page_{start}_{debug_tag}.txt", "w", encoding="utf-8") as file:
                file.write(page.prettify())

            page_cards = _extract_cards(page)
            if not page_cards:
                # No job links on this page: the results are exhausted
                break
            cards.extend(page_cards)
    except Exception as error:
        # Best effort: report the failure and keep what was collected so far,
        # so the caller can still write a partial CSV for this combination.
        print(f"Error: {error}")
        print(f"Failed URL: {url}")
    return cards


def scrape_jobs(location, position, searching_period=Past_month):
    """
    Scrape job data from LinkedIn for a given location, job title, and searching period.

    One search is run for every combination of work type and job level. Each
    combination is saved to its own CSV file under './data_files' (created if
    missing). A combination with no results still produces a CSV containing
    only the header row, so it can be read back with pandas. The HTML of each
    results page is also written to 'soup_page_*.txt' in the working directory
    for debugging.

    Parameters:
    location (str): The location to search for jobs (URL encoded).
    position (str): The job title to search for (URL encoded).
    searching_period (str): The period to filter jobs by recency (e.g., 'f_TPR=r2592000' for the past month).

    Returns:
    None: The function saves the scraped data to a CSV file.
    """
    os.makedirs('./data_files', exist_ok=True)

    # Iterate over each work type and job level combination
    for work_type_key, work_type_value in work_type_mapping.items():
        for level_key, level_value in level_mapping.items():
            # Build the basic URL for the job search query
            url_basic = (
                f"https://www.linkedin.com/jobs/search/?location={location}"
                f"&keywords={position}&{level_value}&{work_type_value}&{searching_period}"
            )
            cards = _scrape_listing_pages(url_basic, f"{work_type_value}_{level_value}")

            job_data = [
                {
                    "position": position.replace("%20", " "),
                    "date": datetime.today().date().strftime('%Y-%m-%d'),
                    "WorkType": work_type_key,
                    "levelMapping": level_key,
                    **card,
                }
                for card in cards
            ]

            # Explicit columns give an empty result a header row instead of a blank file
            job_df = pd.DataFrame(job_data, columns=_JOB_COLUMNS)
            save_string = f'linkedin-positions-{position.replace("%20", "")}-{work_type_key}-{level_key}'
            job_df.to_csv(f'./data_files/{save_string}.csv', index=False)

## Scraping execution

In [None]:
# Run one scrape per (location, job title) pair, limited to postings from the past month
for location in locations:
    for position in it_job:
        scrape_jobs(
            location=location,
            position=position,
            searching_period=Past_month,
        )

In [None]:
def load_and_concatenate_csvs(directory_path):
    """
    Load all CSV files from the specified directory and concatenate them into a single DataFrame.

    Files are read in sorted name order so the row order of the result is
    reproducible. CSV files that contain no parsable columns (for example a
    blank file) are reported and skipped instead of aborting the whole load.

    Parameters:
    directory_path (str): The path to the directory containing the CSV files.

    Returns:
    pd.DataFrame: A concatenated DataFrame containing data from all CSV files in the directory,
    or an empty DataFrame when no readable CSV file is found.
    """
    # os.listdir returns names in arbitrary order; sort them for a stable result
    csv_names = sorted(name for name in os.listdir(directory_path) if name.endswith('.csv'))

    dataframes = []
    for name in csv_names:
        csv_path = os.path.join(directory_path, name)
        try:
            dataframes.append(pd.read_csv(csv_path))
        except pd.errors.EmptyDataError:
            # Raised by pandas when the file has no header/columns at all
            print(f"Skipping empty CSV file: {csv_path}")

    # pd.concat raises ValueError on an empty list, so handle that case explicitly
    if not dataframes:
        return pd.DataFrame()

    return pd.concat(dataframes, ignore_index=True)

In [None]:
# Merge every per-search CSV into one table and export it as a single file
full_df = load_and_concatenate_csvs(directory_path='./data_files')
full_df.to_csv(path_or_buf='./job_postings.csv')