In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
import time
import numpy as np
import pickle
import os

### `login(driver, p)` Function

Logs into LinkedIn using the provided WebDriver and password.

- Navigates to LinkedIn's login page.
- Fills in the username and password fields.
- Submits the login form.


In [None]:
def login(driver,p):
  url =  "https://www.linkedin.com/checkpoint/rm/sign-in-another-account?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin"
  wait = WebDriverWait(driver, 10)
  driver.get(url)
  
  username = driver.find_element(By.ID, "username")
  username.send_keys("riklscrap1@gmail.com")
  password = driver.find_element(By.ID, "password")
  password.send_keys(p)
  driver.find_element(By.CLASS_NAME,"login__form_action_container").click()

### `search(driver)` Function

Navigates to LinkedIn's job search page for "data science" after a 5-second wait.


In [None]:
def search(driver):
  time.sleep(5)
  driver.get("https://www.linkedin.com/jobs/search/?keywords=data%20science")

### `get_jobs(driver)` Function

Waits for up to 10 seconds to locate and return the job list container element on the LinkedIn jobs page.


In [122]:
def get_jobs(driver):
    job_list_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'scaffold-layout__list-container'))
        )
    return job_list_container

### `load_next_page(driver)` Function

Loads the next page of results on the LinkedIn jobs search by:
- Retrieving the current page number.
- Finding and clicking the button for the next page.


In [200]:
def load_next_page(driver):
  #loads next page for url retrival
  curr= driver.find_element(By.XPATH,'//*[@aria-current="true"]').text
  next = driver.find_element(By.XPATH,f'//*[@aria-label="Page {int(curr)+1}"]')
  next.click()

### `get_n_results(driver)` Function

Fetches the total number of job results from the LinkedIn jobs search page:

- Waits for the results element to appear (up to 2 seconds).
- Extracts and returns the number of job results by parsing the displayed text.
- If an error occurs, prints the error and returns `None`.

In [201]:
def get_n_results(driver):
    # Wait for the element to be visible before trying to interact with it
    try:
        # Wait for the results element to be located (adjust timeout if necessary)
        wait = WebDriverWait(driver, 2)
        results_div = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "jobs-search-results-list__subtitle"))
            # EC.presence_of_element_located((By.XPATH, "/html/body/div/div[2]/div[1]/header/div[1]/small/div/span"))
        )
        
        # Get the number of results
        n_string = results_div.text
        n = int(n_string.split()[0].replace(',', ""))
        return n
    except Exception as e:
        print(f"Error occurred: {e}")
        return None  # Handle the error or return a default value

### `filter_jobs(jobs)` Function

Cleans and filters a list of job dictionaries:

- Cleans job titles by taking the first part of a newline-split string.
- Adds jobs to `cleaned_jobs` only if they have a valid link.
- Returns a filtered dictionary where the job link is the key, and the title and location are the values, excluding jobs where both title and location are `'N/A'`.


In [202]:
def filter_jobs(jobs):
    cleaned_jobs = {}
    
    for job in jobs:
        # Clean the title by splitting on newline and picking the first unique part
        if job['title'] != 'N/A':
            job_title_cleaned = job['title'].split('\n')[0].strip()  # Take the first part of the split title
            job['title'] = job_title_cleaned  # Update the job title in the dictionary
        
        # Only add the job if the link is not 'N/A'
        if job['link'] != 'N/A':
            cleaned_jobs[job['link']] = {
                'title': job['title'],
                'location': job['location']
            }
    
    # Return a dictionary where the key is the link and value is a dictionary with title and location
    filtered_jobs = {link: job for link, job in cleaned_jobs.items() if not (job['title'] == 'N/A' and job['location'] == 'N/A')}

    return filtered_jobs

### `extract_all_job_titles_links_and_locations(driver, job_list_container)` Function

Extracts job titles, links, and locations from the provided `job_list_container`:

- Finds all job cards (`<li>` elements) within the container.
- Loops through each job card to extract:
  - **Job Title** and **Link** from elements with the class `'job-card-list__title'`.
  - **Location** from elements with the class `'job-card-container__metadata-item'`.
- Uses `filter_jobs()` to clean and return a list of jobs with valid titles, links, and locations.
- Handles errors with appropriate fallback values (`'N/A'`) or error messages if necessary.


In [203]:
def extract_all_job_titles_links_and_locations(driver, job_list_container):
    try:
        
    
        # Find all job card <li> elements within the container
        job_cards = job_list_container.find_elements(By.TAG_NAME, 'li')

        # Initialize a list to store job details
        jobs = []

        # Loop through each job card and extract title, link, and location
        for job_card in job_cards:
            # Extract the job title and link by class name
            try:
                job_element = job_card.find_element(By.CLASS_NAME, 'job-card-list__title')
                job_title = job_element.text.strip()  # Extract the text (job title)
                job_link = job_element.get_attribute('href')  # Extract the href attribute (job link)
            except NoSuchElementException as e:
                job_title = 'N/A'
                job_link = 'N/A'

            # Extract the location within the same job card
            try:
                location_element = job_card.find_element(By.CLASS_NAME, 'job-card-container__metadata-item')
                location = location_element.text.strip()  # Extract the location text
            except NoSuchElementException as e:
                location = 'N/A'  # If location is not found, set it to 'N/A'

            # Append a dictionary with job title, link, and location to the list
            jobs.append({
                'title': job_title,
                'link': job_link,
                'location': location
            })

        # Filter out records where all three fields (title, link, and location) are 'N/A'
        filtered_jobs = filter_jobs(jobs)
        return filtered_jobs

    except TimeoutException as e:
        print(f"Timeout waiting for the job list container: {e}")
        return []
    except Exception as e:
        print(f"Error extracting job titles, links, and locations: {e}")
        return []

### `get_details(driver, job_dict)` Function

Scrapes job descriptions, applicants count, and posting time from LinkedIn job listings:

- Iterates over each job link in `job_dict`.
- For each link:
  - Navigates to the job page and checks if the URL matches the job link.
  - Scrapes job description, number of applicants, and time since the job was posted.
  - Updates `job_dict` with the scraped data if successful.
- Tracks successful scrapes in `good` and failed attempts in `fail`.
- Handles errors by printing messages and skipping failed links.


In [204]:
def get_details(driver, job_dict):
    good = []
    fail = []

    # Iterate through the url list to scrape the descriptions
    for link in list(job_dict.keys()):
        if link not in good:
            try:
                driver.get(link)
                time.sleep(5)

                if driver.current_url != link:
                    print(f'failed at {link}')
                    # Remove broken URLs
                    job_dict.pop(link)
                    continue  # Skip to the next iteration

                # Scrape the job description
                job_description_element = driver.find_element(By.ID, 'job-details')
                job_description = job_description_element.text

                # Find the parent div that contains the information
                parent_div = driver.find_element(By.XPATH, "//div[contains(@class, 'job-details-jobs-unified-top-card__primary-description-container')]")

                # Scrape "applicants" within the parent div
                applicants_element = parent_div.find_element(By.XPATH, ".//span[contains(text(), 'applicants')]")
                applicants = applicants_element.text

                # Scrape "time posted" within the parent div
                days_ago_element = parent_div.find_element(By.XPATH, ".//span[contains(text(), 'ago')]")
                days_ago = days_ago_element.text

                # Update the job dictionary with the scraped information
                job_dict[link].update({
                    "description": job_description,
                    "applicants": applicants,
                    "posted_time": days_ago
                })

                good.append(link)
            except Exception as e:
                # Handle the error and keep track of the failed jobs
                print(f"Failed to scrape {link}")
                # print(f"Failed to scrape {link}: {e}")
                fail.append(link)

    return job_dict


### `save(final_file, job_dict)` Function

Saves the job dictionary to a file:

- Takes `final_file` (filename) and `job_dict` (data to save).
- If `job_dict` is `None`, prints an error message and exits.
- Tries to save the dictionary as a `.p` file using `pickle` with the highest protocol.
- Prints success or error message based on the result.

### `load(file_name)` Function

Loads the job dictionary from a file:

- Takes `file_name` (filename to load from).
- If the file does not exist, prints a message and returns an empty dictionary.
- Tries to load the dictionary using `pickle`.
- Prints success or error message and returns the loaded dictionary, or an empty dictionary if an error occurs.


In [205]:
# Save the file in the same folder as the script
def save(final_file, job_dict):
    if job_dict is None:
        print("Error: job_dict is None. Nothing to save.")
        return
    # Saving the file
    try:
        with open(f'{final_file}.p', 'wb') as fp:
            pickle.dump(job_dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"Data successfully saved to {final_file}.p")
    except Exception as e:
        print(f"Error saving file: {e}")

# Load the file from the same folder as the script
def load(file_name):
    if not os.path.exists(file_name):
        print(f"{file_name} does not exist. Starting fresh.")
        return {}
    
    # Loading the file
    try:
        with open(file_name, 'rb') as fp:
            job_dict = pickle.load(fp)
        print(f"Data successfully loaded from {file_name}")
        return job_dict
    except Exception as e:
        print(f"Error loading file: {e}")
        return {}

In [242]:
def scroll_through_jobs(driver):
    """Scroll through each job in the left-side job list using scrollIntoView."""
    
    # Find all the job elements in the left-side container
    jobs_block = driver.find_elements(By.XPATH, "//ul[@class='scaffold-layout__list-container']/li")
    
    # Scroll each job into view
    for index, job in enumerate(jobs_block):
        try:
            driver.execute_script("arguments[0].scrollIntoView({block: 'center', inline: 'nearest'});", job)
            # print(f"Scrolled job {index + 1} into view")
            
            # Wait for a second to visually confirm scroll, or adjust the time as needed
            time.sleep(1)
        except Exception as e:
            print(f"Error scrolling job {index + 1}: {e}")
    
    print("Finished scrolling through all jobs.")

### `main(driver, password)` Function

The main function for automating LinkedIn job search and data extraction:

1. **Login**: Logs into LinkedIn using the provided `driver` and `password`.
2. **Search**: Initiates a job search on LinkedIn.
3. **Get Total Jobs**: Retrieves the total number of job results available and prints it.
4. **Extract Job Details**:
   - Loops through pages (currently only one iteration in this example) to extract job details.
   - For each page:
     - Scrolls through job listings.
     - Waits briefly for the jobs to load.
     - Retrieves the job list container and extracts job titles, links, and locations.
     - Updates the cumulative `all_job_details` dictionary with the jobs extracted from the current page.
5. **Load Next Page**: Advances to the next page to repeat the extraction process.
6. **Return Data**: Returns the dictionary containing all extracted job details.

The function performs login, job search, and data extraction over multiple pages.


In [243]:
def main(driver, password):
    
    # Log in to LinkedIn (function assumed to be defined)
    login(driver, password)  
    # Search on LinkedIn (function assumed to be defined)
    search(driver)  
    total_job=get_n_results(driver)
    print(total_job)
    
    all_job_details = {}

    for i in range(10):  # Loop to extract jobs over multiple pages
        scroll_through_jobs(driver)  # Scroll down to load more jobs
        time.sleep(2)
        job_list_container = get_jobs(driver)  # Get jobs from the page
        # Extract job details and add them to the cumulative list
        job_details = extract_all_job_titles_links_and_locations(driver, job_list_container)
        # Append the extracted job details from this page to the main dict
        all_job_details.update(job_details)
        load_next_page(driver)  # Go to the next page
        
    print(all_job_details)
    return all_job_details
    

### Main Script for Job Scraping

1. **Initialize WebDriver**: Sets up Chrome WebDriver using the specified `chrome_driver_path`.
2. **Read Password**: Reads the LinkedIn password from a text file `pass.txt`.
3. **Scrape Job Data**: Calls the `main` function to scrape job details and stores the results in `job_dict`.
4. **Save Data**: Saves the `job_dict` to a file (`job_dict.p`).
5. **Load Data**: Loads the saved job data from the file for future use.
6. **Driver Quit**: Optionally quits the WebDriver session when done.


In [246]:
# Path to the chromedriver executable
chrome_driver_path = r'C:\Program Files (x86)\Google\chromedriver\chromedriver.exe'

# Set up the Chrome service
service = Service(chrome_driver_path)

# Initialize the Chrome WebDriver
driver = webdriver.Chrome(service=service)

f = open("./pass.txt", "r").read().strip()


# Call the main scraping function to scrape and update job_dict
job_dict = main(driver, f)

fileName = 'job_dict'
# Save the updated job_dict after scraping
save(fileName, job_dict)

# This will load data from 'job_data.p'
loaded_job_dict = load(f"{fileName}.p")  

# Quit the WebDriver session
driver.quit()


936
Finished scrolling through all jobs.
{'https://www.linkedin.com/jobs/view/3991052284/?eBP=CwEAAAGSJ0KjNzLiOA6sivuEUbFnscxZdJdntOlXpV1NtGRSFRc5X9SEKIFsNgGnPVlAPieacCJsDnAZ-ioEYcsSLZEzAZ0Ua11BZXshjRUKv8lv7ubm9H7xU3ymeM-cOrxnE4OaW3imMNe4EHORmaAMLRzMjesEbmWZegPw1vdJobyqQaJoezzDaA9mL6PasFr3Uc0OaDda0f_22-GjFnyRbS6VBjmzOQwwBr6WhkJdyJtf-PZ7HNybVeqL6Z2SVvPaWTda2Swbo5thPNfJU2aCIOP3xRdRyHcuYl6K4qKQgzbT7MpJpJl1W4VwX1bTxmP2m3r_aGaSAgFV93jkaWicLbPB6OqcqZVO3jyh2ZZRcg48_PbkEV-YFFQn8g_UNz6129qJpqhXx72mTH5nCWpDtN4Dk6ZrOqZ_V6K8s46XvrXTXCqwqzVvs8JqfAAqenQso2kMimSaj8U6LdHZPM03Tdv06UgzaQ&refId=PVHcQWOvzS8CNhJ84D%2F08Q%3D%3D&trackingId=pY1oS0AWAHej4bE%2B1W0xvw%3D%3D&trk=flagship3_search_srp_jobs': {'title': 'Data Scientist (Data Scientist 3) - 20196', 'location': 'Doral, FL (On-site)'}, 'https://www.linkedin.com/jobs/view/4025933456/?eBP=CwEAAAGSJ0KjN_dMN7N__SZvkNyw_-ogzg5QgxJl14JfzIlQKe7TTYdoe0RSK82sVw6eX-Eye0lOTdPEK436zqiJ1D7L5hfSx-vPDFpjneIGCOvD5NWf-Ge93P1YRSo9eaEJLml07eTI0ggZqrL_U7JuZKeJOHSV0vjbSVHi0

In [None]:
get_details(driver, loaded_job_dict)

In [249]:
loaded_job_dict

{'https://www.linkedin.com/jobs/view/3991052284/?eBP=CwEAAAGSJ0KjNzLiOA6sivuEUbFnscxZdJdntOlXpV1NtGRSFRc5X9SEKIFsNgGnPVlAPieacCJsDnAZ-ioEYcsSLZEzAZ0Ua11BZXshjRUKv8lv7ubm9H7xU3ymeM-cOrxnE4OaW3imMNe4EHORmaAMLRzMjesEbmWZegPw1vdJobyqQaJoezzDaA9mL6PasFr3Uc0OaDda0f_22-GjFnyRbS6VBjmzOQwwBr6WhkJdyJtf-PZ7HNybVeqL6Z2SVvPaWTda2Swbo5thPNfJU2aCIOP3xRdRyHcuYl6K4qKQgzbT7MpJpJl1W4VwX1bTxmP2m3r_aGaSAgFV93jkaWicLbPB6OqcqZVO3jyh2ZZRcg48_PbkEV-YFFQn8g_UNz6129qJpqhXx72mTH5nCWpDtN4Dk6ZrOqZ_V6K8s46XvrXTXCqwqzVvs8JqfAAqenQso2kMimSaj8U6LdHZPM03Tdv06UgzaQ&refId=PVHcQWOvzS8CNhJ84D%2F08Q%3D%3D&trackingId=pY1oS0AWAHej4bE%2B1W0xvw%3D%3D&trk=flagship3_search_srp_jobs': {'title': 'Data Scientist (Data Scientist 3) - 20196',
  'location': 'Doral, FL (On-site)',
  'description': 'About the job\nRequisition Number: 20196\n\nRequired Travel: 0 - 10%\n\nEmployment Type: Full Time/Salaried/Exempt\n\nAnticipated Salary Range: - $110,000.00\n\nSecurity Clearance: TS/SCI\n\nLevel of Experience: Mid HI\n\nThis opportunity resi

In [251]:
fileName = 'job_dict_full'
# Save the updated job_dict after scraping
save(fileName, job_dict)

# This will load data from 'job_data.p'
job_dict_full = load(f"{fileName}.p")  

Data successfully saved to job_dict_full.p
Data successfully loaded from job_dict_full.p


In [252]:
job_dict_full

{'https://www.linkedin.com/jobs/view/3991052284/?eBP=CwEAAAGSJ0KjNzLiOA6sivuEUbFnscxZdJdntOlXpV1NtGRSFRc5X9SEKIFsNgGnPVlAPieacCJsDnAZ-ioEYcsSLZEzAZ0Ua11BZXshjRUKv8lv7ubm9H7xU3ymeM-cOrxnE4OaW3imMNe4EHORmaAMLRzMjesEbmWZegPw1vdJobyqQaJoezzDaA9mL6PasFr3Uc0OaDda0f_22-GjFnyRbS6VBjmzOQwwBr6WhkJdyJtf-PZ7HNybVeqL6Z2SVvPaWTda2Swbo5thPNfJU2aCIOP3xRdRyHcuYl6K4qKQgzbT7MpJpJl1W4VwX1bTxmP2m3r_aGaSAgFV93jkaWicLbPB6OqcqZVO3jyh2ZZRcg48_PbkEV-YFFQn8g_UNz6129qJpqhXx72mTH5nCWpDtN4Dk6ZrOqZ_V6K8s46XvrXTXCqwqzVvs8JqfAAqenQso2kMimSaj8U6LdHZPM03Tdv06UgzaQ&refId=PVHcQWOvzS8CNhJ84D%2F08Q%3D%3D&trackingId=pY1oS0AWAHej4bE%2B1W0xvw%3D%3D&trk=flagship3_search_srp_jobs': {'title': 'Data Scientist (Data Scientist 3) - 20196',
  'location': 'Doral, FL (On-site)'},
 'https://www.linkedin.com/jobs/view/4025933456/?eBP=CwEAAAGSJ0KjN_dMN7N__SZvkNyw_-ogzg5QgxJl14JfzIlQKe7TTYdoe0RSK82sVw6eX-Eye0lOTdPEK436zqiJ1D7L5hfSx-vPDFpjneIGCOvD5NWf-Ge93P1YRSo9eaEJLml07eTI0ggZqrL_U7JuZKeJOHSV0vjbSVHi0dmvT9dyXTenV0Yicew4WMTj6GPVpYtyFTylK3M