Web Scrapping - Job Descriptions (Data Scientist)
URL Source: LINKEDIN Public

PLP Project 
ResGen Team 

In [None]:
pip install requests beautifulsoup4 pandas
pip install re
pip install html

In [1]:
import requests  # Make sure to import requests, to fetch the job listings from the LinkedIn page
from bs4 import BeautifulSoup # to parse the HTML and extract the job details
import pandas as pd
import json
import re # For data cleaning 
import html
import time

In [2]:
# Define the URL
# Here is a LINKEDIN Public JD with 975 Records (Above 975 records is getting webpage error, suspect restriction implemented from LinkedIn)
# Define the base URL
base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=Singapore&geoId=102454443&trk=public_jobs_jobs-search-bar_search-submit&currentJobId=3744333339&position=6&pageNum=0&start="


In [4]:
# Import the display function
from IPython.display import display

In [10]:
# Initialize empty lists to store job data
job_titles = []
company_names = []
preferred_qualifications = []
job_links = []

# Set the range for the 'start' parameter
start_range = range(50, 975, 50)  # Adjust the end value as needed. Max 975 for LinkedIn

# Loop through the range of 'start' values
for start_value in start_range:
    # Construct the URL with the current 'start' value
    url = base_url + str(start_value)

    # Send a GET request to the URL
    response = requests.get(url)


    # if the request was successful, to run this code
    if response.status_code == 200:

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Initialize empty lists to store job titles and company names
        #job_titles = []
        #company_names = []
        #preferred_qualifications = []
        #job_links = []

        # Find all <li> elements
        job_listings = soup.find_all("li")

        # Extract job titles and company names from each <li>
        for job_listing in job_listings:
            # Extract job title from <h3>
            job_title_element = job_listing.find("h3", class_="base-search-card__title")
            if job_title_element:
                job_title = job_title_element.text.strip()
            else:
                job_title = "N/A"

            # Extract company name from <h4>
            company_name_element = job_listing.find("h4", class_="base-search-card__subtitle")
            if company_name_element:
                company_link = company_name_element.find("a")
                if company_link:
                    company_name = company_link.text.strip()
                else:
                    company_name = "N/A"
            else:
                company_name = "N/A"

            # Extract the link to the individual job posting
            job_link = job_listing.find("a", class_="base-card__full-link")
            if job_link:
                job_url = job_link["href"]
            else:
                job_url = "N/A"

            # Visit the individual job posting's URL if it's not "N/A"
            if job_url != "N/A":
                job_response = requests.get(job_url)
                if job_response.status_code == 200:
                    job_soup = BeautifulSoup(job_response.text, "html.parser")
                    # Extract the preferred qualifications from the JSON data
                    script_element = job_soup.find("script", type="application/ld+json")
                    if script_element:
                        script_content = script_element.text
                        script_data = json.loads(script_content)
                        preferred_qualification = script_data.get("description", "N/A")
                    else:
                        preferred_qualification = "N/A"
                else:
                    preferred_qualification = "N/A"
            else:
                preferred_qualification = "N/A"

            # Append job title, company name, preferred qualifications, and job link to respective lists
            job_titles.append(job_title)
            company_names.append(company_name)
            preferred_qualifications.append(preferred_qualification)
            job_links.append(job_url)

        # Introduce a delay before the next request
        #time.sleep(request_delay)

        # Create a DataFrame from the extracted data
        job_data = pd.DataFrame({
            "Job Title": job_titles,
            "Company Name": company_names,
            "Preferred Qualifications": preferred_qualifications,
            "Link to Job": job_links
        })

        # Print the DataFrame
        print(job_data)

        # You can also save the data to a CSV file if needed
        job_data.to_csv("job_info.csv", index=False)
    else:
        print("Failed to retrieve the page. Status code:", response.status_code)

# Display the DataFrame
display(job_data)

                                            Job Title  \
0                                           Scientist   
1        Growth Data Scientist (Statistical modeling)   
2                               Junior Data Scientist   
3                      Data Scientist, Trust & Safety   
4                Data Scientist (Analysis & Insights)   
5                           Machine Learning Engineer   
6              Senior Data Scientist / Data Scientist   
7   Senior Artificial Intelligence/Machine Learnin...   
8                           Machine Learning Engineer   
9                           Machine Learning Engineer   
10         Senior /Machine Learning Engineer - DSC/EZ   
11                          Machine Learning Engineer   
12                   Data Scientist - Fraud Analytics   
13                            Data Scientist [Search]   
14                            WeChat - Data Scientist   
15                              Senior Data Scientist   
16                             

Unnamed: 0,Job Title,Company Name,Preferred Qualifications,Link to Job
0,Scientist,Horizon Quantum Computing,We are seeking an early-career scientist to he...,https://sg.linkedin.com/jobs/view/scientist-at...
1,Growth Data Scientist (Statistical modeling),Crypto.com,We are seeking a talented and committed Growth...,https://sg.linkedin.com/jobs/view/growth-data-...
2,Junior Data Scientist,,,
3,"Data Scientist, Trust & Safety",TikTok,,https://sg.linkedin.com/jobs/view/data-scienti...
4,Data Scientist (Analysis & Insights),,,
...,...,...,...,...
420,Product Development Software Engineer (C/C++),ST Engineering,&lt;p&gt;&lt;strong&gt;ST Engineering&lt;/stro...,https://sg.linkedin.com/jobs/view/product-deve...
421,Research Fellow/Engineer (Chemical Plume Tomog...,Singapore Institute of Technology,&lt;strong&gt;Key Responsibilities&lt;br&gt;&l...,https://sg.linkedin.com/jobs/view/research-fel...
422,Unreal Engine Developer,Bifrost AI,"At Bifrost, we are building generative engines...",https://sg.linkedin.com/jobs/view/unreal-engin...
423,Software Engineer - Low-code Platform - Singapore,ByteDance,,https://sg.linkedin.com/jobs/view/software-eng...


In [12]:
# Function to clean and extract preferred qualifications in point form
def clean_and_extract_preferred_qualifications(text):

    # Remove unwanted HTML tags
    cleaned_text = re.sub(r'&lt;/?(strong|ul|li|br)&gt;', '', text)

    # Remove unwanted HTML tags and entities
    #cleaned_text = re.sub(r'<[^>]*>', '', text)  # Remove HTML tags
    #cleaned_text = re.sub(r'&[A-Za-z]+;', ' ', cleaned_text)  # Remove HTML entities

    # Extract information after headers (e.g., "Requirements" or "Preferred Qualifications")
    headers = ["Requirements", "Preferred Qualifications", "Requirments", "What We Are Looking For", "Basic Requirements", "The Must-Haves"]

    info_dict = {}

    for header in headers:
        pattern = re.compile(f"{header}(.*?)(?={ '|'.join(headers) }|$)", re.DOTALL| re.IGNORECASE)
        match = pattern.search(cleaned_text)
        if match:
            info = match.group(1).strip()
            info_dict[header] = info
            return info

        else:
            return "N/A"

# Cleaning portion can be further improved*
# For this project, scrapping more JD to pick out non-N/A cases

In [13]:
# Apply the function to clean the "Preferred Qualifications" column
job_data['Preferred Qualifications Cleaned'] = job_data['Preferred Qualifications'].apply(clean_and_extract_preferred_qualifications)

# Convert the list of qualifications to a formatted string
job_data['Preferred Qualifications Cleaned'] = job_data['Preferred Qualifications Cleaned'].apply(lambda x: ''.join(x))

# Display the DataFrame
display(job_data)

job_data.to_csv("LinkedIn_JD_Scrapped_500rows.csv", index=False)

Unnamed: 0,Job Title,Company Name,Preferred Qualifications,Link to Job,Preferred Qualifications Cleaned
0,Scientist,Horizon Quantum Computing,We are seeking an early-career scientist to he...,https://sg.linkedin.com/jobs/view/scientist-at...,"A MSc degree in Computer Science, Physics, a r..."
1,Growth Data Scientist (Statistical modeling),Crypto.com,We are seeking a talented and committed Growth...,https://sg.linkedin.com/jobs/view/growth-data-...,", you can learn more from our talent acquisiti..."
2,Junior Data Scientist,,,,
3,"Data Scientist, Trust & Safety",TikTok,,https://sg.linkedin.com/jobs/view/data-scienti...,
4,Data Scientist (Analysis & Insights),,,,
...,...,...,...,...,...
420,Product Development Software Engineer (C/C++),ST Engineering,&lt;p&gt;&lt;strong&gt;ST Engineering&lt;/stro...,https://sg.linkedin.com/jobs/view/product-deve...,into effective and efficient solutions. Work a...
421,Research Fellow/Engineer (Chemical Plume Tomog...,Singapore Institute of Technology,&lt;strong&gt;Key Responsibilities&lt;br&gt;&l...,https://sg.linkedin.com/jobs/view/research-fel...,Have relevant competence in the areas of data ...
422,Unreal Engine Developer,Bifrost AI,"At Bifrost, we are building generative engines...",https://sg.linkedin.com/jobs/view/unreal-engin...,"Experience with Unreal Engine (Any version, bu..."
423,Software Engineer - Low-code Platform - Singapore,ByteDance,,https://sg.linkedin.com/jobs/view/software-eng...,


In [14]:
# Drop all rows with N/A 
# Take only Job Title, Company, Preferred Qualifications Cleaned, Link to Job 

# Drop rows with "N/A" in the "Preferred Qualifications Cleaned" column
# Select specific columns
job_data_dropNA = job_data[job_data['Preferred Qualifications Cleaned'] != "N/A"]
job_data_dropNA = job_data_dropNA.loc[:, ["Job Title", "Company Name", "Preferred Qualifications Cleaned", "Link to Job"]]

# Rename the "Preferred Qualifications Cleaned" column to "Preferred Qualifications"
job_data_dropNA = job_data_dropNA.rename(columns={"Preferred Qualifications Cleaned": "Preferred Qualifications"})

# Display the DataFrame
display(job_data_dropNA)

# Select specific columns

job_data_dropNA.to_csv("LinkedIn_JD_Scrapped_500rows_Cleaned.csv", index=False)

Unnamed: 0,Job Title,Company Name,Preferred Qualifications,Link to Job
0,Scientist,Horizon Quantum Computing,"A MSc degree in Computer Science, Physics, a r...",https://sg.linkedin.com/jobs/view/scientist-at...
1,Growth Data Scientist (Statistical modeling),Crypto.com,", you can learn more from our talent acquisiti...",https://sg.linkedin.com/jobs/view/growth-data-...
10,Senior /Machine Learning Engineer - DSC/EZ,ST Engineering,", design, develop, test and maintain Machine L...",https://sg.linkedin.com/jobs/view/senior-machi...
12,Data Scientist - Fraud Analytics,Robert Walters,", understand business processes and conduct da...",https://sg.linkedin.com/jobs/view/data-scienti...
13,Data Scientist [Search],Ahrefs,"together with Product Development, Design and ...",https://sg.linkedin.com/jobs/view/data-scienti...
...,...,...,...,...
416,Application Programmer - Murex,MIGSO-PCUBED,&lt;/u&gt;&lt;/p&gt;&lt;p&gt;At least a Bachel...,https://sg.linkedin.com/jobs/view/application-...
418,Software Engineer (Enterprise Applications),DSO National Laboratories,", and ensure the fulfilment of these",https://sg.linkedin.com/jobs/view/software-eng...
420,Product Development Software Engineer (C/C++),ST Engineering,into effective and efficient solutions. Work a...,https://sg.linkedin.com/jobs/view/product-deve...
421,Research Fellow/Engineer (Chemical Plume Tomog...,Singapore Institute of Technology,Have relevant competence in the areas of data ...,https://sg.linkedin.com/jobs/view/research-fel...
