In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from collections import defaultdict
import re
from datetime import datetime, timedelta
import os
import oracledb
from sqlalchemy import create_engine

# --- Search filters ---------------------------------------------------------
title = "data engineer"
location = "St Louis, Missouri, United States"
# location = "United States"
exp_level = "2"        # 2 == entry level
post_date = "r604800"  # r604800 == past week
job_count = 0          # value sent as the `start` query parameter

# Browser-like User-Agent for the requests that pass `headers`
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/107.0.0.0 Safari/537.36"
    )
}

# --- Search endpoint and a preview of the first-page URL ---------------------
base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
url = (
    f"{base_url}"
    f"?f_E={exp_level}"
    f"&f_TPR={post_date}"
    f'&keywords="{title}"'
    f"&location={location}"
    f"&start={job_count}"
)

print(url)

https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?f_E=2&f_TPR=r604800&keywords="data engineer"&location=St Louis, Missouri, United States&start=0


### FUNCTION DEFINITIONS

In [6]:
# Sends a request for one results page to LinkedIn to get a list of job listings
def get_jobs_page(job_count) -> list:
    """Fetch one page of search results and return its ``<li>`` elements.

    Args:
        job_count: Value sent as the ``start`` query parameter (result offset).

    Returns:
        Every ``<li>`` element found in the response HTML. An empty list is
        returned when the server answers with an error status, so callers
        that stop on an empty page also stop on a failed request.

    Raises:
        requests.RequestException: On connection problems or when the request
            exceeds the timeout.
    """
    url = f'{base_url}?f_E={exp_level}&f_TPR={post_date}&keywords="{title}"&location={location}&start={job_count}'
    print("job page URL: ", url)
    # Send the same browser-like headers as get_result_count, and never wait forever.
    res = requests.get(url, headers=headers, timeout=30)
    if not res.ok:
        # An error page is not a results page; do not try to parse listings out of it.
        print("job page request failed with status: ", res.status_code)
        return []
    soup = BeautifulSoup(res.text, "html.parser")
    return soup.find_all("li")

# Parses out the job id from one listing
def parse_job_id(job):
    """Return the job id stored in a listing's ``data-entity-urn`` attribute.

    The attribute is read from the listing's ``base-card`` div, split on ":",
    and the fourth field is returned.
    """
    card = job.find("div", {"class": "base-card"})
    urn = card.get("data-entity-urn")
    urn_fields = urn.split(":")
    return urn_fields[3]

# Get all job ids from one results page
def get_job_ids(job_count):
    """Collect the job ids of every listing on one results page.

    Args:
        job_count: Result offset handed to ``get_jobs_page``.

    Returns:
        A list of job id strings, or ``None`` when the page has no listings.
        The list can be empty if no listing on the page carried a parsable id.
    """
    all_job_titles = get_jobs_page(job_count)
    if not all_job_titles:  # if no jobs titles in the list
        print("No job titles found!")
        return None

    all_job_ids = []
    for listing in all_job_titles:
        try:
            all_job_ids.append(parse_job_id(listing))
        except (AttributeError, IndexError):
            # The <li> has no base-card div, no data-entity-urn attribute, or a
            # URN with fewer than four fields. Skip it instead of losing the page.
            print("Skipping a listing without a parsable job id")
    print("all job ids: ", all_job_ids)
    return all_job_ids
    



# Work in progress: this does not yet manage to extract the result count.
def get_result_count():
    """Debug helper that fetches the job search page and prints what comes back.

    Prints the request URL, the parsed HTML document, and the result of
    looking up the ``jobs-search-results-list__subtitle`` div. Returns None.
    """
    search_url = "https://www.linkedin.com/jobs/search" + (
        f'?f_E={exp_level}&f_TPR={post_date}&keywords="{title}"&location={location}'
    )
    print(search_url)

    response = requests.get(search_url, headers=headers)
    page = BeautifulSoup(response.text, "html.parser")
    print(page)

    print(page.find("div", {"class": "jobs-search-results-list__subtitle"}))




In [8]:
job_ids = []
job_count = 0
max_jobs_to_scrape = 25
# The total number of matches for the query is not known up front, so keep
# requesting pages (advancing the offset by 25 each time) until either a page
# yields no job ids or the offset reaches max_jobs_to_scrape.
while job_count < max_jobs_to_scrape:
    page_of_ids = get_job_ids(job_count)
    if not page_of_ids:
        break
    job_ids += page_of_ids
    job_count += 25

print(job_ids)

job page URL:  https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?f_E=2&f_TPR=r604800&keywords="data engineer"&location=St Louis, Missouri, United States&start=0
all job ids:  ['3708952912', '3714572531']
['3708952912', '3714572531']


In [9]:
# Show the current value of job_count (the offset used when requesting result pages)
print(job_count)


25


In [15]:
job_url = 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{}'
l_all_job_info = []


def _parse_posting_date(posted_text, now):
    """Turn relative text such as "3 days ago" into an absolute datetime.

    Args:
        posted_text: The stripped text of the posting's "posted time ago" element.
        now: Reference datetime the relative amount is subtracted from.

    Returns:
        ``now`` minus the stated number of minutes, hours, days or weeks.
        Returns "" when the text does not start with a number or uses a unit
        other than those four.
    """
    amount_match = re.match(r'\d{1,2}', posted_text)
    if amount_match is None:
        # No leading number to work with; keep the "unknown" placeholder.
        return ""
    amount = int(amount_match.group())
    for unit in ("minute", "hour", "day", "week"):
        if unit in posted_text:
            # timedelta's keyword arguments are the plural unit names.
            return now - timedelta(**{unit + "s": amount})
    return ""


# `job_id` rather than `id`, so the builtin id() is not shadowed.
for job_id in job_ids:
    d_job_info = {}
    d_job_info["job_id"] = job_id
    job_desc_url = job_url.format(job_id)
    print(job_desc_url)
    # Same browser-like headers as the search requests, with a bounded wait.
    res = requests.get(job_desc_url, headers=headers, timeout=30)
    # Fail with the HTTP status instead of an AttributeError from parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # Get the company name
    d_job_info["company"] = soup.find("div", {"class":"top-card-layout__card"}).find("a").find("img").get("alt")

    # Get the location
    d_job_info["location"] = soup.find("div", {"class":"topcard__flavor-row"}).find("span", {"class":"topcard__flavor--bullet"}).text.strip()

    # Get the job title
    d_job_info["job_title"] = soup.find("h2", {"class":"top-card-layout__title"}).text.strip()

    # Get the full job description
    d_job_info["job_description"] = soup.find("div", {"class":"show-more-less-html__markup"}).get_text(separator="\n")

    # Get years of experience: keep every description line that mentions a
    # 1-2 digit number followed later on the same line by "year"/"years"
    experience_lines = re.findall(r".*\D\d{1,2}\D.*years?", d_job_info["job_description"])
    d_job_info["experience"] = "\n".join(experience_lines)

    # Get Seniority level, Employment type, Job function, Industries
    job_criteria_list = soup.find("ul", {"class":"description__job-criteria-list"}).find_all("li")
    for criteria in job_criteria_list:
        criteria = criteria.text.split("\n") # convert lines to a list
        criteria = [i.strip() for i in criteria if i.strip()] # remove lines with only white space
        criteria[0] = criteria[0].replace(" ", "_")
        d_job_info.update({criteria[0]:criteria[1]})

    # Get job posting date
    posting_date = soup.find("span", {"class":"posted-time-ago__text"}).text.strip()
    d_job_info["posting_date"] = _parse_posting_date(posting_date, datetime.today())


    # !!! Get Other useful info with AI !!!


    # Get URL
    d_job_info["url"] = soup.find("a", {"class":"topcard__link"}).get("href")
    print(d_job_info["url"])

    # Append the job info (dict) to the list of job info
    l_all_job_info.append(d_job_info)

# Convert list of all job info dicts to a dataframe
df_all_job_info = pd.DataFrame(l_all_job_info)
df_all_job_info


https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/3708952912
https://www.linkedin.com/jobs/view/tax-digital-transformation-innovation-senior-data-engineer-at-bdo-usa-3708952912?trk=public_jobs_topcard-title
https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/3714572531
https://www.linkedin.com/jobs/view/data-scientist-with-security-clearance-at-clearancejobs-3714572531?trk=public_jobs_topcard-title


Unnamed: 0,job_id,company,location,job_title,job_description,experience,Seniority_level,Employment_type,Job_function,Industries,posting_date,url
0,3708952912,BDO USA,"St Louis, MO",Tax Digital Transformation & Innovation Senior...,\n\nJob Description\nJob Summary:\nThe Digital...,Bachelor's degree and six (6) or more years o...,Entry level,Full-time,Finance and Accounting/Auditing,Accounting,2023-09-09 01:21:20.675585,https://www.linkedin.com/jobs/view/tax-digital...
1,3714572531,ClearanceJobs,"St Louis, MO",Data Scientist with Security Clearance,\n Job Number: R0179261 Data Scientist\...,The Opportunity: Ever-expanding technology lik...,Entry level,Part-time,Engineering and Information Technology,Defense and Space Manufacturing,2023-09-07 01:21:21.359583,https://www.linkedin.com/jobs/view/data-scient...


In [8]:
# Oracle Connection string
cs = '''(description= (retry_count=20)(retry_delay=3)(address=(protocol=tcps)(port=1521)(host=adb.us-sanjose-1.oraclecloud.com))(connect_data=(service_name=ga3e236c6957ba6_oltpdb_high.adb.oraclecloud.com))(security=(ssl_server_dn_match=yes)))'''

# The context managers close the cursor and the connection even when the
# query raises, so a failed run does not leave a session open.
with oracledb.connect(
    user="appuser",
    password=os.environ['ORACLE_PASSWORD_APPUSER'],  # read from the environment, never hardcoded
    dsn=cs,
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM tbl_jobs")
        results = cursor.fetchall()

# The rows were fetched into memory above, so they can be printed after closing.
for row in results:
    print(row)

(9999, 'test company', 'pleasentville', 'job_title', 'job description', 'experience', 'entry-level', 'full time', 'job function junction', 'pimpin', datetime.datetime(2023, 9, 10, 0, 0), 'www.pimpinainteasy.com')


In [5]:
# Oracle connect descriptor used as the DSN
cs = '''(description= (retry_count=20)(retry_delay=3)(address=(protocol=tcps)(port=1521)(host=adb.us-sanjose-1.oraclecloud.com))(connect_data=(service_name=ga3e236c6957ba6_oltpdb_high.adb.oraclecloud.com))(security=(ssl_server_dn_match=yes)))'''

# Open a python-oracledb connection as "appuser"; the password comes from the environment
connection = oracledb.connect(user="appuser", password=os.environ['ORACLE_PASSWORD_APPUSER'], dsn=cs)

In [24]:
user="appuser"
password=os.environ['ORACLE_PASSWORD_APPUSER']
# Pass the credentials and the connect descriptor through connect_args rather
# than formatting them into the URL. A password containing "@", "/" or ":"
# would otherwise be misparsed as part of the URL.
engine = create_engine(
    "oracle+oracledb://@",
    connect_args={"user": user, "password": password, "dsn": cs},
)

# The connection is returned to the engine's pool when the block exits.
with engine.connect() as conn:
    df_jobs_from_db = pd.read_sql_query('SELECT * FROM tbl_jobs', conn)

df_jobs_from_db

Unnamed: 0,job_id,company,location,job_title,job_description,experience,seniority_level,employment_type,job_function,industries,posting_date,url
0,9999,test company,pleasentville,job_title,job description,experience,entry-level,full time,job function junction,pimpin,2023-09-10,www.pimpinainteasy.com
