# IMPORT & SET UP

In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
import urllib.parse
import os
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Do not truncate long cell values (e.g. URLs) when DataFrames are displayed
pd.set_option('display.max_colwidth', None)

In [4]:
# Project root: the current working directory of the kernel
PROJECT_DIR = os.getcwd()
# Name of the sub-directory (joined onto PROJECT_DIR) that holds the job-ID list files
ID_DIR_NAME = 'job_id_dir'
# Name of the sub-directory (joined onto PROJECT_DIR) that holds the job-info CSV files
INFO_DIR_NAME = 'job_info_dir'

### DRIVER FUNCTIONS

In [5]:
# Shared WebDriver handle used by every scraping function; None means no driver has been started
DRIVER = None

In [6]:
def start_driver():
    """Start a headless Chrome WebDriver and store it in the global ``DRIVER``.

    If a driver is already stored in ``DRIVER`` nothing is started and a
    notice is printed instead, so at most one browser is managed at a time.

    The chromedriver binary is looked up under
    ``<PROJECT_DIR>/Chromedriver/chromedriver_mac_arm64/chromedriver``.
    """
    global DRIVER
    if DRIVER is not None:
        print('A driver is already running. Please close current driver before starting another one.')
        return

    options = Options()
    # All switches use the double-dash form for consistency.
    for flag in ('--headless=new', '--no-sandbox', '--incognito', '--disable-dev-shm-usage'):
        options.add_argument(flag)

    # os.path.join inserts the separator that plain string concatenation omitted
    # (os.getcwd() does not end with a slash).
    chromedriver_path = os.path.join(
        PROJECT_DIR, 'Chromedriver', 'chromedriver_mac_arm64', 'chromedriver'
    )
    service = Service(chromedriver_path)
    DRIVER = webdriver.Chrome(service=service, options=options)

def close_driver():
    """Shut down the global WebDriver (if any) and reset ``DRIVER`` to None.

    ``quit()`` is used instead of ``close()``: ``close()`` only closes the
    current window, whereas ``quit()`` ends the whole session and the
    chromedriver process, so no orphaned browser is left behind.
    """
    global DRIVER
    if DRIVER is None:
        return
    try:
        DRIVER.quit()
    finally:
        # Always drop the handle so start_driver() can be called again,
        # even when quitting the old session raised.
        DRIVER = None

In [7]:
# Make sure no driver from an earlier run is still held before starting a new one
close_driver()

In [8]:
# Launch the headless Chrome session used by all scraping cells below
start_driver()

### URL & LOGIN SETUPS

In [10]:
from getpass import getpass

from linkedin_scraper import actions

# Credentials must never be stored in the notebook. They are read from the
# LINKEDIN_EMAIL / LINKEDIN_PASSWORD environment variables, with an interactive
# prompt as fallback. Any password that was previously committed in this cell
# should be considered leaked and rotated.
EMAIL = os.environ.get('LINKEDIN_EMAIL') or input('LinkedIn email: ')
PASSWORD = os.environ.get('LINKEDIN_PASSWORD') or getpass('LinkedIn password: ')
actions.login(DRIVER, EMAIL, PASSWORD)

BASE_URL = "https://www.linkedin.com/jobs/search/"
SEARCH_TERM = "Data Scientist"
LOCATION_TERM = "United States"

# quote_via=quote keeps spaces encoded as %20 (urlencode's default would use '+'),
# so the resulting URL is the same as the hand-built one.
SEARCH_URL = BASE_URL + "?" + urllib.parse.urlencode(
    {"keywords": SEARCH_TERM, "location": LOCATION_TERM, "refresh": "true"},
    quote_via=urllib.parse.quote,
)
SEARCH_URL

'https://www.linkedin.com/jobs/search/?keywords=Data%20Scientist&location=United%20States&refresh=true'

# FUNCTIONS

## Function to scrape all job IDs on the search page(s)

In [8]:
# Function to scrape all job IDs listed on the search result page(s)
def get_all_job_ids_from_page(url, num_page=-1):
    """Collect the job IDs listed on the result pages of a LinkedIn job search.

    Uses the global ``DRIVER`` to load the pages and the globals ``SEARCH_TERM``
    and ``LOCATION_TERM`` for the progress message.

    Args:
        url: Search URL without a ``start`` offset; ``&start=<offset>`` is
            appended for each page.
        num_page: Maximum number of result pages to scrape. Zero or a negative
            value means "all pages reported by the pagination bar".

    Returns:
        tuple: ``(job_id_list, job_id_scrape_time)`` where ``job_id_list`` is a
        list of unique job IDs (first-seen order) and ``job_id_scrape_time`` is
        the ``datetime`` at which scraping started.
    """
    jobs_per_page = 25  # the `start` offset advances by 25 per page (p=1 -> 0, p=2 -> 25, ...)

    job_id_scrape_time = datetime.now()
    print(f'DATE & TIME: {job_id_scrape_time.strftime("%Y/%m/%d %H:%M:%S")}')
    job_id_list = []
    DRIVER.get(url)
    time.sleep(5)  # fixed wait for the page to render

    # Count total number of pages in the search result (text of the last pagination item)
    try:
        pagination_items = (
            DRIVER.find_element(By.CLASS_NAME, 'jobs-search-results-list__pagination')
            .find_element(By.TAG_NAME, 'ul')
            .find_elements(By.TAG_NAME, 'li')
        )
        max_page = int(pagination_items[-1].text)
    except (NoSuchElementException, IndexError, ValueError):
        # NOTE(review): assumes a missing/unreadable pagination bar means the
        # results fit on a single page - confirm against the live site.
        max_page = 1

    num_page = min(max_page, num_page) if num_page > 0 else max_page
    print(f'Scraping {num_page} page(s) out of {max_page} total pages for {SEARCH_TERM} jobs search in {LOCATION_TERM}...')
    total_err_count = 0

    # Loop through num_page pages to scrape all jobs listed on each page
    for p in range(1, num_page + 1):
        # Build each page URL from the untouched base URL; re-assigning `url`
        # here would pile up one extra `&start=` parameter per iteration.
        page_url = f'{url}&start={(p - 1) * jobs_per_page}'
        DRIVER.get(page_url)
        time.sleep(3)

        # Find all list items that contain the job information
        products_all = (
            DRIVER.find_element(By.CLASS_NAME, "jobs-search-results-list")
            .find_element(By.CLASS_NAME, "scaffold-layout__list-container")
            .find_elements(By.CLASS_NAME, "scaffold-layout__list-item")
        )
        print(f'Found {len(products_all)} job IDs on page {p}.', end=' ')

        err_count = 0
        for product in products_all:
            try:
                job_id = product.get_attribute("data-occludable-job-id")
            except NoSuchElementException:
                job_id = None
            # get_attribute returns None when the attribute is absent, so a
            # missing ID has to be detected by value, not by exception.
            if job_id:
                job_id_list.append(job_id)
            else:
                err_count += 1

        print(f'Could not scrape {err_count} jobs.')
        total_err_count += err_count

    # Remove dupes while keeping first-seen order
    job_id_list = list(dict.fromkeys(job_id_list))
    print(f'---> Found total {len(job_id_list)} unique jobs. Could not scrape total of {total_err_count} jobs.')

    return job_id_list, job_id_scrape_time


time: 1.21 ms (started: 2023-03-31 20:15:35 -05:00)


## Function to scrape information on the job page

In [9]:
def _scrape_value(locators, attribute="innerHTML"):
    """Follow a chain of (by, value) locators from the page root and return the stripped attribute, or None."""
    element = DRIVER
    try:
        for by, value in locators:
            element = element.find_element(by, value)
    except NoSuchElementException:
        return None
    raw = element.get_attribute(attribute)
    # get_attribute returns None when the attribute is absent
    return raw.strip() if raw is not None else None


def _insight_text(insight):
    """Return the visible text of the first <span> inside a job-insight element, or None if there is none."""
    try:
        text = insight.find_element(By.TAG_NAME, 'span').text
    except NoSuchElementException:
        return None
    # Remove all HTML comments
    return re.sub(r'<!--(?=.*?-->).*?-->', '', text, flags=re.DOTALL).strip()


def get_single_job_info(job_id):
    """Scrape the top-card information of one LinkedIn job posting.

    Loads ``https://www.linkedin.com/jobs/view/<job_id>`` in the global
    ``DRIVER`` and reads each field independently; a field whose element is
    not found on the page is left as ``None``.

    Args:
        job_id: LinkedIn job ID (int or str); interpolated into the job URL.

    Returns:
        dict: keys 'Job ID', 'Job URL', 'Name', 'Company', 'Company Logo URL',
        'Location', 'Workplace Type', 'Time Posted', 'Applicants Count',
        'Job Overview', 'Company Overview' and 'HR URL'.
    """
    top_card = "jobs-unified-top-card__"
    primary_group = top_card + "subtitle-primary-grouping"
    secondary_group = top_card + "subtitle-secondary-grouping"

    job_url = f"https://www.linkedin.com/jobs/view/{job_id}"
    DRIVER.get(job_url)
    time.sleep(3)  # fixed wait for the page to render

    info = {
        'Job ID': job_id,
        'Job URL': job_url,
        'Name': _scrape_value([(By.CLASS_NAME, top_card + "job-title")]),
        'Company': _scrape_value([(By.CLASS_NAME, top_card + "company-name"), (By.TAG_NAME, 'a')]),
        'Company Logo URL': _scrape_value([(By.CLASS_NAME, 'p5'), (By.TAG_NAME, 'img')], attribute='src'),
        'Location': _scrape_value([(By.CLASS_NAME, primary_group), (By.CLASS_NAME, top_card + "bullet")]),
        'Workplace Type': _scrape_value([(By.CLASS_NAME, top_card + "workplace-type")]),
        'Time Posted': _scrape_value([(By.CLASS_NAME, top_card + "posted-date")]),
        'Applicants Count': None,
        'Job Overview': None,
        'Company Overview': None,
        'HR URL': None
    }

    # Applicants Count: prefer the dedicated element, fall back to the bullet
    # element of the same group when it is missing.
    applicants = _scrape_value([(By.CLASS_NAME, secondary_group), (By.CLASS_NAME, top_card + "applicant-count")])
    if applicants is None:
        applicants = _scrape_value([(By.CLASS_NAME, secondary_group), (By.CLASS_NAME, top_card + "bullet")])
    info['Applicants Count'] = applicants

    # Job & Company Insight: the first two insight elements, in page order.
    # zip() tolerates fewer than two elements, which previously raised an
    # uncaught ValueError on unpacking and made the caller drop the whole job.
    insights = DRIVER.find_elements(By.CLASS_NAME, top_card + "job-insight")[:2]
    for key, insight in zip(('Job Overview', 'Company Overview'), insights):
        info[key] = _insight_text(insight)

    # HR URL
    info['HR URL'] = _scrape_value(
        [(By.CSS_SELECTOR, "div[class*='hirer-card__hirer-information'] a")],
        attribute='href',
    )

    return info


#### Test job_id
- 3437480788
- 3525179725
- 3540119207
- 3515314825

In [11]:
# Smoke test: scrape one known job posting
test_info = get_single_job_info(3525179725) 

{'Job ID': 3525179725,
 'Job URL': 'https://www.linkedin.com/jobs/view/3525179725',
 'Name': 'AI Research Engineer- Remote',
 'Company': 'Neo Cybernetica',
 'Company Logo URL': 'https://media.licdn.com/dms/image/C560BAQFdt_fuOIJ5TQ/company-logo_100_100/0/1644369586489?e=1688601600&v=beta&t=QyxXmGwyZyxqfDP7SiKcGuUJGrnmgnsnpdG5FbV-ESw',
 'Location': 'Bedford, NH',
 'Workplace Type': 'Remote',
 'Time Posted': '2 weeks ago',
 'Applicants Count': '77 applicants',
 'Job Overview': '11-50 employees · Software Development',
 'Company Overview': 'See how you compare to 77 applicants. Try Premium for free',
 'HR URL': None}

In [13]:
# Smoke test on a second known job posting; the last expression displays the scraped dict
test_info = get_single_job_info(3515314825)
test_info

{'Job ID': 3515314825,
 'Job URL': 'https://www.linkedin.com/jobs/view/3515314825',
 'Name': 'Project Manager - Continuous Improvement',
 'Company': 'MassMEP',
 'Company Logo URL': 'https://media.licdn.com/dms/image/C560BAQHYyK7bdYEOQQ/company-logo_100_100/0/1655235578188?e=1688601600&v=beta&t=Ia7tWFxQvS3WsPm_Cg-huUsFlKepgL4xu2fXonNnjRk',
 'Location': 'Auburn, MA',
 'Workplace Type': 'Hybrid',
 'Time Posted': '3 weeks ago',
 'Applicants Count': '164 applicants',
 'Job Overview': '$97,000/yr · Full-time',
 'Company Overview': '11-50 employees',
 'HR URL': 'https://www.linkedin.com/in/dawn-sedlier-shrm-scp-66741320'}

# START SCRAPING

Get all job ids from the job search page into a list

In [13]:
# Page limit forwarded to get_all_job_ids_from_page; any value <= 0 scrapes every result page
num_page = 0 # to scrape all pages: set to 0 or -1
job_id_list, job_id_scrape_time = get_all_job_ids_from_page(SEARCH_URL, num_page=num_page)
# job_id_list

DATE & TIME: 2023/03/29 13:10:29
Scraping 40 page(s) out of 40 total pages for Data Scientist jobs search in United States...
Found 25 job IDs on page 1. Could not scrape 0 jobs.
Found 25 job IDs on page 2. Could not scrape 0 jobs.
Found 25 job IDs on page 3. Could not scrape 0 jobs.
Found 25 job IDs on page 4. Could not scrape 0 jobs.
Found 25 job IDs on page 5. Could not scrape 0 jobs.
Found 25 job IDs on page 6. Could not scrape 0 jobs.
Found 25 job IDs on page 7. Could not scrape 0 jobs.
Found 25 job IDs on page 8. Could not scrape 0 jobs.
Found 25 job IDs on page 9. Could not scrape 0 jobs.
Found 25 job IDs on page 10. Could not scrape 0 jobs.
Found 25 job IDs on page 11. Could not scrape 0 jobs.
Found 25 job IDs on page 12. Could not scrape 0 jobs.
Found 25 job IDs on page 13. Could not scrape 0 jobs.
Found 25 job IDs on page 14. Could not scrape 0 jobs.
Found 25 job IDs on page 15. Could not scrape 0 jobs.
Found 25 job IDs on page 16. Could not scrape 0 jobs.
Found 25 job IDs on

#### Write list of job ids to file

In [436]:
# File name carries the scrape timestamp (yymmdd_HHMMSS), so each scrape maps to its own file
file_name = f'jobs_{job_id_scrape_time.strftime("%y%m%d_%H%M%S")}.txt'
JOB_FILE_PATH = os.path.join(PROJECT_DIR, ID_DIR_NAME, file_name)
# NOTE(review): this message is printed although the write below is commented out,
# so as written this cell creates no file - confirm whether the write should be re-enabled.
print(f'[{job_id_scrape_time.strftime(r"%Y/%m/%d %H:%M:%S")}] Updated job list at {JOB_FILE_PATH}')

# Write file to folder
# with open(JOB_FILE_PATH, 'w+') as f:
#     for job_id in job_id_list:
#         f.write(f'{job_id}\n')

[2023/03/29 13:10:29] Updated job list at /Users/thule/Desktop/DSProjects/Linkedin_Analysis/job_id_list_dir/jobs_230329_131029.txt


#### Get the list of job IDs from the most recent file

In [14]:
# Get most recent id file
JOB_FILE_PATH = os.path.join(PROJECT_DIR, ID_DIR_NAME, (os.listdir(os.path.join(PROJECT_DIR, ID_DIR_NAME)))[-1])

with open(JOB_FILE_PATH, 'r') as f:
    job_id_list = [l.strip() for l in f.readlines()]

job_id_list

['3520044004',
 '3523743282',
 '3520459813',
 '3525723554',
 '3527821880',
 '3545964641',
 '3520729278',
 '3516637194',
 '3527065467',
 '3522212905',
 '3522294979',
 '3529041711',
 '3532901532',
 '3529458696',
 '3516672427',
 '3529872684',
 '3529949220',
 '3525654696',
 '3525923153',
 '3520439276',
 '3520458644',
 '3524955715',
 '3530936787',
 '3521523630',
 '3522086390',
 '3521754879',
 '3517168914',
 '3542549318',
 '3522010736',
 '3545964865',
 '3533040991',
 '3526783948',
 '3525790897',
 '3523779032',
 '3522879882',
 '3525207467',
 '3520737436',
 '3527860425',
 '3514747876',
 '3531333875',
 '3532789301',
 '3529457989',
 '3516670537',
 '3520378880',
 '3545989312',
 '3520382049',
 '3520155411',
 '3528643036',
 '3516616194',
 '3526077070',
 '3520820840',
 '3528020320',
 '3531482679',
 '3524248732',
 '3532595880',
 '3540122981',
 '3522671237',
 '3530768301',
 '3531339677',
 '3521702352',
 '3532599805',
 '3527675015',
 '3525729542',
 '3528295490',
 '3542540251',
 '3525564609',
 '35280216

Get information from every job page

In [None]:
from tqdm.notebook import tqdm

job_info_list = []
failed_job_ids = []  # IDs whose page raised while scraping, kept so they can be retried

for job_id in tqdm(job_id_list):
    try:
        job_info_list.append(get_single_job_info(job_id))
    except Exception as e:
        # Best effort: one bad page must not abort the whole run, but record
        # which job failed and why instead of printing an anonymous message.
        failed_job_ids.append(job_id)
        print(f'[job {job_id}] {type(e).__name__}: {e}')

job_info_list[:3]

In [34]:
# One row per scraped job; columns are the keys of the dicts returned by get_single_job_info
job_df = pd.DataFrame(job_info_list)
job_df.head(10)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview
0,3520044004,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,https://media.licdn.com/dms/image/C510BAQGCdThXIss7UQ/company-logo_100_100/0/1539940587971?e=1687996800&v=beta&t=1AyKLKEh0SiRwHFVslSVLJ-5LaOWsCzFqZtQ9ZvHi1o,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers"
1,3523743282,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,https://media.licdn.com/dms/image/C560BAQFahtjOdf_ETQ/company-logo_100_100/0/1542208571146?e=1687996800&v=beta&t=y-aRHU6gnrNyr6nMPswJJHlWEoEOjZyHL-a1Qs5MPFY,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing"
2,3520459813,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,https://media.licdn.com/dms/image/C4E0BAQEAkpLAgFUtpA/company-logo_100_100/0/1520994058492?e=1687996800&v=beta&t=wkHpT8VLjHRZryS_VDdpboOGmMkNRTNhMqVoDrCC2_M,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media
3,3525723554,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,https://media.licdn.com/dms/image/C560BAQHFDhBFVWfhzg/company-logo_100_100/0/1656651227123?e=1687996800&v=beta&t=YmLFUIyNIljy5hv0bST7P3wzrzdVBRln9qUpFAufG0c,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research"
4,3527821880,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,https://media.licdn.com/dms/image/C4D0BAQFPJJtAqKZSKA/company-logo_100_100/0/1566565993951?e=1687996800&v=beta&t=w4VlO0akxyqbbvH7Io6cMb3i8qShWG84zirwSGF-rDM,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services"
5,3545964641,https://www.linkedin.com/jobs/view/3545964641,"Programmer, Data Analysis, Epidemiology Analytics","Daiichi Sankyo, Inc.",https://media.licdn.com/dms/image/C4E0BAQG95drOHkKpng/company-logo_100_100/0/1598625647052?e=1687996800&v=beta&t=5Udyx0TVHjP69EsvMylTweGLMgYVUfzVxTHcxCxserQ,"Basking Ridge, NJ",,2 days ago,25 applicants,Full-time,"1,001-5,000 employees · Pharmaceutical Manufacturing"
6,3520729278,https://www.linkedin.com/jobs/view/3520729278,Investment Data Analyst,Westbourne Partners,https://media.licdn.com/dms/image/C560BAQHzeeFsphik2w/company-logo_100_100/0/1644838979087?e=1687996800&v=beta&t=0dJ4YM0X9zk6FQtVPIEz-wM2esA3X0byK0L4Cs-T5Cw,"Chicago, IL",Hybrid,2 weeks ago,Over 200 applicants,"$85,000/yr - $130,000/yr · Full-time · Mid-Senior level",11-50 employees · Staffing and Recruiting
7,3516637194,https://www.linkedin.com/jobs/view/3516637194,"Data Scientist, DC",Rhombus Power Inc.,https://media.licdn.com/dms/image/C560BAQFZ-zC4kM27aQ/company-logo_100_100/0/1626387046517?e=1687996800&v=beta&t=W-n54pZf2YM8Mi-HS2Nq59OUDSpOAebxy-YIJ4I5_H0,"Washington, DC",Hybrid,2 weeks ago,Over 200 applicants,Full-time · Associate,51-200 employees · Defense and Space Manufacturing
8,3527065467,https://www.linkedin.com/jobs/view/3527065467,Data Operations Engineer / Business Operations Analyst,US Tech Solutions,https://media.licdn.com/dms/image/C4D0BAQHPF54GNm3f4w/company-logo_100_100/0/1519856100208?e=1687996800&v=beta&t=RzUD8JFKRffFan_6wyfX_GQZSUbkmy9XbHFIP7PpXoY,"McLean, VA",Hybrid,1 week ago,67 applicants,Full-time · Mid-Senior level,"1,001-5,000 employees · Staffing and Recruiting"
9,3522212905,https://www.linkedin.com/jobs/view/3522212905,Medical Software Quality Engineer - $120k/yr,Bayer,https://media.licdn.com/dms/image/C4E0BAQHKicWqgn2J6g/company-logo_100_100/0/1657870661280?e=1687996800&v=beta&t=WAQmWcZB3S5oRvp4TBIongn5nyCfElrXoGaD2qqUD4M,"Indianola, PA",Hybrid,2 weeks ago,43 applicants,"$120,000/yr · Contract · Mid-Senior level","10,001+ employees · Chemical Manufacturing"


time: 8.52 ms (started: 2023-03-30 10:07:05 -05:00)


#### Save DataFrame to csv file

In [43]:
# Save next to the project under INFO_DIR_NAME, reusing the ID file's base name
# (jobs_<timestamp>.txt -> jobs_<timestamp>.csv). splitext replaces the old
# "[:-3] + 'csv'" slice, which silently assumed a three-character extension.
info_dir = os.path.join(PROJECT_DIR, INFO_DIR_NAME)
os.makedirs(info_dir, exist_ok=True)  # to_csv does not create missing directories
job_file_stem = os.path.splitext(os.path.basename(JOB_FILE_PATH))[0]
JOB_INFO_PATH = os.path.join(info_dir, job_file_stem + '.csv')

# '@' is used as the field separator; the DataFrame index is not written
job_df.to_csv(JOB_INFO_PATH, index=False, sep='@')

time: 14.4 ms (started: 2023-03-30 23:24:25 -05:00)


#### Load most recent DataFrame from csv file

In [16]:
# Pick the most recent CSV. os.listdir order is arbitrary and may contain hidden
# files, so filter on the extension and sort (file names embed the timestamp).
info_dir = os.path.join(PROJECT_DIR, INFO_DIR_NAME)
info_files = sorted(name for name in os.listdir(info_dir) if name.endswith('.csv'))
if not info_files:
    raise FileNotFoundError(f'No job info file (*.csv) found in {info_dir}')
JOB_INFO_PATH = os.path.join(info_dir, info_files[-1])

# The save cell writes the file without an index column, so index_col=0 would
# turn 'Job ID' into the index. Read every column as data instead, and drop a
# leading unnamed column in case the file was written with an index.
job_df = pd.read_csv(JOB_INFO_PATH, sep='@')
if len(job_df.columns) and str(job_df.columns[0]).startswith('Unnamed'):
    job_df = job_df.drop(columns=job_df.columns[0])
job_df.head(10)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview
0,3520044004,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,https://media.licdn.com/dms/image/C510BAQGCdThXIss7UQ/company-logo_100_100/0/1539940587971?e=1687996800&v=beta&t=1AyKLKEh0SiRwHFVslSVLJ-5LaOWsCzFqZtQ9ZvHi1o,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers"
1,3523743282,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,https://media.licdn.com/dms/image/C560BAQFahtjOdf_ETQ/company-logo_100_100/0/1542208571146?e=1687996800&v=beta&t=y-aRHU6gnrNyr6nMPswJJHlWEoEOjZyHL-a1Qs5MPFY,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing"
2,3520459813,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,https://media.licdn.com/dms/image/C4E0BAQEAkpLAgFUtpA/company-logo_100_100/0/1520994058492?e=1687996800&v=beta&t=wkHpT8VLjHRZryS_VDdpboOGmMkNRTNhMqVoDrCC2_M,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media
3,3525723554,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,https://media.licdn.com/dms/image/C560BAQHFDhBFVWfhzg/company-logo_100_100/0/1656651227123?e=1687996800&v=beta&t=YmLFUIyNIljy5hv0bST7P3wzrzdVBRln9qUpFAufG0c,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research"
4,3527821880,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,https://media.licdn.com/dms/image/C4D0BAQFPJJtAqKZSKA/company-logo_100_100/0/1566565993951?e=1687996800&v=beta&t=w4VlO0akxyqbbvH7Io6cMb3i8qShWG84zirwSGF-rDM,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services"
5,3545964641,https://www.linkedin.com/jobs/view/3545964641,"Programmer, Data Analysis, Epidemiology Analytics","Daiichi Sankyo, Inc.",https://media.licdn.com/dms/image/C4E0BAQG95drOHkKpng/company-logo_100_100/0/1598625647052?e=1687996800&v=beta&t=5Udyx0TVHjP69EsvMylTweGLMgYVUfzVxTHcxCxserQ,"Basking Ridge, NJ",,2 days ago,25 applicants,Full-time,"1,001-5,000 employees · Pharmaceutical Manufacturing"
6,3520729278,https://www.linkedin.com/jobs/view/3520729278,Investment Data Analyst,Westbourne Partners,https://media.licdn.com/dms/image/C560BAQHzeeFsphik2w/company-logo_100_100/0/1644838979087?e=1687996800&v=beta&t=0dJ4YM0X9zk6FQtVPIEz-wM2esA3X0byK0L4Cs-T5Cw,"Chicago, IL",Hybrid,2 weeks ago,Over 200 applicants,"$85,000/yr - $130,000/yr · Full-time · Mid-Senior level",11-50 employees · Staffing and Recruiting
7,3516637194,https://www.linkedin.com/jobs/view/3516637194,"Data Scientist, DC",Rhombus Power Inc.,https://media.licdn.com/dms/image/C560BAQFZ-zC4kM27aQ/company-logo_100_100/0/1626387046517?e=1687996800&v=beta&t=W-n54pZf2YM8Mi-HS2Nq59OUDSpOAebxy-YIJ4I5_H0,"Washington, DC",Hybrid,2 weeks ago,Over 200 applicants,Full-time · Associate,51-200 employees · Defense and Space Manufacturing
8,3527065467,https://www.linkedin.com/jobs/view/3527065467,Data Operations Engineer / Business Operations Analyst,US Tech Solutions,https://media.licdn.com/dms/image/C4D0BAQHPF54GNm3f4w/company-logo_100_100/0/1519856100208?e=1687996800&v=beta&t=RzUD8JFKRffFan_6wyfX_GQZSUbkmy9XbHFIP7PpXoY,"McLean, VA",Hybrid,1 week ago,67 applicants,Full-time · Mid-Senior level,"1,001-5,000 employees · Staffing and Recruiting"
9,3522212905,https://www.linkedin.com/jobs/view/3522212905,Medical Software Quality Engineer - $120k/yr,Bayer,https://media.licdn.com/dms/image/C4E0BAQHKicWqgn2J6g/company-logo_100_100/0/1657870661280?e=1687996800&v=beta&t=WAQmWcZB3S5oRvp4TBIongn5nyCfElrXoGaD2qqUD4M,"Indianola, PA",Hybrid,2 weeks ago,43 applicants,"$120,000/yr · Contract · Mid-Senior level","10,001+ employees · Chemical Manufacturing"


time: 12.9 ms (started: 2023-03-31 20:25:26 -05:00)


# CLEANING

In [16]:
# Column dtypes and non-null counts, to see which fields have missing values
job_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Job ID            656 non-null    object
 1   Job URL           656 non-null    object
 2   Name              656 non-null    object
 3   Company           655 non-null    object
 4   Company Logo URL  656 non-null    object
 5   Location          656 non-null    object
 6   Workplace Type    561 non-null    object
 7   Time Posted       655 non-null    object
 8   Applicants Count  449 non-null    object
 9   Job Overview      656 non-null    object
 10  Company Overview  656 non-null    object
dtypes: object(11)
memory usage: 56.5+ KB


In [393]:
# Work on a copy so the cleaning cells below never modify the loaded job_df
job_df_2 = job_df.copy() # Make a copy
job_df_2.head()

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview
0,3530951644,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",https://media.licdn.com/dms/image/C4D0BAQHw44zVuWeBMQ/company-logo_100_100/0/1519951880616?e=1687996800&v=beta&t=9is_bVp6AbucbbnSbDUPHSA6kqgLhpUF7TYuXDA1TsA,"Sioux City, IA",On-site,1 week ago,40 applicants,Contract · Entry level,51-200 employees · Defense and Space Manufacturing
1,3544009597,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,https://media.licdn.com/dms/image/C560BAQEZkh7wG0X0ew/company-logo_100_100/0/1618287716162?e=1687996800&v=beta&t=nLjiBBkms1IPvKEQf2SkcAKcopZV_ISF51Esz0TjGbY,"Charlotte, NC",On-site,3 weeks ago,,Full-time · Entry level,51-200 employees · IT Services and IT Consulting
2,3541503028,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,"Atlantic City, NJ",Hybrid,3 weeks ago,,Contract · Entry level,"1,001-5,000 employees · Software Development"
3,3547594589,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,United States,Remote,27 minutes ago,,Contract · Entry level,"1,001-5,000 employees · Software Development"
4,3547517166,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,https://media.licdn.com/dms/image/C4E0BAQFe-tL0qK8kxw/company-logo_100_100/0/1632140982446?e=1687996800&v=beta&t=2gBvJLsqrfKdM1f8_cG0qi5shKIGVW5As4rdpceMmpo,"Athens, GA",On-site,1 day ago,,Full-time · Mid-Senior level,"1,001-5,000 employees · IT Services and IT Consulting"


### Split Location
New columns:
- Location_City
- Location_State

In [394]:
# How many ', '-separated parts each Location has, and how often each count occurs
job_df_2['Location'].str.split(', ').map(len).value_counts()

2    614
1     34
3      8
Name: Location, dtype: int64

In [395]:
# Remove United States from Location col
job_df_2['Location'] = job_df_2['Location'].str.replace('United States', '', regex=False).str.strip(', ')
# Split city and state from location col
job_df_2[['Location_City', 'Location_State']] = job_df_2['Location'].str.split(', ', expand=True)
job_df_2.head()

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State
0,3530951644,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",https://media.licdn.com/dms/image/C4D0BAQHw44zVuWeBMQ/company-logo_100_100/0/1519951880616?e=1687996800&v=beta&t=9is_bVp6AbucbbnSbDUPHSA6kqgLhpUF7TYuXDA1TsA,"Sioux City, IA",On-site,1 week ago,40 applicants,Contract · Entry level,51-200 employees · Defense and Space Manufacturing,Sioux City,IA
1,3544009597,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,https://media.licdn.com/dms/image/C560BAQEZkh7wG0X0ew/company-logo_100_100/0/1618287716162?e=1687996800&v=beta&t=nLjiBBkms1IPvKEQf2SkcAKcopZV_ISF51Esz0TjGbY,"Charlotte, NC",On-site,3 weeks ago,,Full-time · Entry level,51-200 employees · IT Services and IT Consulting,Charlotte,NC
2,3541503028,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,"Atlantic City, NJ",Hybrid,3 weeks ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",Atlantic City,NJ
3,3547594589,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,,Remote,27 minutes ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",,
4,3547517166,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,https://media.licdn.com/dms/image/C4E0BAQFe-tL0qK8kxw/company-logo_100_100/0/1632140982446?e=1687996800&v=beta&t=2gBvJLsqrfKdM1f8_cG0qi5shKIGVW5As4rdpceMmpo,"Athens, GA",On-site,1 day ago,,Full-time · Mid-Senior level,"1,001-5,000 employees · IT Services and IT Consulting",Athens,GA


### Match State Names with Abbreviation

In [396]:
# Lookup table: two-letter abbreviation -> full state / territory name.
# The (name, abbreviation) pairs are sorted by name before being inverted, so
# the resulting dict is ordered alphabetically by full name.
states_dict = {
    abbreviation: full_name
    for full_name, abbreviation in sorted([
        ('Alabama', 'AL'),
        ('Alaska', 'AK'),
        ('American Samoa', 'AS'),
        ('Arizona', 'AZ'),
        ('Arkansas', 'AR'),
        ('California', 'CA'),
        ('Colorado', 'CO'),
        ('Connecticut', 'CT'),
        ('Delaware', 'DE'),
        ('District of Columbia', 'DC'),
        ('Florida', 'FL'),
        ('Georgia', 'GA'),
        ('Guam', 'GU'),
        ('Hawaii', 'HI'),
        ('Idaho', 'ID'),
        ('Illinois', 'IL'),
        ('Indiana', 'IN'),
        ('Iowa', 'IA'),
        ('Kansas', 'KS'),
        ('Kentucky', 'KY'),
        ('Louisiana', 'LA'),
        ('Maine', 'ME'),
        ('Maryland', 'MD'),
        ('Massachusetts', 'MA'),
        ('Michigan', 'MI'),
        ('Minnesota', 'MN'),
        ('Mississippi', 'MS'),
        ('Missouri', 'MO'),
        ('Montana', 'MT'),
        ('Nebraska', 'NE'),
        ('Nevada', 'NV'),
        ('New Hampshire', 'NH'),
        ('New Jersey', 'NJ'),
        ('New Mexico', 'NM'),
        ('New York', 'NY'),
        ('North Carolina', 'NC'),
        ('North Dakota', 'ND'),
        ('Northern Mariana Islands', 'MP'),
        ('Ohio', 'OH'),
        ('Oklahoma', 'OK'),
        ('Oregon', 'OR'),
        ('Pennsylvania', 'PA'),
        ('Puerto Rico', 'PR'),
        ('Rhode Island', 'RI'),
        ('South Carolina', 'SC'),
        ('South Dakota', 'SD'),
        ('Tennessee', 'TN'),
        ('Texas', 'TX'),
        ('Trust Territories', 'TT'),
        ('Utah', 'UT'),
        ('Vermont', 'VT'),
        ('Virgin Islands', 'VI'),
        ('Virginia', 'VA'),
        ('Washington', 'WA'),
        ('West Virginia', 'WV'),
        ('Wisconsin', 'WI'),
        ('Wyoming', 'WY'),
    ])
}
states_dict

{'AL': 'Alabama',
 'AK': 'Alaska',
 'AS': 'American Samoa',
 'AZ': 'Arizona',
 'AR': 'Arkansas',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'DC': 'District of Columbia',
 'FL': 'Florida',
 'GA': 'Georgia',
 'GU': 'Guam',
 'HI': 'Hawaii',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'IA': 'Iowa',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'ME': 'Maine',
 'MD': 'Maryland',
 'MA': 'Massachusetts',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MS': 'Mississippi',
 'MO': 'Missouri',
 'MT': 'Montana',
 'NE': 'Nebraska',
 'NV': 'Nevada',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NY': 'New York',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'MP': 'Northern Mariana Islands',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'PR': 'Puerto Rico',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'TT': 'Trust Territories',
 'UT': 

In [397]:
# Translate each abbreviation to its full name through states_dict;
# any value that is not a key of the table (including missing values) becomes None.
job_df_2['Location_State'] = job_df_2['Location_State'].map(
    lambda abbreviation: states_dict.get(abbreviation)
)
job_df_2['Location_State'].unique()

array(['Iowa', 'North Carolina', 'New Jersey', None, 'Georgia',
       'California', 'Indiana', 'Florida', 'Missouri',
       'District of Columbia', 'Maryland', 'Minnesota', 'New York',
       'Michigan', 'Ohio', 'Illinois', 'Alabama', 'Massachusetts',
       'Texas', 'Colorado', 'New Hampshire', 'Pennsylvania', 'Virginia',
       'Kansas', 'Washington', 'Nebraska', 'Connecticut', 'Wyoming',
       'Arizona', 'South Carolina', 'Louisiana', 'Oklahoma', 'Kentucky',
       'Tennessee', 'Wisconsin', 'Alaska', 'North Dakota', 'Nevada',
       'New Mexico', 'Mississippi', 'Rhode Island', 'Utah', 'Oregon',
       'Hawaii', 'Montana', 'Arkansas'], dtype=object)

### Split Company Overview
New columns:
- Company Size
- Industry

In [398]:
# How many ' · '-separated parts each Company Overview has, and how often each count occurs
job_df_2['Company Overview'].str.split(' · ').map(len).value_counts()

2    650
1      6
Name: Company Overview, dtype: int64

In [399]:
# Every distinct ' · '-separated fragment found in Company Overview.
# explode() flattens the per-row lists in one pass (summing the lists copies the
# growing result on every addition) and dropna() keeps the cell working when
# some overviews are missing.
job_df_2['Company Overview'].str.split(' · ').explode().dropna().unique().tolist()

['Political Organizations',
 'Administration of Justice',
 'Real Estate',
 'Spectator Sports',
 'Entertainment Providers',
 'Non-profit Organizations',
 'Environmental Services',
 'Defense and Space Manufacturing',
 'Manufacturing',
 'Higher Education',
 '1,001-5,000 employees',
 'Garrett Dobbertin is hiring for this job',
 'Food and Beverage Services',
 'Electric Power Generation',
 'Automotive',
 'Telecommunications',
 '1-10 employees',
 'Industrial Machinery Manufacturing',
 'Retail',
 'Armed Forces',
 'Education Administration Programs',
 'IT Services and IT Consulting',
 '<a class="app-aware-link " target="_self" href="https://www.linkedin.com/search/results/people/?origin=JOB_PAGE_CANNED_SEARCH&amp;currentCompany=%5B166729%5D&amp;schoolFilter=%5B157342%5D" data-test-app-aware-link="">4 school alumni</a>',
 'Venture Capital and Private Equity Principals',
 'Civic and Social Organizations',
 '51-200 employees',
 'Advertising Services',
 'Consumer Goods',
 '201-500 employees',
 'Con

In [400]:
# Keep a 'Company Overview' only when it mentions 'employees'; everything else
# (e.g. hiring banners or alumni links picked up by the scraper) is blanked out.
# na=False makes already-missing overviews count as "no match", so the mask stays
# strictly boolean and this cell can be re-run after values have been set to None.
has_company_size = job_df_2['Company Overview'].str.contains('employees', na=False)
job_df_2.loc[~has_company_size, 'Company Overview'] = None  # replace invalid Company Overview with None
job_df_2[job_df_2['Company Overview'].isna()]  # recheck

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State
5,3541502847,https://www.linkedin.com/jobs/view/3541502847,Artificial Intelligence IOS developer,,"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7",,Remote,56 minutes ago,1 applicant,Temporary,,,
105,3524205334,https://www.linkedin.com/jobs/view/3524205334,Researcher -- E 2023,USG,https://media.licdn.com/dms/image/C4D0BAQEtFVFfSbtRWA/company-logo_100_100/0/1668528042685?e=1687996800&v=beta&t=PP1d3A97jGsesUohpTQHe2XooPDPwEO32l3exXM9828,"Libertyville, IL",On-site,1 week ago,4 applicants,"5,001-10,000 employees · Wholesale Building Materials",,Libertyville,Illinois
130,3525179725,https://www.linkedin.com/jobs/view/3525179725,AI Research Engineer- Remote,Neo Cybernetica,https://media.licdn.com/dms/image/C560BAQFdt_fuOIJ5TQ/company-logo_100_100/0/1644369586489?e=1687996800&v=beta&t=48gLYFNmehIuoOD3QnQZfOUPPAg-EZncaIXIOVNLs8A,"Bedford, NH",Remote,2 weeks ago,70 applicants,11-50 employees · Software Development,,Bedford,New Hampshire


In [401]:
# Field counts again, restricted to rows that still have a 'Company Overview'.
valid_overviews = job_df_2['Company Overview'].dropna()
valid_overviews.str.split(' · ').apply(len).value_counts()

2    650
1      3
Name: Company Overview, dtype: int64

In [402]:
# Show the rows whose (non-missing) 'Company Overview' has no '·' separator,
# i.e. consists of a single field.
tmp = job_df_2.loc[job_df_2['Company Overview'].notna()]
single_field = tmp['Company Overview'].str.split('·').map(len).eq(1)
tmp[single_field]

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State
142,3511519779,https://www.linkedin.com/jobs/view/3511519779,Development Project Manger,Novin Development Corp (NDC),https://media.licdn.com/dms/image/C4D0BAQG0dL80mmXHOw/company-logo_100_100/0/1519906436626?e=1687996800&v=beta&t=Ztw1tnXM0NJh9VxYhYv0RgLAX6aInl0J5E9Ec2wegMc,San Francisco Bay Area,Hybrid,2 weeks ago,Over 200 applicants,"$150,000/yr (from job description) · Full-time",11-50 employees,San Francisco Bay Area,
513,3501141000,https://www.linkedin.com/jobs/view/3501141000,Communications and Non-Profit Management Internship Summer 2023,Institute for the Study of War,https://media.licdn.com/dms/image/C4D0BAQGQvkGlPGb5Eg/company-logo_100_100/0/1519908135181?e=1687996800&v=beta&t=7rnPeBJpC_x04uzM9IMjrKrMA3mLuK6y4VVhDp76qMo,"Washington, DC",Hybrid,1 month ago,Over 200 applicants,Internship,11-50 employees,Washington,District of Columbia
571,3515314825,https://www.linkedin.com/jobs/view/3515314825,Project Manager - Continuous Improvement,MassMEP,https://media.licdn.com/dms/image/C560BAQHYyK7bdYEOQQ/company-logo_100_100/0/1655235578188?e=1687996800&v=beta&t=Hh_pA2vyR-SmTHl01ihOdUhrbqc5EegSh6FfVp1dlj0,"Auburn, MA",Hybrid,3 weeks ago,158 applicants,"$97,000/yr · Full-time",11-50 employees,Auburn,Massachusetts


In [403]:
# Split 'Company Overview' into the new 'Company Size' and 'Industry' columns.
# n=1 caps the split at two fields and reindex() guarantees that both columns
# exist, so the assignment cannot fail on a column-count mismatch (an overview with
# an extra ' · ', or no row having a second field at all).
# The first field is always taken as the company size.
company_parts = (
    job_df_2['Company Overview']
    .str.split(' · ', n=1, expand=True)
    .reindex(columns=[0, 1])
)
job_df_2['Company Size'] = company_parts[0]
job_df_2['Industry'] = company_parts[1]
job_df_2.head()

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry
0,3530951644,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",https://media.licdn.com/dms/image/C4D0BAQHw44zVuWeBMQ/company-logo_100_100/0/1519951880616?e=1687996800&v=beta&t=9is_bVp6AbucbbnSbDUPHSA6kqgLhpUF7TYuXDA1TsA,"Sioux City, IA",On-site,1 week ago,40 applicants,Contract · Entry level,51-200 employees · Defense and Space Manufacturing,Sioux City,Iowa,51-200 employees,Defense and Space Manufacturing
1,3544009597,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,https://media.licdn.com/dms/image/C560BAQEZkh7wG0X0ew/company-logo_100_100/0/1618287716162?e=1687996800&v=beta&t=nLjiBBkms1IPvKEQf2SkcAKcopZV_ISF51Esz0TjGbY,"Charlotte, NC",On-site,3 weeks ago,,Full-time · Entry level,51-200 employees · IT Services and IT Consulting,Charlotte,North Carolina,51-200 employees,IT Services and IT Consulting
2,3541503028,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,"Atlantic City, NJ",Hybrid,3 weeks ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",Atlantic City,New Jersey,"1,001-5,000 employees",Software Development
3,3547594589,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,,Remote,27 minutes ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",,,"1,001-5,000 employees",Software Development
4,3547517166,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,https://media.licdn.com/dms/image/C4E0BAQFe-tL0qK8kxw/company-logo_100_100/0/1632140982446?e=1687996800&v=beta&t=2gBvJLsqrfKdM1f8_cG0qi5shKIGVW5As4rdpceMmpo,"Athens, GA",On-site,1 day ago,,Full-time · Mid-Senior level,"1,001-5,000 employees · IT Services and IT Consulting",Athens,Georgia,"1,001-5,000 employees",IT Services and IT Consulting


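**Note:** this positional split only works when the company size is available, because the first field is always assigned to `Company Size`.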

### Create Salary Column

In [404]:
# Scratch check of the salary regex against sample 'Job Overview' strings.
# Left commented out; uncomment one `seq` at a time to see what the pattern matches.
# salary_pattern = r"\$(\d)+.*\/(yr|hr)"
# seq = '$50,000/yr - $75,000/yr (from job description) · Full-time · Associate'
# seq = '$16.74/hr - $21.75/hr'
# seq = '$20/hr - $23/hr'
# seq = '$40/hr - $45/hr'
# result = re.search(salary_pattern, seq)
# result.group()

In [405]:
# Pull the advertised pay out of 'Job Overview' into a new 'Salary' column.
# The pattern starts at the first '$<digits>' and, because '.*' is greedy, runs to
# the last '/yr' or '/hr' in the text, so a range such as
# '$50,000/yr - $75,000/yr' is kept as one value.
salary_pattern = r"\$(\d)+.*\/(yr|hr)"

# Vectorised extraction: wrapping the pattern in an outer group makes column 0 the
# whole match. Rows without a salary, and rows whose overview is missing, get NaN
# instead of raising. (The earlier row-by-row loop guarded re.search with Selenium's
# NoSuchElementException, which re.search never raises.)
job_df_2['Salary'] = job_df_2['Job Overview'].str.extract(f"({salary_pattern})")[0]

job_df_2.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry,Salary
0,3530951644,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",https://media.licdn.com/dms/image/C4D0BAQHw44zVuWeBMQ/company-logo_100_100/0/1519951880616?e=1687996800&v=beta&t=9is_bVp6AbucbbnSbDUPHSA6kqgLhpUF7TYuXDA1TsA,"Sioux City, IA",On-site,1 week ago,40 applicants,Contract · Entry level,51-200 employees · Defense and Space Manufacturing,Sioux City,Iowa,51-200 employees,Defense and Space Manufacturing,
1,3544009597,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,https://media.licdn.com/dms/image/C560BAQEZkh7wG0X0ew/company-logo_100_100/0/1618287716162?e=1687996800&v=beta&t=nLjiBBkms1IPvKEQf2SkcAKcopZV_ISF51Esz0TjGbY,"Charlotte, NC",On-site,3 weeks ago,,Full-time · Entry level,51-200 employees · IT Services and IT Consulting,Charlotte,North Carolina,51-200 employees,IT Services and IT Consulting,
2,3541503028,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,"Atlantic City, NJ",Hybrid,3 weeks ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",Atlantic City,New Jersey,"1,001-5,000 employees",Software Development,
3,3547594589,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,,Remote,27 minutes ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",,,"1,001-5,000 employees",Software Development,
4,3547517166,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,https://media.licdn.com/dms/image/C4E0BAQFe-tL0qK8kxw/company-logo_100_100/0/1632140982446?e=1687996800&v=beta&t=2gBvJLsqrfKdM1f8_cG0qi5shKIGVW5As4rdpceMmpo,"Athens, GA",On-site,1 day ago,,Full-time · Mid-Senior level,"1,001-5,000 employees · IT Services and IT Consulting",Athens,Georgia,"1,001-5,000 employees",IT Services and IT Consulting,


### Split Job Overview

- Contract Type
- Level of Expertise

In [406]:
# List the distinct fields left in 'Job Overview' once the salary text is removed.
# Only this one column is needed, so the whole frame is not copied.
tmp = job_df_2['Job Overview'].str.replace(' (from job description)', '', regex=False)
# Drop the salary text, then trim the leftover separator characters (' ' and '·')
# from both ends of each string.
tmp = tmp.str.replace(r'\$(\d)+.*\/(yr|hr)', '', regex=True).str.strip(' · ')
# explode() flattens the per-row lists in one pass and unique() keeps
# first-appearance order; summing the lists is quadratic and fails on a missing row.
tmp.str.split(' · ').explode().dropna().unique().tolist()

['11-50 employees',
 '5,001-10,000 employees',
 'Director',
 'Contract',
 'Wholesale Building Materials',
 'Entry level',
 'Temporary',
 'Full-time',
 'Software Development',
 'Executive',
 'Associate',
 'Part-time',
 'Internship',
 'Mid-Senior level']

In [407]:
# Derive 'Contract Type' from 'Job Overview': the first known employment-type
# label that appears in the text.
contract_type_list = ['Temporary', 'Part-time', 'Full-time', 'Internship', 'Contract']
contract_type_pattern = '|'.join(contract_type_list)

# Vectorised extraction: rows with no label, or with a missing overview, get NaN
# instead of raising. (The earlier row-by-row loop guarded re.search with Selenium's
# NoSuchElementException, which re.search never raises.)
job_df_2['Contract Type'] = job_df_2['Job Overview'].str.extract(f"({contract_type_pattern})", expand=False)
job_df_2.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry,Salary,Contract Type
0,3530951644,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",https://media.licdn.com/dms/image/C4D0BAQHw44zVuWeBMQ/company-logo_100_100/0/1519951880616?e=1687996800&v=beta&t=9is_bVp6AbucbbnSbDUPHSA6kqgLhpUF7TYuXDA1TsA,"Sioux City, IA",On-site,1 week ago,40 applicants,Contract · Entry level,51-200 employees · Defense and Space Manufacturing,Sioux City,Iowa,51-200 employees,Defense and Space Manufacturing,,Contract
1,3544009597,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,https://media.licdn.com/dms/image/C560BAQEZkh7wG0X0ew/company-logo_100_100/0/1618287716162?e=1687996800&v=beta&t=nLjiBBkms1IPvKEQf2SkcAKcopZV_ISF51Esz0TjGbY,"Charlotte, NC",On-site,3 weeks ago,,Full-time · Entry level,51-200 employees · IT Services and IT Consulting,Charlotte,North Carolina,51-200 employees,IT Services and IT Consulting,,Full-time
2,3541503028,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,"Atlantic City, NJ",Hybrid,3 weeks ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",Atlantic City,New Jersey,"1,001-5,000 employees",Software Development,,Contract
3,3547594589,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,,Remote,27 minutes ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",,,"1,001-5,000 employees",Software Development,,Contract
4,3547517166,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,https://media.licdn.com/dms/image/C4E0BAQFe-tL0qK8kxw/company-logo_100_100/0/1632140982446?e=1687996800&v=beta&t=2gBvJLsqrfKdM1f8_cG0qi5shKIGVW5As4rdpceMmpo,"Athens, GA",On-site,1 day ago,,Full-time · Mid-Senior level,"1,001-5,000 employees · IT Services and IT Consulting",Athens,Georgia,"1,001-5,000 employees",IT Services and IT Consulting,,Full-time


In [408]:
# Derive 'Level of Expertise' from 'Job Overview': the first known seniority
# label that appears in the text.
exp_levels_list = ['Entry level', 'Junior', 'Mid-Senior level', 'Associate', 'Executive', 'Director']
exp_levels_pattern = '|'.join(exp_levels_list)

# Vectorised extraction: rows with no label, or with a missing overview, get NaN
# instead of raising. (The earlier row-by-row loop guarded re.search with Selenium's
# NoSuchElementException, which re.search never raises.)
job_df_2['Level of Expertise'] = job_df_2['Job Overview'].str.extract(f"({exp_levels_pattern})", expand=False)
job_df_2.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry,Salary,Contract Type,Level of Expertise
0,3530951644,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",https://media.licdn.com/dms/image/C4D0BAQHw44zVuWeBMQ/company-logo_100_100/0/1519951880616?e=1687996800&v=beta&t=9is_bVp6AbucbbnSbDUPHSA6kqgLhpUF7TYuXDA1TsA,"Sioux City, IA",On-site,1 week ago,40 applicants,Contract · Entry level,51-200 employees · Defense and Space Manufacturing,Sioux City,Iowa,51-200 employees,Defense and Space Manufacturing,,Contract,Entry level
1,3544009597,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,https://media.licdn.com/dms/image/C560BAQEZkh7wG0X0ew/company-logo_100_100/0/1618287716162?e=1687996800&v=beta&t=nLjiBBkms1IPvKEQf2SkcAKcopZV_ISF51Esz0TjGbY,"Charlotte, NC",On-site,3 weeks ago,,Full-time · Entry level,51-200 employees · IT Services and IT Consulting,Charlotte,North Carolina,51-200 employees,IT Services and IT Consulting,,Full-time,Entry level
2,3541503028,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,"Atlantic City, NJ",Hybrid,3 weeks ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",Atlantic City,New Jersey,"1,001-5,000 employees",Software Development,,Contract,Entry level
3,3547594589,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,https://media.licdn.com/dms/image/C4E0BAQGk4XHA-0aydA/company-logo_100_100/0/1553721179130?e=1687996800&v=beta&t=iLaN2mgy_v0G0TzRbyYv_laOSXWA7F7pmgATQ0ltneI,,Remote,27 minutes ago,,Contract · Entry level,"1,001-5,000 employees · Software Development",,,"1,001-5,000 employees",Software Development,,Contract,Entry level
4,3547517166,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,https://media.licdn.com/dms/image/C4E0BAQFe-tL0qK8kxw/company-logo_100_100/0/1632140982446?e=1687996800&v=beta&t=2gBvJLsqrfKdM1f8_cG0qi5shKIGVW5As4rdpceMmpo,"Athens, GA",On-site,1 day ago,,Full-time · Mid-Senior level,"1,001-5,000 employees · IT Services and IT Consulting",Athens,Georgia,"1,001-5,000 employees",IT Services and IT Consulting,,Full-time,Mid-Senior level


### Replace all None, NaN and empty strings with 'Unknown'

In [430]:
# Normalise gaps in two steps: blank strings become NaN first, so that a single
# fillna labels every kind of missing value as 'Unknown'.
blank_as_nan = job_df_2.replace('', np.nan)
job_df_2 = blank_as_nan.fillna('Unknown')

# EDA

In [427]:
# Columns carried into the EDA, in display order; the raw 'Location',
# 'Job Overview' and 'Company Overview' fields are left out in favour of the
# columns derived from them.
cols = [
    'Job URL', 'Name', 'Company',
    'Location_City', 'Location_State',
    'Workplace Type', 'Time Posted', 'Applicants Count',
    'Salary', 'Contract Type', 'Level of Expertise',
    'Company Size', 'Industry',
]
job_df_2 = job_df_2.loc[:, cols]

In [431]:
# Preview the first ten rows of the frame used for the EDA.
job_df_2.head(10)

Unnamed: 0,Job URL,Name,Company,Location_City,Location_State,Workplace Type,Time Posted,Applicants Count,Salary,Contract Type,Level of Expertise,Company Size,Industry
0,https://www.linkedin.com/jobs/view/3530951644,Data Analyst/Administrative Assistant,"Tuba Group, Inc.",Sioux City,Iowa,On-site,1 week ago,40 applicants,Unknown,Contract,Entry level,51-200 employees,Defense and Space Manufacturing
1,https://www.linkedin.com/jobs/view/3544009597,Azure Data Engineer,IT Minds LLC,Charlotte,North Carolina,On-site,3 weeks ago,Unknown,Unknown,Full-time,Entry level,51-200 employees,IT Services and IT Consulting
2,https://www.linkedin.com/jobs/view/3541503028,Big Data Developer,Diverse Lynx,Atlantic City,New Jersey,Hybrid,3 weeks ago,Unknown,Unknown,Contract,Entry level,"1,001-5,000 employees",Software Development
3,https://www.linkedin.com/jobs/view/3547594589,Data Scientist @ Remote,Diverse Lynx,Unknown,Unknown,Remote,27 minutes ago,Unknown,Unknown,Contract,Entry level,"1,001-5,000 employees",Software Development
4,https://www.linkedin.com/jobs/view/3547517166,Viral Research Scientist I,Compunnel Inc.,Athens,Georgia,On-site,1 day ago,Unknown,Unknown,Full-time,Mid-Senior level,"1,001-5,000 employees",IT Services and IT Consulting
5,https://www.linkedin.com/jobs/view/3541502847,Artificial Intelligence IOS developer,Unknown,Unknown,Unknown,Remote,56 minutes ago,1 applicant,Unknown,Temporary,Unknown,Unknown,Unknown
6,https://www.linkedin.com/jobs/view/3526821605,Data Scientist,Diverse Lynx,Santa Clara,California,Hybrid,1 week ago,11 applicants,Unknown,Contract,Entry level,"1,001-5,000 employees",Software Development
7,https://www.linkedin.com/jobs/view/3513213441,Dev10 Entry Level Data Engineer - Nationwide,Genesis10,Fort Wayne,Indiana,On-site,2 weeks ago,5 applicants,Unknown,Full-time,Entry level,"1,001-5,000 employees",IT Services and IT Consulting
8,https://www.linkedin.com/jobs/view/3532768586,68053338 - DATA BASE ANALYST,State of Florida,Tallahassee,Florida,On-site,1 week ago,4 applicants,Unknown,Part-time,Entry level,"10,001+ employees",Government Administration
9,https://www.linkedin.com/jobs/view/3529968990,Sr Project Manager - Corporate Talent Acquisition,H&amp;R Block,Kansas City,Missouri,On-site,1 week ago,25 applicants,Unknown,Full-time,Mid-Senior level,"10,001+ employees",Retail


In [432]:
# Print the column names, non-null counts and dtypes of the frame.
job_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 656 entries, 0 to 655
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job URL             656 non-null    object
 1   Name                656 non-null    object
 2   Company             656 non-null    object
 3   Location_City       656 non-null    object
 4   Location_State      656 non-null    object
 5   Workplace Type      656 non-null    object
 6   Time Posted         656 non-null    object
 7   Applicants Count    656 non-null    object
 8   Salary              656 non-null    object
 9   Contract Type       656 non-null    object
 10  Level of Expertise  656 non-null    object
 11  Company Size        656 non-null    object
 12  Industry            656 non-null    object
dtypes: object(13)
memory usage: 66.8+ KB
