# IMPORT & SET UP

In [14]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
import urllib.parse
import os
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

pd.set_option('display.max_colwidth', None)

In [15]:
PROJECT_DIR = os.getcwd()
ID_DIR_NAME = 'job_id_dir'
INFO_DIR_NAME = 'job_info_dir'

# CLEANING

#### Load most recent DataFrame from csv file

In [16]:
JOB_INFO_PATH = os.path.join(PROJECT_DIR, INFO_DIR_NAME, (os.listdir(os.path.join(PROJECT_DIR, INFO_DIR_NAME)))[-1])

job_df = pd.read_csv(JOB_INFO_PATH, index_col=0, sep='@')
job_df.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview
0,3520044004,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,https://media.licdn.com/dms/image/C510BAQGCdThXIss7UQ/company-logo_100_100/0/1539940587971?e=1687996800&v=beta&t=1AyKLKEh0SiRwHFVslSVLJ-5LaOWsCzFqZtQ9ZvHi1o,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers"
1,3523743282,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,https://media.licdn.com/dms/image/C560BAQFahtjOdf_ETQ/company-logo_100_100/0/1542208571146?e=1687996800&v=beta&t=y-aRHU6gnrNyr6nMPswJJHlWEoEOjZyHL-a1Qs5MPFY,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing"
2,3520459813,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,https://media.licdn.com/dms/image/C4E0BAQEAkpLAgFUtpA/company-logo_100_100/0/1520994058492?e=1687996800&v=beta&t=wkHpT8VLjHRZryS_VDdpboOGmMkNRTNhMqVoDrCC2_M,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media
3,3525723554,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,https://media.licdn.com/dms/image/C560BAQHFDhBFVWfhzg/company-logo_100_100/0/1656651227123?e=1687996800&v=beta&t=YmLFUIyNIljy5hv0bST7P3wzrzdVBRln9qUpFAufG0c,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research"
4,3527821880,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,https://media.licdn.com/dms/image/C4D0BAQFPJJtAqKZSKA/company-logo_100_100/0/1566565993951?e=1687996800&v=beta&t=w4VlO0akxyqbbvH7Io6cMb3i8qShWG84zirwSGF-rDM,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services"


In [17]:
job_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885 entries, 0 to 884
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Job ID            885 non-null    int64 
 1   Job URL           885 non-null    object
 2   Name              885 non-null    object
 3   Company           883 non-null    object
 4   Company Logo URL  885 non-null    object
 5   Location          885 non-null    object
 6   Workplace Type    754 non-null    object
 7   Time Posted       778 non-null    object
 8   Applicants Count  776 non-null    object
 9   Job Overview      885 non-null    object
 10  Company Overview  885 non-null    object
dtypes: int64(1), object(10)
memory usage: 83.0+ KB


In [18]:
job_df_2 = job_df.copy() # Make a copy

### Split Location
New columns:
- Location_City
- Location_State

In [19]:
job_df['Location'].str.split(', ').map(len).value_counts()

2    776
1    104
3      5
Name: Location, dtype: int64

#### Locations that have 3 values:

- City, States, Country

In [20]:
idx = job_df[job_df['Location'].str.split(', ').map(len)==3].index.to_list()
job_df[['Location_City', 'Location_State', 'Location_Country']] = job_df.iloc[idx]['Location'].str.split(', ', expand=True)
job_df.iloc[idx, 5:]

Unnamed: 0,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Location_Country
158,"Los Angeles, California, United States",Remote,,,"$130,000/yr - $180,000/yr (from job description) · Full-time · Mid-Senior level","501-1,000 employees · Staffing and Recruiting",Los Angeles,California,United States
460,"Fort Bragg, North Carolina, United States",On-site,2 weeks ago,76 applicants,Full-time · Entry level,51-200 employees · IT Services and IT Consulting,Fort Bragg,North Carolina,United States
477,"Fort Bragg, North Carolina, United States",,2 weeks ago,1 applicant,Full-time · Entry level,11-50 employees · Defense and Space Manufacturing,Fort Bragg,North Carolina,United States
577,"La Jolla, California, United States",On-site,2 weeks ago,5 applicants,Contract · Entry level,"1,001-5,000 employees · Staffing and Recruiting",La Jolla,California,United States
691,"New York, New York, United States",,1 week ago,45 applicants,"$105,000/yr - $116,000/yr (from job description) · Full-time · Mid-Senior level","10,001+ employees · Telecommunications",New York,New York,United States


#### Locations that have 2 values:
- City, State
- State, Country

In [21]:
pattern_1 = r'(.+), (United States)'
pattern_2 = r'(.+), ([A-Z]{2})'
mask = (job_df['Location'].str.split(', ').map(len)==2)

for idx, row in job_df[mask].iterrows():
    seq = row['Location']

    location_1 = re.search(pattern_1, seq)
    location_2 = re.search(pattern_2, seq)

    if location_1:
        job_df.at[idx, 'Location_State'] = location_1.group(1)
    elif location_2:
        job_df.at[idx, 'Location_City'] = location_2.group(1)
        job_df.at[idx, 'Location_State'] = location_2.group(2)
    else:
        'Unknown'

job_df.iloc[10:15, 5:]

Unnamed: 0,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Location_Country
10,"Washington, DC",,2 weeks ago,Over 200 applicants,Contract · Entry level,201-500 employees · Information Technology &amp; Services,Washington,DC,
11,"Tampa, FL",Hybrid,2 weeks ago,83 applicants,Full-time · Mid-Senior level,"10,001+ employees · IT Services and IT Consulting",Tampa,FL,
12,United States,Remote,1 week ago,Over 200 applicants,"$72,800/yr - $109,200/yr (from job description) · Full-time · Entry level",201-500 employees · Education Management,,,
13,"Maine, United States",Hybrid,2 weeks ago,38 applicants,Full-time · Mid-Senior level,"501-1,000 employees · Technology, Information and Internet",,Maine,
14,"McLean, VA",Hybrid,2 weeks ago,113 applicants,"$58,400/yr - $133,000/yr (from job description) · Full-time","10,001+ employees · IT Services and IT Consulting",McLean,VA,


### Except Washington DC

#### Locations that have only 1 information

In [22]:
job_df[~(job_df['Location'].str.contains(r', '))]['Location'].value_counts()

United States                          69
New York City Metropolitan Area         9
San Francisco Bay Area                  6
Washington DC-Baltimore Area            3
Greater Boston                          3
Cincinnati Metropolitan Area            2
Greater Chicago Area                    2
Des Moines Metropolitan Area            2
Dallas-Fort Worth Metroplex             2
Atlanta Metropolitan Area               1
Greater Sacramento                      1
Greater Houston                         1
Greater Vancouver Metropolitan Area     1
Greater Seattle Area                    1
Los Angeles Metropolitan Area           1
Name: Location, dtype: int64

In [23]:
mask = ~(job_df['Location'].str.contains(r', ')) & (job_df['Location']!='United States')
job_df.loc[mask, 'Location_City'] = job_df[mask]['Location']
job_df[mask].iloc[:3,5:]

Unnamed: 0,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Location_Country
39,New York City Metropolitan Area,Hybrid,1 week ago,185 applicants,Contract,"10,001+ employees · Beverage Manufacturing",New York City Metropolitan Area,,
56,Washington DC-Baltimore Area,Hybrid,2 weeks ago,33 applicants,"$85,000/yr · Full-time",1-10 employees,Washington DC-Baltimore Area,,
117,Cincinnati Metropolitan Area,Hybrid,1 day ago,Over 200 applicants,Contract · Mid-Senior level,"501-1,000 employees · IT Services and IT Consulting",Cincinnati Metropolitan Area,,


### Match State Names with Abbreviation

In [24]:
# remove city col for jobs in Washington, DC
job_df.loc[job_df['Location_City']=='Washington', 'Location_City'] = None

In [25]:
states_dict = {
'Alabama':	            'AL',
'Kentucky':	            'KY',
'Ohio':	                'OH',
'Alaska':	            'AK',
'Louisiana':	        'LA',
'Oklahoma':	            'OK',
'Arizona':	            'AZ',
'Maine':	            'ME',
'Oregon':	            'OR',
'Arkansas':	            'AR',
'Maryland':	            'MD',
'Pennsylvania':	        'PA',
'American Samoa':	    'AS',
'Massachusetts':	    'MA',
'Puerto Rico':	        'PR',
'California':	        'CA',
'Michigan':	            'MI',
'Rhode Island':	        'RI',
'Colorado':	            'CO',
'Minnesota':	        'MN',
'South Carolina':	    'SC',
'Connecticut':	        'CT',
'Mississippi':	        'MS',
'South Dakota':	        'SD',
'Delaware':	            'DE',
'Missouri':	            'MO',
'Tennessee':	        'TN',
'District of Columbia':	'DC',	
'Montana':	            'MT',
'Texas':	            'TX',
'Florida':	            'FL',
'Nebraska':	            'NE',
'Trust Territories':	'TT',
'Georgia':	            'GA',
'Nevada':	            'NV',
'Utah':	                'UT',
'Guam':	                'GU',
'New Hampshire':	    'NH',
'Vermont':	            'VT',
'Hawaii':	            'HI',
'New Jersey':	        'NJ',
'Virginia':	            'VA',
'Idaho':	            'ID',
'New Mexico':	        'NM',
'Virgin Islands':	    'VI',
'Illinois':	            'IL',
'New York':	            'NY',
'Washington':	        'WA',
'Indiana':	            'IN',
'North Carolina':	    'NC',
'West Virginia':	    'WV',
'Iowa':	                'IA',
'North Dakota':	        'ND',
'Wisconsin':	        'WI',
'Kansas':	            'KS',
'Northern Mariana Islands':	'MP',	
'Wyoming':	            'WY'
}
states_dict = dict(sorted(states_dict.items()))
states_dict = dict((v, k) for k, v in states_dict.items())
states_dict

{'AL': 'Alabama',
 'AK': 'Alaska',
 'AS': 'American Samoa',
 'AZ': 'Arizona',
 'AR': 'Arkansas',
 'CA': 'California',
 'CO': 'Colorado',
 'CT': 'Connecticut',
 'DE': 'Delaware',
 'DC': 'District of Columbia',
 'FL': 'Florida',
 'GA': 'Georgia',
 'GU': 'Guam',
 'HI': 'Hawaii',
 'ID': 'Idaho',
 'IL': 'Illinois',
 'IN': 'Indiana',
 'IA': 'Iowa',
 'KS': 'Kansas',
 'KY': 'Kentucky',
 'LA': 'Louisiana',
 'ME': 'Maine',
 'MD': 'Maryland',
 'MA': 'Massachusetts',
 'MI': 'Michigan',
 'MN': 'Minnesota',
 'MS': 'Mississippi',
 'MO': 'Missouri',
 'MT': 'Montana',
 'NE': 'Nebraska',
 'NV': 'Nevada',
 'NH': 'New Hampshire',
 'NJ': 'New Jersey',
 'NM': 'New Mexico',
 'NY': 'New York',
 'NC': 'North Carolina',
 'ND': 'North Dakota',
 'MP': 'Northern Mariana Islands',
 'OH': 'Ohio',
 'OK': 'Oklahoma',
 'OR': 'Oregon',
 'PA': 'Pennsylvania',
 'PR': 'Puerto Rico',
 'RI': 'Rhode Island',
 'SC': 'South Carolina',
 'SD': 'South Dakota',
 'TN': 'Tennessee',
 'TX': 'Texas',
 'TT': 'Trust Territories',
 'UT': 

In [26]:
job_df['Location_State'] = job_df['Location_State'].apply(lambda x: states_dict[x] if x in states_dict else x)
job_df['Location_State'].unique()

array(['California', nan, 'District of Columbia', 'Massachusetts',
       'Pennsylvania', 'New Jersey', 'Illinois', 'Virginia', 'Florida',
       'Maine', 'Kansas', 'Texas', 'Colorado', 'Nevada', 'Arkansas',
       'Georgia', 'Maryland', 'New York', 'Delaware', 'Washington',
       'North Carolina', 'Connecticut', 'Rhode Island', 'Missouri',
       'South Carolina', 'Indiana', 'Iowa', 'Ohio', 'Hawaii',
       'New Hampshire', 'Alabama', 'Wisconsin', 'Oklahoma', 'Tennessee',
       'Michigan', 'Kentucky', 'Oregon', 'Idaho', 'Minnesota', 'Vermont',
       'Nebraska', 'Arizona', 'South Dakota', 'Louisiana', 'Utah',
       'Mississippi', 'New Mexico', 'Montana'], dtype=object)

In [27]:
job_df = job_df.drop(columns=['Location_Country'])

#### Fill 'Remote' location for Remote jobs

In [28]:
job_df.loc[(job_df['Workplace Type']=='Remote'), 'Location_City'] = 'Remote'
job_df.loc[(job_df['Workplace Type']=='Remote'), 'Location_State'] = 'Remote'

In [29]:
job_df.iloc[10:15,5:] # Recheck

Unnamed: 0,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State
10,"Washington, DC",,2 weeks ago,Over 200 applicants,Contract · Entry level,201-500 employees · Information Technology &amp; Services,,District of Columbia
11,"Tampa, FL",Hybrid,2 weeks ago,83 applicants,Full-time · Mid-Senior level,"10,001+ employees · IT Services and IT Consulting",Tampa,Florida
12,United States,Remote,1 week ago,Over 200 applicants,"$72,800/yr - $109,200/yr (from job description) · Full-time · Entry level",201-500 employees · Education Management,Remote,Remote
13,"Maine, United States",Hybrid,2 weeks ago,38 applicants,Full-time · Mid-Senior level,"501-1,000 employees · Technology, Information and Internet",,Maine
14,"McLean, VA",Hybrid,2 weeks ago,113 applicants,"$58,400/yr - $133,000/yr (from job description) · Full-time","10,001+ employees · IT Services and IT Consulting",McLean,Virginia


### Split Company Overview
New columns:
- Company Size
- Industry

In [30]:
job_df['Company Overview'].str.split(' · ').map(len).value_counts()

2    844
1     41
Name: Company Overview, dtype: int64

In [31]:
list(set(job_df['Company Overview'].str.split(' · ').values.sum()))

['201-500 employees',
 'Non-profit Organizations',
 'Book and Periodical Publishing',
 'Farming',
 'Food and Beverage Manufacturing',
 '51-200 employees',
 '10,001+ employees',
 'Semiconductors',
 'Law Enforcement',
 'Environmental Services',
 'Human Resources Services',
 'Professional Services',
 'Truck Transportation',
 'Medical Equipment Manufacturing',
 'Investment Management',
 'Legal Services',
 'Medical Practices',
 'Technology, Information and Internet',
 'See how you compare to 92 applicants.<span class="white-space-pre"> </span><a class="app-aware-link " target="_self" href="https://www.linkedin.com/premium/products/?family=JSS&amp;upsellOrderOrigin=premium_job_details_summary_card&amp;utype=job" data-test-app-aware-link="">Try Premium for free</a>',
 'Airlines and Aviation',
 'Biotechnology',
 'Motor Vehicle Manufacturing',
 'Data Infrastructure and Analytics',
 'Information Services',
 'Consumer Services',
 'Advertising Services',
 'Hospitals and Health Care',
 'Greg M. is 

#### Remove invalid Company Overview values

In [32]:
job_df.loc[~job_df['Company Overview'].str.contains('employees')]

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State
534,3525735059,https://www.linkedin.com/jobs/view/3525735059,Data Engineer,,"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7","Dallas, TX",Hybrid,1 week ago,92 applicants,Full-time · Mid-Senior level,"See how you compare to 92 applicants.<span class=""white-space-pre""> </span><a class=""app-aware-link "" target=""_self"" href=""https://www.linkedin.com/premium/products/?family=JSS&amp;upsellOrderOrigin=premium_job_details_summary_card&amp;utype=job"" data-test-app-aware-link="""">Try Premium for free</a>",Dallas,Texas
603,3525016432,https://www.linkedin.com/jobs/view/3525016432,Project Manager,,"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7","Stuart, FL",Hybrid,2 weeks ago,44 applicants,Full-time,Greg M. is hiring for this job,Stuart,Florida


In [33]:
job_df.loc[~job_df['Company Overview'].str.contains('employees'), 'Company Overview'] = 'Unknown' # replace invalid Company Overview with Unknown
job_df[job_df['Company Overview'] == 'Unknown'] # recheck

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State
534,3525735059,https://www.linkedin.com/jobs/view/3525735059,Data Engineer,,"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7","Dallas, TX",Hybrid,1 week ago,92 applicants,Full-time · Mid-Senior level,Unknown,Dallas,Texas
603,3525016432,https://www.linkedin.com/jobs/view/3525016432,Project Manager,,"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7","Stuart, FL",Hybrid,2 weeks ago,44 applicants,Full-time,Unknown,Stuart,Florida


#### Split method 1: regex

In [34]:
# seq = '1-10 employees'
# seq = '5,001-10,000 employees'
# seq = '10,001+ employees · Entertainment Providers'
# seq = '201-500 employees · Online Audio and Video Media'
# pattern = r"(\d+,?\d*\+?[\-?\d+,?\d+]* employees)( · )?(.+)*"
# re.search(pattern, seq)

In [35]:
pattern = r"(\d+,?\d*\+?[\-?\d+,?\d+]* employees)( · )?(.+)*"

for idx, row in job_df.iterrows():
    seq = row['Company Overview']
    company_overview = re.search(pattern, seq)
    job_df.at[idx, 'Company Size'] = company_overview.group(1) if company_overview is not None else 'Unknown'
    job_df.at[idx, 'Industry'] = company_overview.group(3) if company_overview is not None else 'Unknown'

job_df.iloc[:10, 5:]

Unnamed: 0,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry
0,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers",San Jose,California,"10,001+ employees",Entertainment Providers
1,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing",Remote,Remote,"10,001+ employees",Manufacturing
2,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media,Remote,Remote,201-500 employees,Online Audio and Video Media
3,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research",Milford,Massachusetts,"5,001-10,000 employees",Biotechnology Research
4,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services",Kennett Square,Pennsylvania,"501-1,000 employees",Financial Services
5,"Basking Ridge, NJ",,2 days ago,25 applicants,Full-time,"1,001-5,000 employees · Pharmaceutical Manufacturing",Basking Ridge,New Jersey,"1,001-5,000 employees",Pharmaceutical Manufacturing
6,"Chicago, IL",Hybrid,2 weeks ago,Over 200 applicants,"$85,000/yr - $130,000/yr · Full-time · Mid-Senior level",11-50 employees · Staffing and Recruiting,Chicago,Illinois,11-50 employees,Staffing and Recruiting
7,"Washington, DC",Hybrid,2 weeks ago,Over 200 applicants,Full-time · Associate,51-200 employees · Defense and Space Manufacturing,,District of Columbia,51-200 employees,Defense and Space Manufacturing
8,"McLean, VA",Hybrid,1 week ago,67 applicants,Full-time · Mid-Senior level,"1,001-5,000 employees · Staffing and Recruiting",McLean,Virginia,"1,001-5,000 employees",Staffing and Recruiting
9,"Indianola, PA",Hybrid,2 weeks ago,43 applicants,"$120,000/yr · Contract · Mid-Senior level","10,001+ employees · Chemical Manufacturing",Indianola,Pennsylvania,"10,001+ employees",Chemical Manufacturing


#### Split method 2: .split()

In [36]:
# mask = job_df[job_df['Company Overview'].str.contains('employees')]
# job_df[['Company Size', 'Industry']] = job_df['Company Overview'].str.split(' · ', expand=True)
# job_df.head()

*Note: This method only works when Company size is available in Company Overview*

### Split Job Overview

- Salary
- Workplace Type
- Level of Expertise

#### Create Salary Column

In [37]:
# salary_pattern = r"\$(\d)+.*\/(yr|hr)"
# seq = '$50,000/yr - $75,000/yr (from job description) · Full-time · Associate'
# seq = '$16.74/hr - $21.75/hr'
# seq = '$20/hr - $23/hr'
# seq = '$40/hr - $45/hr'
# result = re.search(salary_pattern, seq)
# result.group()

In [38]:
salary_pattern = r"\$(\d)+.*\/(yr|hr)"

for idx, row in job_df.iterrows():
    job_overview = row['Job Overview']
    try:
        salary = re.search(salary_pattern, job_overview)
        job_df.at[idx, 'Salary'] = salary.group() if salary is not None else 'Unknown'
    except NoSuchElementException:
        pass

job_df.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry,Salary
0,3520044004,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,https://media.licdn.com/dms/image/C510BAQGCdThXIss7UQ/company-logo_100_100/0/1539940587971?e=1687996800&v=beta&t=1AyKLKEh0SiRwHFVslSVLJ-5LaOWsCzFqZtQ9ZvHi1o,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers",San Jose,California,"10,001+ employees",Entertainment Providers,Unknown
1,3523743282,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,https://media.licdn.com/dms/image/C560BAQFahtjOdf_ETQ/company-logo_100_100/0/1542208571146?e=1687996800&v=beta&t=y-aRHU6gnrNyr6nMPswJJHlWEoEOjZyHL-a1Qs5MPFY,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing",Remote,Remote,"10,001+ employees",Manufacturing,Unknown
2,3520459813,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,https://media.licdn.com/dms/image/C4E0BAQEAkpLAgFUtpA/company-logo_100_100/0/1520994058492?e=1687996800&v=beta&t=wkHpT8VLjHRZryS_VDdpboOGmMkNRTNhMqVoDrCC2_M,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media,Remote,Remote,201-500 employees,Online Audio and Video Media,$16/hr - $21/hr
3,3525723554,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,https://media.licdn.com/dms/image/C560BAQHFDhBFVWfhzg/company-logo_100_100/0/1656651227123?e=1687996800&v=beta&t=YmLFUIyNIljy5hv0bST7P3wzrzdVBRln9qUpFAufG0c,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research",Milford,Massachusetts,"5,001-10,000 employees",Biotechnology Research,Unknown
4,3527821880,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,https://media.licdn.com/dms/image/C4D0BAQFPJJtAqKZSKA/company-logo_100_100/0/1566565993951?e=1687996800&v=beta&t=w4VlO0akxyqbbvH7Io6cMb3i8qShWG84zirwSGF-rDM,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services",Kennett Square,Pennsylvania,"501-1,000 employees",Financial Services,Unknown


#### Create Workplace Type & Level of Expertise columns

In [39]:
tmp = job_df.copy()
tmp = tmp['Job Overview'].str.replace(' (from job description)', '', regex=False)
tmp = tmp.str.replace(r'\$(\d)+.*\/(yr|hr)', '', regex=True).str.strip(' · ')
list(set(tmp.str.split(' · ').values.sum()))

['Mid-Senior level',
 'Temporary',
 'Internship',
 'Part-time',
 '$6,597/month - $7,431/month',
 'Executive',
 '$5,380/month - $7,372/month',
 'Entry level',
 'Contract',
 'Director',
 'Associate',
 '$5,250/month - $5,583/month',
 'Full-time']

In [40]:
contract_type_list = ['Temporary', 'Part-time', 'Full-time', 'Internship', 'Contract']
contract_type_pattern = '|'.join(contract_type_list)

for idx, row in job_df.iterrows():
    job_overview = row['Job Overview']
    try:
        contract_type = re.search(contract_type_pattern, job_overview)
        job_df.at[idx, 'Contract Type'] = contract_type.group() if contract_type is not None else 'Unknown'
    except NoSuchElementException:
        pass
job_df.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry,Salary,Contract Type
0,3520044004,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,https://media.licdn.com/dms/image/C510BAQGCdThXIss7UQ/company-logo_100_100/0/1539940587971?e=1687996800&v=beta&t=1AyKLKEh0SiRwHFVslSVLJ-5LaOWsCzFqZtQ9ZvHi1o,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers",San Jose,California,"10,001+ employees",Entertainment Providers,Unknown,Full-time
1,3523743282,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,https://media.licdn.com/dms/image/C560BAQFahtjOdf_ETQ/company-logo_100_100/0/1542208571146?e=1687996800&v=beta&t=y-aRHU6gnrNyr6nMPswJJHlWEoEOjZyHL-a1Qs5MPFY,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing",Remote,Remote,"10,001+ employees",Manufacturing,Unknown,Full-time
2,3520459813,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,https://media.licdn.com/dms/image/C4E0BAQEAkpLAgFUtpA/company-logo_100_100/0/1520994058492?e=1687996800&v=beta&t=wkHpT8VLjHRZryS_VDdpboOGmMkNRTNhMqVoDrCC2_M,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media,Remote,Remote,201-500 employees,Online Audio and Video Media,$16/hr - $21/hr,Internship
3,3525723554,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,https://media.licdn.com/dms/image/C560BAQHFDhBFVWfhzg/company-logo_100_100/0/1656651227123?e=1687996800&v=beta&t=YmLFUIyNIljy5hv0bST7P3wzrzdVBRln9qUpFAufG0c,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research",Milford,Massachusetts,"5,001-10,000 employees",Biotechnology Research,Unknown,Internship
4,3527821880,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,https://media.licdn.com/dms/image/C4D0BAQFPJJtAqKZSKA/company-logo_100_100/0/1566565993951?e=1687996800&v=beta&t=w4VlO0akxyqbbvH7Io6cMb3i8qShWG84zirwSGF-rDM,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services",Kennett Square,Pennsylvania,"501-1,000 employees",Financial Services,Unknown,Full-time


In [41]:
exp_levels_list = ['Entry level', 'Junior', 'Mid-Senior level', 'Associate', 'Executive', 'Director']
exp_levels_pattern = '|'.join(exp_levels_list)

for idx, row in job_df.iterrows():
    job_overview = row['Job Overview']
    try:
        exp_level = re.search(exp_levels_pattern, job_overview)
        job_df.at[idx, 'Level of Expertise'] = exp_level.group() if exp_level is not None else 'Unknown'
    except NoSuchElementException:
        pass
job_df.head(5)

Unnamed: 0,Job ID,Job URL,Name,Company,Company Logo URL,Location,Workplace Type,Time Posted,Applicants Count,Job Overview,Company Overview,Location_City,Location_State,Company Size,Industry,Salary,Contract Type,Level of Expertise
0,3520044004,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,https://media.licdn.com/dms/image/C510BAQGCdThXIss7UQ/company-logo_100_100/0/1539940587971?e=1687996800&v=beta&t=1AyKLKEh0SiRwHFVslSVLJ-5LaOWsCzFqZtQ9ZvHi1o,"San Jose, CA",Hybrid,2 weeks ago,198 applicants,Full-time,"10,001+ employees · Entertainment Providers",San Jose,California,"10,001+ employees",Entertainment Providers,Unknown,Full-time,Unknown
1,3523743282,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,https://media.licdn.com/dms/image/C560BAQFahtjOdf_ETQ/company-logo_100_100/0/1542208571146?e=1687996800&v=beta&t=y-aRHU6gnrNyr6nMPswJJHlWEoEOjZyHL-a1Qs5MPFY,United States,Remote,2 weeks ago,Over 200 applicants,Full-time · Mid-Senior level,"10,001+ employees · Manufacturing",Remote,Remote,"10,001+ employees",Manufacturing,Unknown,Full-time,Mid-Senior level
2,3520459813,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,https://media.licdn.com/dms/image/C4E0BAQEAkpLAgFUtpA/company-logo_100_100/0/1520994058492?e=1687996800&v=beta&t=wkHpT8VLjHRZryS_VDdpboOGmMkNRTNhMqVoDrCC2_M,"Washington, DC",Remote,,,$16/hr - $21/hr (from job description) · Internship · Internship,201-500 employees · Online Audio and Video Media,Remote,Remote,201-500 employees,Online Audio and Video Media,$16/hr - $21/hr,Internship,Unknown
3,3525723554,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,https://media.licdn.com/dms/image/C560BAQHFDhBFVWfhzg/company-logo_100_100/0/1656651227123?e=1687996800&v=beta&t=YmLFUIyNIljy5hv0bST7P3wzrzdVBRln9qUpFAufG0c,"Milford, MA",On-site,1 week ago,Over 200 applicants,Internship · Internship,"5,001-10,000 employees · Biotechnology Research",Milford,Massachusetts,"5,001-10,000 employees",Biotechnology Research,Unknown,Internship,Unknown
4,3527821880,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,https://media.licdn.com/dms/image/C4D0BAQFPJJtAqKZSKA/company-logo_100_100/0/1566565993951?e=1687996800&v=beta&t=w4VlO0akxyqbbvH7Io6cMb3i8qShWG84zirwSGF-rDM,"Kennett Square, PA",On-site,2 weeks ago,104 applicants,Full-time · Entry level,"501-1,000 employees · Financial Services",Kennett Square,Pennsylvania,"501-1,000 employees",Financial Services,Unknown,Full-time,Entry level


### Replace all None and emptry strings with Unknown

In [42]:
job_df = job_df.replace('', np.nan).fillna('Unknown')

# EDA

In [43]:
cols = ['Job URL', 'Name', 'Company', 'Location', 'Location_City', 'Location_State',
       'Workplace Type', 'Time Posted', 'Applicants Count',
        'Salary', 'Contract Type', 'Level of Expertise',
       'Company Size', 'Industry']
df = job_df[cols]

In [44]:
df.head(10)

Unnamed: 0,Job URL,Name,Company,Location,Location_City,Location_State,Workplace Type,Time Posted,Applicants Count,Salary,Contract Type,Level of Expertise,Company Size,Industry
0,https://www.linkedin.com/jobs/view/3520044004,"Data Scientist, Research",TikTok,"San Jose, CA",San Jose,California,Hybrid,2 weeks ago,198 applicants,Unknown,Full-time,Unknown,"10,001+ employees",Entertainment Providers
1,https://www.linkedin.com/jobs/view/3523743282,Global Data Scientist,Kimberly-Clark,United States,Remote,Remote,Remote,2 weeks ago,Over 200 applicants,Unknown,Full-time,Mid-Senior level,"10,001+ employees",Manufacturing
2,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,"Washington, DC",Remote,Remote,Remote,Unknown,Unknown,$16/hr - $21/hr,Internship,Unknown,201-500 employees,Online Audio and Video Media
3,https://www.linkedin.com/jobs/view/3525723554,Data Scientist Solution Specialist- IT Internship,Waters Corporation,"Milford, MA",Milford,Massachusetts,On-site,1 week ago,Over 200 applicants,Unknown,Internship,Unknown,"5,001-10,000 employees",Biotechnology Research
4,https://www.linkedin.com/jobs/view/3527821880,Data Engineer,Chatham Financial,"Kennett Square, PA",Kennett Square,Pennsylvania,On-site,2 weeks ago,104 applicants,Unknown,Full-time,Entry level,"501-1,000 employees",Financial Services
5,https://www.linkedin.com/jobs/view/3545964641,"Programmer, Data Analysis, Epidemiology Analytics","Daiichi Sankyo, Inc.","Basking Ridge, NJ",Basking Ridge,New Jersey,Unknown,2 days ago,25 applicants,Unknown,Full-time,Unknown,"1,001-5,000 employees",Pharmaceutical Manufacturing
6,https://www.linkedin.com/jobs/view/3520729278,Investment Data Analyst,Westbourne Partners,"Chicago, IL",Chicago,Illinois,Hybrid,2 weeks ago,Over 200 applicants,"$85,000/yr - $130,000/yr",Full-time,Mid-Senior level,11-50 employees,Staffing and Recruiting
7,https://www.linkedin.com/jobs/view/3516637194,"Data Scientist, DC",Rhombus Power Inc.,"Washington, DC",Unknown,District of Columbia,Hybrid,2 weeks ago,Over 200 applicants,Unknown,Full-time,Associate,51-200 employees,Defense and Space Manufacturing
8,https://www.linkedin.com/jobs/view/3527065467,Data Operations Engineer / Business Operations Analyst,US Tech Solutions,"McLean, VA",McLean,Virginia,Hybrid,1 week ago,67 applicants,Unknown,Full-time,Mid-Senior level,"1,001-5,000 employees",Staffing and Recruiting
9,https://www.linkedin.com/jobs/view/3522212905,Medical Software Quality Engineer - $120k/yr,Bayer,"Indianola, PA",Indianola,Pennsylvania,Hybrid,2 weeks ago,43 applicants,"$120,000/yr",Contract,Mid-Senior level,"10,001+ employees",Chemical Manufacturing


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 885 entries, 0 to 884
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Job URL             885 non-null    object
 1   Name                885 non-null    object
 2   Company             885 non-null    object
 3   Location            885 non-null    object
 4   Location_City       885 non-null    object
 5   Location_State      885 non-null    object
 6   Workplace Type      885 non-null    object
 7   Time Posted         885 non-null    object
 8   Applicants Count    885 non-null    object
 9   Salary              885 non-null    object
 10  Contract Type       885 non-null    object
 11  Level of Expertise  885 non-null    object
 12  Company Size        885 non-null    object
 13  Industry            885 non-null    object
dtypes: object(14)
memory usage: 136.0+ KB


#### Filter out Jobs relevant to Data Analyst

In [46]:
# r"data|bi|analyst|analytics|machine learning|ml|ai|intelligence"
da_df = df[df.Name.str.contains(r"data analyst|analyst|analytics", case=False, regex=True)]
display(da_df.head())
print(f'Found {da_df.shape[0]} jobs relavant to data analyst')

Unnamed: 0,Job URL,Name,Company,Location,Location_City,Location_State,Workplace Type,Time Posted,Applicants Count,Salary,Contract Type,Level of Expertise,Company Size,Industry
2,https://www.linkedin.com/jobs/view/3520459813,Data Analytics Intern (Summer 2023),Industry Dive,"Washington, DC",Remote,Remote,Remote,Unknown,Unknown,$16/hr - $21/hr,Internship,Unknown,201-500 employees,Online Audio and Video Media
5,https://www.linkedin.com/jobs/view/3545964641,"Programmer, Data Analysis, Epidemiology Analytics","Daiichi Sankyo, Inc.","Basking Ridge, NJ",Basking Ridge,New Jersey,Unknown,2 days ago,25 applicants,Unknown,Full-time,Unknown,"1,001-5,000 employees",Pharmaceutical Manufacturing
6,https://www.linkedin.com/jobs/view/3520729278,Investment Data Analyst,Westbourne Partners,"Chicago, IL",Chicago,Illinois,Hybrid,2 weeks ago,Over 200 applicants,"$85,000/yr - $130,000/yr",Full-time,Mid-Senior level,11-50 employees,Staffing and Recruiting
8,https://www.linkedin.com/jobs/view/3527065467,Data Operations Engineer / Business Operations Analyst,US Tech Solutions,"McLean, VA",McLean,Virginia,Hybrid,1 week ago,67 applicants,Unknown,Full-time,Mid-Senior level,"1,001-5,000 employees",Staffing and Recruiting
12,https://www.linkedin.com/jobs/view/3532901532,Analyst,TNTP,United States,Remote,Remote,Remote,1 week ago,Over 200 applicants,"$72,800/yr - $109,200/yr",Full-time,Entry level,201-500 employees,Education Management


Found 237 jobs relavant to data analyst


#### Entry level jobs

In [48]:
da_df[da_df['Level of Expertise']=='Entry level'].head()

Unnamed: 0,Job URL,Name,Company,Location,Location_City,Location_State,Workplace Type,Time Posted,Applicants Count,Salary,Contract Type,Level of Expertise,Company Size,Industry
12,https://www.linkedin.com/jobs/view/3532901532,Analyst,TNTP,United States,Remote,Remote,Remote,1 week ago,Over 200 applicants,"$72,800/yr - $109,200/yr",Full-time,Entry level,201-500 employees,Education Management
27,https://www.linkedin.com/jobs/view/3542549318,Staff HR Data Analyst,Ridgeline,"Incline Village, NV",Incline Village,Nevada,Hybrid,2 days ago,4 applicants,"$130,000/yr - $159,000/yr",Full-time,Entry level,201-500 employees,Software Development
37,https://www.linkedin.com/jobs/view/3527860425,Analyst,Greystone,"New York, NY",New York,New York,On-site,2 weeks ago,Over 200 applicants,"$80,000/yr - $90,000/yr",Full-time,Entry level,"1,001-5,000 employees",Financial Services
52,https://www.linkedin.com/jobs/view/3531482679,Data Analyst,Baker Hughes,"Stafford, TX",Stafford,Texas,On-site,14 hours ago,Over 200 applicants,Unknown,Full-time,Entry level,"10,001+ employees",Oil and Gas
89,https://www.linkedin.com/jobs/view/3539667050,Quality Data Analyst,Zobility,"Toledo, OH",Toledo,Ohio,Unknown,2 days ago,1 applicant,Unknown,Contract,Entry level,201-500 employees,Staffing and Recruiting
