---
format:
    html:
        embed-resources: true
---

# Cleaning the Scraped Jobs

This notebook manually cleans the scraped job postings and extracts structured fields from them using Python data-cleaning techniques.

In [None]:
import json
import os

# Grab location, job type, job post date, job platform, job id, job highlights,
# job descriptions, job titles, and company names.
#
# Each list receives exactly one entry per job posting, so the nine lists stay
# index-aligned. Missing scalar fields are recorded as the string 'NaN';
# missing highlights are recorded as an empty list.

locations = []
job_type = []
job_posting = []
job_platform = []
job_id = []
highlights = []
descriptions = []
titles = []
names = []

if os.path.isdir("data/"):
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the collected lists reproducible between runs.
    for filename in sorted(os.listdir("data/")):
        path = os.path.join("data/", filename)

        # Only regular files whose name starts with a digit are read
        if not (os.path.isfile(path) and filename[0].isdigit()):
            continue

        with open(path, 'r', encoding='utf-8') as search:
            data = json.load(search)

        # Tolerate files that carry no 'jobs_results' key instead of raising
        for result in data.get('jobs_results', []):
            # Schedule and posted date both live under 'detected_extensions'
            extensions = result.get('detected_extensions', {})

            locations.append(result.get('location', 'NaN'))
            job_type.append(extensions.get('schedule_type', 'NaN'))
            job_posting.append(extensions.get('posted_at', 'NaN'))
            job_platform.append(result.get('via', 'NaN'))
            job_id.append(result.get('job_id', 'NaN'))
            highlights.append(result.get('job_highlights', []))
            # Same 'NaN' placeholder as every other field (was spelled 'Nan')
            descriptions.append(result.get('description', 'NaN'))
            titles.append(result.get('title', 'NaN'))
            names.append(result.get('company_name', 'NaN'))
                    

In [72]:
# Find all job postings with qualifications
import re

# Build two lists, index-aligned with `highlights`:
#   education   - 'Degree', 'Advanced Degree', 'None', or 'NaN'
#   tech_skills - list of skill labels (['None'] when nothing matched), or 'NaN'
# 'NaN' marks postings whose first highlights section is not "Qualifications".

education = []
tech_skills = []

# (label, keywords): the label is recorded when any keyword occurs in the text
SKILL_KEYWORDS = [
    ("Python", ("python",)),
    ("SQL", ("sql",)),
    ("Java", ("java",)),
    ("Cloud", ("cloud",)),
    ("Databases", ("database", "aws", "azure", "snowflake")),
]

for each in highlights:
    if not (each and "Qualifications" in each[0].get("title", "")):
        education.append('NaN')
        tech_skills.append('NaN')
        continue

    items = each[0].get("items", [])

    # Lower-case the joined items and strip any literal \uXXXX escape text
    new_words = " ".join(items).lower()
    new_words = re.sub(r'\\u[0-9A-Fa-f]{4}', '', new_words)

    # Extract education level. The text is already lower-cased, so every
    # keyword must be lower-case too ("Ph.D"/"PhD" could never match before).
    # A bachelor mention takes precedence over an advanced-degree mention.
    if "bachelor" in new_words:
        education.append("Degree")
    elif "master" in new_words or "ph.d" in new_words or "phd" in new_words:
        education.append('Advanced Degree')
    else:
        education.append('None')

    # Extract tech skills, in the order the labels are declared above
    tech = [
        label
        for label, keywords in SKILL_KEYWORDS
        if any(keyword in new_words for keyword in keywords)
    ]
    tech_skills.append(tech if tech else ['None'])

In [74]:
# Go through Benefits
#
# benefits_list gets one entry per posting: the lower-cased, space-joined text
# of its "Benefits" highlights section, or 'NaN' when there is no such section.
# If several sections match, the last one wins.

health = []
other_ben = []

benefits_list = []

for each in highlights:
    # Reset per posting so text from a previous posting can never be reused
    benefit_text = None

    for section in each:
        if "Benefits" in section.get("title", ""):
            items = section.get("items", [])

            # Lower-case the joined items and strip any literal \uXXXX escape text
            benefit_text = " ".join(items).lower()
            benefit_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', benefit_text)

    if benefit_text is None:
        benefits_list.append('NaN')
    else:
        benefits_list.append(benefit_text)



In [None]:
# Classify each posting's benefits text by keyword. Both output lists get one
# list of labels per entry of benefits_list; ['None'] means no keyword matched
# (this includes the 'NaN' placeholder text of postings without benefits).

for item in benefits_list:
    benefits = []
    # Extract health benefits
    if "medical" in item or "health" in item:
        benefits.append("Health Insurance")
    if "dental" in item:
        benefits.append('Dental')
    if "vision" in item:
        benefits.append('Vision')
    # Deciding on the list itself avoids a separate flag; the old flag was set
    # unconditionally, so this fallback never ran and empty lists were stored.
    if not benefits:
        benefits.append('None')
    health.append(benefits)

for item in benefits_list:
    other = []
    # Extract non-health benefits
    if "life" in item:
        other.append("Life Insurance")
    if "401k" in item or "retirement" in item:
        other.append('Retirement')
    if "development" in item:
        other.append('Professional Development')
    if "training" in item or "courses" in item:
        other.append('Additional Training')
    if not other:
        other.append('None')
    other_ben.append(other)

In [89]:
# Go through Responsibilities
#
# responsibility_list gets one entry per posting: the lower-cased, space-joined
# text of its "Responsibilities" highlights section, or 'NaN' when there is no
# such section. If several sections match, the last one wins.

duties = []
responsibility_list = []

for each in highlights:
    # Reset per posting so text from a previous posting can never be reused
    responsibility_text = None

    for section in each:
        if "Responsibilities" in section.get("title", ""):
            items = section.get("items", [])

            # Lower-case the joined items and strip any literal \uXXXX escape text
            responsibility_text = " ".join(items).lower()
            responsibility_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', responsibility_text)

    if responsibility_text is None:
        responsibility_list.append('NaN')
    else:
        responsibility_list.append(responsibility_text)

In [None]:
# Classify each posting's responsibilities text by keyword. `duties` gets one
# list of labels per entry of responsibility_list; ['None'] means no keyword
# matched (this includes the 'NaN' placeholder text).

for item in responsibility_list:
    duty = []
    # Extract duties
    if "leadership" in item or "leader" in item or "lead" in item:
        duty.append("Leadership")
    if "research" in item:
        duty.append('Research')
    if "code" in item or "debug" in item:
        duty.append('Programming')
    if "machine learning" in item or "modeling" in item or "automate" in item:
        duty.append('ML')
    # Every keyword needs its own `in item` test: a bare string operand in an
    # `or` chain is always truthy and would tag every posting as 'Analysis'.
    if any(
        keyword in item
        for keyword in ("data-driven", "statistical", "data analysis", "analysis", "analyze")
    ):
        duty.append('Analysis')
    # The fallback belongs in this posting's own list
    if not duty:
        duty.append('None')
    duties.append(duty)



In [80]:
# Go through job descriptions


def _word_count(text):
    """Return the number of words in *text* after cleaning.

    Cleaning removes literal \\uXXXX escape text, collapses runs of whitespace
    to single spaces, and then drops every character that is neither a word
    character nor whitespace.
    """
    without_escapes = re.sub(r'\\u[0-9A-Fa-f]{4}', '', text)
    collapsed = " ".join(without_escapes.split())
    stripped = re.sub(r'[^\w\s]', '', collapsed)
    return len(stripped.split())


# One word count per description, in the same order as `descriptions`
count = [_word_count(text) for text in descriptions]



In [81]:
# Clean up job titles: drop every " / " separator, then collapse any run of
# whitespace into a single space.
# NOTE(review): removing " / " without a replacement joins the words on either
# side of it ("A / B" becomes "AB") - confirm this is intended.

new_titles = [
    " ".join(title.replace(" / ", '').split())
    for title in titles
]


In [None]:
# Clean up health: an empty label list becomes the 'NaN' placeholder, every
# other entry is carried over unchanged.
new_health = ['NaN' if labels == [] else labels for labels in health]

In [91]:
# Create the dataset
import pandas as pd

# Keys are the output column names, listed in the order they should appear;
# every value is one of the per-posting lists built in the cells above.
columns = {
    'Title': new_titles,
    'Company': names,
    'Location': locations,
    'Type': job_type,
    'Post_Date': job_posting,
    'Platform': job_platform,
    'Description_Words': count,
    'Education': education,
    'Technical_Skill': tech_skills,
    'Duties': duties,
    'Health_Benefit': new_health,
    'Other_Benefit': other_ben,
    'ID': job_id,
}

df = pd.DataFrame(columns)

In [92]:
# Write out CSV file, leaving out the DataFrame index column
output_path = 'data/processed-jobs-1.csv'
df.to_csv(output_path, index=False)