---
format:
    html:
        embed-resources: true
---

# Cleaning: Part-2 

Cleaning the job postings using the OpenAI API.



In [None]:
from openai import OpenAI
import json

# Read the JSON key file and keep its parsed contents in `keys`
with open('redacted') as key_file:
    keys = json.load(key_file)

# The OpenAI credential is stored under the "open_ai" entry
OPEN_KEY = keys["open_ai"]

# Single API client reused by every summarization helper in this notebook
client = OpenAI(api_key=OPEN_KEY)

In [None]:
import os

# Parallel columns: position i in every list describes the same job posting.
locations = []
job_type = []
job_posting = []
job_platform = []
job_id = []
highlights = []
descriptions = []
titles = []
names = []

# Grab location, job type, job post date, job platform, job id, job highlights, job descriptions,
# job titles, and company names

if os.path.isdir("data/"):
    # os.listdir makes no ordering promise; sorting keeps the row order
    # stable from one run to the next.
    for file in sorted(os.listdir("data/")):
        path = os.path.join("data/", file)

        # Only regular files whose name starts with a digit are read.
        if not (os.path.isfile(path) and file[0].isdigit()):
            continue

        with open(path, 'r', encoding='utf-8') as search:
            data = json.load(search)

        # A file without a 'jobs_results' key contributes no rows instead of
        # aborting the whole load with a KeyError.
        for result in data.get('jobs_results', []):
            # Schedule type and posting date both live under 'detected_extensions'.
            extensions = result.get('detected_extensions', {})

            locations.append(result.get('location', 'NaN'))
            job_type.append(extensions.get('schedule_type', 'NaN'))
            job_posting.append(extensions.get('posted_at', 'NaN'))
            job_platform.append(result.get('via', 'NaN'))

            # Appended directly so the builtin `id` is never rebound.
            job_id.append(result.get('job_id', 'NaN'))

            # Highlights are kept as the raw list of sections; a posting
            # without any gets an empty list.
            highlights.append(result.get('job_highlights', []))

            # Every missing field uses the same 'NaN' spelling. 'Nan' is not
            # one of the strings pandas.read_csv treats as missing by default,
            # so it would come back as real text when the CSV is reloaded.
            descriptions.append(result.get('description', 'NaN'))
            titles.append(result.get('title', 'NaN'))
            names.append(result.get('company_name', 'NaN'))

In [None]:
# Write function to have LLM clean education requirements

def clean_text(text):
    """Ask the chat model to summarize degree requirements in one or two words.

    Args:
        text: Text to summarize; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    system_message = {"role": "system", "content": "You are a helpful summarization assistant."}
    user_message = {
        "role": "user",
        "content": f"Summarize the degree requirements (if any) in the following text in one or two words.: \n\n{text}",
    }

    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[system_message, user_message],
    )

    reply = completion.choices[0].message.content
    return reply.strip()

In [40]:
import re

# Education column: one LLM summary per posting, or 'NaN' when the posting
# has no Qualifications section.
education = []

for each in highlights:
    # Search every highlight section, not just the first one, so a
    # Qualifications section is still found when it is not listed first
    # (the Benefits and Responsibilities sections are located the same way).
    qualifications = next(
        (section for section in each if "Qualifications" in section.get("title", "")),
        None,
    )

    if qualifications is None:
        education.append('NaN')
        continue

    items = qualifications.get("items", [])

    # Join the bullet items, lowercase them and drop literal \uXXXX escapes
    new_words = " ".join(items).lower()
    new_words = re.sub(r'\\u[0-9A-Fa-f]{4}', '', new_words)

    # Extract education level
    education.append(clean_text(new_words))

In [None]:
# Write function to have LLM clean technical skills
def tech_text(text):
    """Ask the chat model to summarize technical skill requirements in up to five words.

    Args:
        text: Text to summarize; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    system_message = {"role": "system", "content": "You are a helpful summarization assistant."}
    user_message = {
        "role": "user",
        "content": f"Summarize the technical skills requirements (if any) in the following text with up to five words.: \n\n{text}",
    }

    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[system_message, user_message],
    )

    reply = completion.choices[0].message.content
    return reply.strip()

In [36]:

# Technical_Skill column: one LLM summary per posting, or 'NaN' when the
# posting has no Qualifications section.
tech_skills = []

for each in highlights:
    # Search every highlight section, not just the first one, so a
    # Qualifications section is still found when it is not listed first
    # (the Benefits and Responsibilities sections are located the same way).
    qualifications = next(
        (section for section in each if "Qualifications" in section.get("title", "")),
        None,
    )

    if qualifications is None:
        tech_skills.append('NaN')
        continue

    items = qualifications.get("items", [])

    # Join the bullet items, lowercase them and drop literal \uXXXX escapes
    new_words = " ".join(items).lower()
    new_words = re.sub(r'\\u[0-9A-Fa-f]{4}', '', new_words)

    # Extract tech skills
    tech_skills.append(tech_text(new_words))

In [None]:
# Write function to have LLM clean benefits section
def benefits_text(text):
    """Ask the chat model to summarize health benefits in at most four words.

    Args:
        text: Text to summarize; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    system_message = {"role": "system", "content": "You are a helpful summarization assistant."}
    user_message = {
        "role": "user",
        "content": f"Summarize the health benefits (if any) in the following text with a maximum of four words.: \n\n{text}",
    }

    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[system_message, user_message],
    )

    reply = completion.choices[0].message.content
    return reply.strip()

In [49]:
# Go through Benefits

# Holds the health-benefit summaries; they are added in a later cell
health = []

# Cleaned Benefits text for each posting, or 'NaN' when it has no such section
benefits_list = []

for posting_sections in highlights:
    # Start from the placeholder and overwrite it on every match, so the last
    # section whose title mentions "Benefits" is the one that is kept
    benefit_text = 'NaN'

    for section in posting_sections:
        if "Benefits" not in section.get("title", ""):
            continue

        # Join the bullet items, lowercase them and drop literal \uXXXX escapes
        joined_items = " ".join(section.get("items", [])).lower()
        benefit_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', joined_items)

    benefits_list.append(benefit_text)

In [50]:
# Extract health benefits, one entry per posting.
# The list is rebuilt from scratch so re-running this cell cannot append a
# second copy of every row. Postings without a Benefits section keep the 'NaN'
# placeholder instead of being sent to the API, as is done for the education
# and technical-skill columns. Real benefits text is lowercased when
# benefits_list is built, so it can never equal the placeholder.
health = [
    'NaN' if item == 'NaN' else benefits_text(item)
    for item in benefits_list
]

In [None]:
# Write function to have LLM clean other non-health benefits
def nonbenefits_text(text):
    """Ask the chat model to summarize non-health benefits in three words.

    Args:
        text: Text to summarize; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    system_message = {"role": "system", "content": "You are a helpful summarization assistant."}
    user_message = {
        "role": "user",
        "content": f"Summarize the non-health related benefits (if any) in the following text with three words: \n\n{text}",
    }

    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[system_message, user_message],
    )

    reply = completion.choices[0].message.content
    return reply.strip()

In [61]:
# Extract non-health benefits, one entry per posting.
# Postings without a Benefits section keep the 'NaN' placeholder instead of
# being sent to the API, as is done for the education and technical-skill
# columns. Real benefits text is lowercased when benefits_list is built, so it
# can never equal the placeholder.
other_ben = [
    'NaN' if item == 'NaN' else nonbenefits_text(item)
    for item in benefits_list
]

In [None]:
# Write function to have LLM clean responsibilities section
def responsibility_text(text):
    """Ask the chat model to summarize job responsibilities in up to five words.

    Args:
        text: Text to summarize; it is interpolated into the user prompt.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.
    """
    system_message = {"role": "system", "content": "You are a helpful summarization assistant."}
    user_message = {
        "role": "user",
        "content": f"Summarize the responsibilities (if any) in the following text with up to five words: \n\n{text}",
    }

    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[system_message, user_message],
    )

    reply = completion.choices[0].message.content
    return reply.strip()

In [74]:
# Go through Responsibilities

# Holds the duty summaries; they are added in a later cell
new_duties = []

# Cleaned Responsibilities text for each posting, or 'NaN' when it has no such section
responsibility_list = []

for posting_sections in highlights:
    # Start from the placeholder and overwrite it on every match, so the last
    # section whose title mentions "Responsibilities" is the one that is kept
    duty_text = 'NaN'

    for section in posting_sections:
        if "Responsibilities" not in section.get("title", ""):
            continue

        # Join the bullet items, lowercase them and drop literal \uXXXX escapes
        joined_items = " ".join(section.get("items", [])).lower()
        duty_text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', joined_items)

    responsibility_list.append(duty_text)

In [77]:
# Extract duties, one entry per posting.
# The list is rebuilt from scratch so re-running this cell cannot append a
# second copy of every row. Postings without a Responsibilities section keep
# the 'NaN' placeholder instead of being sent to the API, as is done for the
# education and technical-skill columns. Real responsibilities text is
# lowercased when responsibility_list is built, so it can never equal the
# placeholder.
new_duties = [
    'NaN' if item == 'NaN' else responsibility_text(item)
    for item in responsibility_list
]

In [80]:
# Number of words in each job description, in posting order
count = []

for description in descriptions:
    # Drop literal \uXXXX escapes, then collapse every run of whitespace
    without_escapes = re.sub(r'\\u[0-9A-Fa-f]{4}', '', description)
    normalized = " ".join(without_escapes.split())

    # Remove punctuation so stand-alone symbols are not counted as words
    words_only = re.sub(r'[^\w\s]', '', normalized)

    count.append(len(words_only.split()))

In [81]:
# Clean up job titles

# Normalized job titles, in posting order.
# A spaced slash (" / ") is replaced by a single space: replacing it with
# nothing would fuse the neighbouring words ("Scientist / Analyst" ->
# "ScientistAnalyst"). The split/join then collapses any repeated or
# leading/trailing whitespace.
new_titles = [
    " ".join(title.replace(" / ", " ").split())
    for title in titles
]

In [82]:
# Create the dataset
import pandas as pd

# Map every output column to the list holding its values (one entry per
# posting); the column order of the frame follows the order given here
columns = {
    'Title': new_titles,
    'Company': names,
    'Location': locations,
    'Type': job_type,
    'Post_Date': job_posting,
    'Platform': job_platform,
    'Description_Words': count,
    'Education': education,
    'Technical_Skill': tech_skills,
    'Duties': new_duties,
    'Health_Benefit': health,
    'Other_Benefit': other_ben,
    'ID': job_id,
}

df = pd.DataFrame(columns)

In [83]:
# Save the assembled table into the data folder; index=False leaves the
# row index out of the file
output_path = 'data/processed-jobs-2.csv'
df.to_csv(output_path, index=False)