In [4]:
from openai import OpenAI
import json
import pandas as pd
import time
import os

#### Need API key

In [5]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook. The key that used to be hardcoded here has been exposed to anyone
# who can read this file and should be revoked and replaced.
# Raises KeyError if GEMINI_API_KEY is not set, so a missing key fails loudly.
client = OpenAI(
    api_key=os.environ["GEMINI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

#### Load in the dataset, jobs_clean

In [6]:
# Read the cleaned job postings and show how many rows were loaded.
df = pd.read_csv("data/jobs_clean.csv")
df.shape[0]

35604

#### Main Loop

The idea here is to concatenate several JDs and send them in a single API call, so that we stay under the API's request limit.

Process 20 JDs per API call. We cannot put too many JDs into one call, because the LLM becomes more likely to hallucinate.

This needs OpenAI structured outputs with a custom response format.

In [7]:
# System prompt sent with every batch. Each user message holds several job
# descriptions labelled "Job 1", "Job 2", ...; summaries are matched back to
# job ids purely by position, so the prompt asks for one entry per job in order.
system_message = '''
You are to summarize the following features concisely from each job description: "Core Responsibilities", "Required Skills", "Educational Requirements", "Experience Level", "Preferred Qualifications", "Compensation and Benefits".

The input contains several job descriptions, labelled "Job 1", "Job 2", and so on. Return exactly one entry per job, in the same order as the input.

Please return the output in the following format:

{
  "Core Responsibilities": "Summarize the core responsibilities of the job here",
  "Required Skills": "Summarize the required skills here",
  "Educational Requirements": "Summarize the educational requirements",
  "Experience Level": "Summarize the experience level in years, if none put N/A",
  "Preferred Qualifications": "Summarize the preferred qualifications",
  "Compensation and Benefits": "Summarize the monthly or hourly salary, if none put N/A"
}
return the output as a string, not markdown

'''

In [8]:
from pydantic import BaseModel
from typing import List
class MessageInfo(BaseModel):
    """Structured summary the LLM returns for a single job description.

    The fields mirror the six features listed in ``system_message``.
    ``Core_Responsibilities`` was requested by the prompt but missing from the
    schema, so it could never appear in the structured output; it is declared
    here so that it is captured. JSON files written before this field existed
    will not contain the key.
    """
    Core_Responsibilities: str
    Required_Skills: str
    Educational_Requirements: str
    Experience_Level: str
    Preferred_Qualifications: str
    Compensation_and_Benefits: str

class Messages(BaseModel):
    """Response envelope: one ``MessageInfo`` per job description in the batch."""
    messages: list[MessageInfo]

class BatchProcess():
    """Summarise job descriptions in batches through an LLM and save each batch as JSON.

    For every batch the class concatenates several raw job descriptions from
    ``df`` into one prompt, sends it to the LLM, re-attaches the job ids to the
    returned summaries and writes the result to
    ``<output_dir>/batch_<batch_no>_row_<last_row>_extracted_jd.json``.
    """

    def __init__(self, df:pd.DataFrame, start_index: int, batch_start_number:int, system_message:str, output_dir: str = "extracted_jds"):
        """
        Store the inputs; no work is done until ``process`` is called.

        Args:
            df: dataframe with at least the columns ``job_id`` and ``description``.
            start_index: positional row index of ``df`` at which to START.
            batch_start_number: batch number to give the first batch written.
            system_message: system prompt sent with every LLM request.
            output_dir: directory the per-batch JSON files are written to.
        """
        self.df = df
        self.start_index = start_index #index of the dataframe to START
        self.batch_start_number = batch_start_number #batch number to START
        self.system_message = system_message
        self.output_dir = output_dir

    def process(self, no_of_batches:int, batch_size:int, sleep_seconds: float = 10) -> None:
        """
        Run up to ``no_of_batches`` batches, starting at ``self.start_index``.

        ``self.start_index`` and ``self.batch_start_number`` are advanced after
        every batch that was written successfully, so if a batch fails the
        object can simply be asked to ``process`` again and it resumes at the
        failed batch with the right batch number. Processing stops early once
        every row of the dataframe has been consumed.

        Args:
            no_of_batches: maximum number of batches to run in this call.
            batch_size: number of job descriptions per LLM request.
            sleep_seconds: pause after each batch (presumably to stay under
                the API rate limit); values <= 0 disable the pause.

        Raises:
            ValueError: if the LLM returns a different number of summaries
                than job descriptions were sent.
        """
        for _ in range(no_of_batches):
            if self.start_index >= len(self.df):
                break  # nothing left to process

            batch_no = self.batch_start_number

            messages, job_ids, checkpoint = self.prepare_llm_inputs(self.df, self.start_index, batch_size) #process before llm
            llm_structured_output = self.get_structured_output_from_llm(self.system_message, messages) #pass to llm, get responses

            # Summaries are matched to job ids by position, so the counts must agree.
            received = len(llm_structured_output.messages)
            if received != len(job_ids):
                raise ValueError(f"Error: Expected {len(job_ids)} summaries for batch {batch_no}, but got {received}.")

            batch_jds = self.append_jb_ids(job_ids, llm_structured_output) #append job ids to the response

            self.write_to_file(batch_jds, checkpoint, batch_no) #write to json for the batch

            # Commit progress only after the batch is safely on disk.
            self.start_index = checkpoint
            self.batch_start_number = batch_no + 1

            if sleep_seconds > 0:
                time.sleep(sleep_seconds)

    def prepare_llm_inputs(self, df: pd.DataFrame, checkpoint:int, batch_size:int) -> tuple[str, List[int], int]:
        """
        Build one prompt string from the rows ``checkpoint`` .. ``checkpoint + batch_size - 1``.

        If fewer than ``batch_size`` rows remain, the remaining rows form a
        shorter final batch instead of raising an out-of-bounds error.

        Returns:
            A tuple ``(messages, job_ids, next_checkpoint)``: the concatenated
            prompt, the job ids in prompt order, and the positional index of
            the first row that was NOT included.

        Raises:
            ValueError: if ``batch_size`` is not positive or no rows are left
                at ``checkpoint``.
        """
        if batch_size <= 0:
            raise ValueError(f"Error: batch_size must be positive, but got {batch_size}.")

        batch = df.iloc[checkpoint:checkpoint + batch_size]
        if batch.empty:
            raise ValueError(f"Error: No rows left to process from index {checkpoint} (dataframe has {len(df)} rows).")

        job_ids = [int(job_id) for job_id in batch['job_id']]
        # Jobs are numbered from 1 inside the prompt, independent of their row index.
        messages = "".join(
            f"Job {position}: \n{raw_jd.strip()}\n\n"
            for position, raw_jd in enumerate(batch['description'], start=1)
        )

        return (messages, job_ids, checkpoint + len(job_ids))

    def get_structured_output_from_llm(self, system_message: str, user_message:str) -> "Messages":
        """
        Send the prompt to Gemini and return the parsed ``Messages`` object.

        Uses the module-level ``client``.

        Raises:
            ValueError: if the response carries no parsed content (``parsed``
                is None, e.g. when the model refuses to answer).
        """
        response = client.beta.chat.completions.parse(
            model="gemini-2.0-flash-lite",
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": user_message
                }
            ],
            response_format=Messages,
        )
        parsed = response.choices[0].message.parsed
        if parsed is None:
            raise ValueError("Error: the LLM response could not be parsed into Messages (parsed is None).")
        return parsed

    def append_jb_ids(self, job_ids:List, messages_list: "Messages") -> List:
        """
        Turn each summary into a dict and add the matching ``job_id``.

        Summaries and job ids are paired by position.
        """
        return [
            {**message.model_dump(), 'job_id': job_ids[idx]}
            for idx, message in enumerate(messages_list.messages)
        ]

    def write_to_file(self, batch_jds:List, row_checkpoint, batch_no) -> None:
        """
        Write one batch of summaries to a JSON file in ``self.output_dir``.

        ``row_checkpoint`` is the index of the first row after the batch, so
        the file name carries ``row_checkpoint - 1``: the last row included.
        """
        row_checkpoint -= 1
        dir_name = self.output_dir
        os.makedirs(dir_name, exist_ok=True)

        filename = f"{dir_name}/batch_{batch_no}_row_{row_checkpoint}_extracted_jd.json"

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(batch_jds, f, indent=4)


In [9]:
# Resume at dataframe row 640, numbering the output files from batch 33.
batch_process = BatchProcess(
    df=df,
    start_index=640,
    batch_start_number=33,
    system_message=system_message,
)

In [12]:
# Run 10 batches of 20 job descriptions each (one LLM request per batch).
batch_process.process(no_of_batches=10, batch_size=20)

In [20]:
# Show the job id and the full description of the row at position 1259.
print(df.iloc[1259]['job_id'], df.iloc[1259]['description'], sep="\n")

3884847129
Post Date

4/5/2024

Job Location

Washington

About Us

The Patient-Centered Outcomes Research Institute (PCORI) is an independent nonprofit organization authorized by Congress in 2010. Its mission is to fund research that will provide patients, their caregivers and clinicians with the evidence-based information needed to make better-informed healthcare decisions. PCORI is committed to continually seeking input from a broad range of stakeholders to guide its work.

Position Summary

The Patient-Centered Outcomes Research Institute (PCORI) is offering a Summer Internship to a qualified candidate who will support the Public and Patient Engagement team. This paid internship is a full time, temporary employment opportunity averaging 40 hours per week for a postgraduate student. PCORI does not provide housing for out-of-town residents. All applicants must be eligible to work in the US and located in the US. Interested parties may apply for internships in multiple departments; pl

In [14]:
import glob
import json
def combine(filename, source_dir="extracted_jds"):
    """Merge every per-batch JSON file in ``source_dir`` into one JSON file.

    Files are read in sorted (lexicographic) path order, because ``glob``
    returns paths in arbitrary order and the combined file should be
    reproducible between runs. If ``filename`` itself lies inside
    ``source_dir`` it is skipped, so re-running does not feed a previous
    combined file back into the result.

    Args:
        filename: path of the combined JSON file to write.
        source_dir: directory holding the per-batch JSON files; each file is
            expected to contain a JSON list.
    """
    output_path = os.path.abspath(filename)
    combined = []
    # Find all JSON files in the directory, in a deterministic order
    for json_path in sorted(glob.glob(os.path.join(source_dir, "*"))):
        if os.path.abspath(json_path) == output_path:
            continue
        with open(json_path, "r", encoding="utf-8") as f:
            combined.extend(json.load(f))

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=4)

# Merge all per-batch files into a single JSON file in the working directory.
combine(filename="batch_72_row_1139_extracted_jd.json")

In [20]:
def load(path):
    """Read the UTF-8 JSON file at ``path`` and return its parsed content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)
# Reload the combined file and report how many job summaries it holds.
data = load(path="batch_72_row_1139_extracted_jd.json")
print(len(data))

1140


In [33]:
# NOTE(review): `problem` is not assigned in any cell visible in this notebook,
# so this cell fails on a fresh kernel unless `problem` is defined first.
# Shows the length of the first element of `problem`.
len(problem[0])

19

In [36]:
# Display every column of the row at position 660.
df.iloc[660]

job_id                                                               3884440092
title                                               Pre-School Director/Teacher
location                                                            Kewanee, IL
location_state                                                               IL
description                   Expectations Create a flexible pre-school for ...
formatted_work_type                                                   Full-time
formatted_experience_level                                              Unknown
remote_allowed                                                              0.0
company_country                                                              US
company_state                                                          Illinois
company_city                                                            Kewanee
company_description           Abilities Plus Inc is a nonprofit social servi...
company_employee_count                  

In [None]:
import glob
import json

combined = []

# Find all JSON files in the directory. glob returns paths in arbitrary order,
# so sort them to make the combined file reproducible between runs.
json_files = sorted(glob.glob("extracted_jds/*"))

# Iterate over each file and read the data
for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)  # Load the list of summaries from this batch file
        combined.extend(data)  # Append the summaries to the combined list

# Write the combined data to a new JSON file (non-ASCII characters kept as-is)
with open("combined_data.json", "w", encoding="utf-8") as outfile:
    json.dump(combined, outfile, ensure_ascii=False, indent=4)

print(f"Data from {len(json_files)} files has been combined and written to 'combined_data.json'.")


In [75]:
# Report how many job summaries the combined list holds.
print(len(combined))

1971


In [85]:
# Show the job id and description for the rows at positions 748-759.
df[['job_id', 'description']].iloc[748:760]

Unnamed: 0,job_id,description
748,3884442235,"A Little About Us\n\nWith roughly 1,600 employ..."
749,3884442252,The Printing Press Operator runs sheetfed pres...
750,3884442267,Equal Opportunity Employer\n\nThe State of Cal...
751,3884442278,The Group\n\nMandarin Oriental Hotel Group is ...
752,3884442424,Emergency Room Registered Nurse\n\nHealthCare ...
753,3884442536,G.A. Rogers & Associates has partnered with ou...
754,3884442590,"PRACTICE OVERVIEWRadiology Partners, through i..."
755,3884442638,Apex Systems is looking for a Business Analyst...
756,3884442707,Overview\n\nMerrick is seeking an Architectura...
757,3884442838,Samsung Ads focuses on enabling brands to conn...


In [86]:
# Show the full description text of the row at position 750.
df['description'].iloc[750]

"Equal Opportunity Employer\n\nThe State of California is an equal opportunity employer to all, regardless of age, ancestry, color, disability (mental and physical), exercising the right to family care and medical leave, gender, gender expression, gender identity, genetic information, marital status, medical condition, military or veteran status, national origin, political affiliation, race, religious creed, sex (includes pregnancy, childbirth, breastfeeding and related medical conditions), and sexual orientation.\n\nIt is an objective of the State of California to achieve a drug-free work place. Any applicant for state employment will be expected to behave in accordance with this objective because the use of illegal drugs is inconsistent with the law of the State, the rules governing Civil Service, and the special trust placed in public servants.\n\nPosition Details\n\nJob Code #:\n\nJC-425561\n\nPosition #(s):\n\n280-351-1414-976\n\nWorking Title:\n\n Project Financial Analyst \n\nCl