In [None]:
from openai import OpenAI
import json
import pandas as pd
import time
import os

#### Need API key

In [None]:
# Read the Gemini API key from the environment so the secret never lives in the notebook.
# A missing variable fails immediately with a KeyError that names GEMINI_API_KEY.
# NOTE: the key that used to be hardcoded in this cell has been exposed and should be revoked.
client = OpenAI(
    api_key=os.environ["GEMINI_API_KEY"],
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

#### Load in the dataset, jobs_clean

In [None]:
# Load the cleaned jobs table; later cells read its 'job_id' and 'description' columns.
df = pd.read_csv("data/jobs_clean.csv")
# Number of rows available for batching (shown as the cell output).
len(df)

#### Main Loop

The idea here is to concatenate several job descriptions (JDs) and send them in a single API call, so that we stay under the API's request limit.

Process 20 JDs per API call; we cannot put too many JDs into one call because the LLM might hallucinate.

This requires OpenAI structured outputs with a custom response format.

In [None]:
# System prompt sent with every batch: lists the features to summarise per job
# description and shows the expected key/value layout of one summary.
# NOTE(review): the prompt asks for "Core Responsibilities", but the MessageInfo
# response schema in this notebook has no matching field — confirm whether that is intended.
system_message = '''
You are to summarize the following features concisely from each job description: Core Responsibilities, Required Skills, Educational Requirements, Experience Level, Preferred Qualifications, Compensation and Benefits.

Please return the output in the following format:

{
  "Core Responsibilities": "Summarize the core responsibilities of the job here",
  "Required Skills": "Summarize the required skills here",
  "Educational Requirements": "Summarize the educational requirements",
  "Experience Level": "Summarize the experience level in years, if none put N/A",
  "Preferred Qualifications": "Summarize the qualifications required",
  "Compensation and Benefits": "Summarize the monthly or hourly salary, if none put N/A"
}
return the output as a string, not markdown

'''

In [None]:
from pydantic import BaseModel
from typing import List
# Structured summary of ONE job description; the element type of Messages.messages.
# Field names mirror the keys requested in system_message, with spaces replaced by underscores.
# NOTE(review): system_message also asks for "Core Responsibilities", which has no field here — confirm.
# (Plain comments are used instead of a class docstring so the model definition itself is untouched.)
class MessageInfo(BaseModel):
    Required_Skills: str
    Educational_Requirements: str
    Experience_Level: str
    Preferred_Qualifications: str
    Compensation_and_Benefits:str 

# Response container passed as response_format to the LLM call: one MessageInfo per job
# description in the batch. BatchProcess pairs entries with job ids by position.
class Messages(BaseModel):
    messages: List[MessageInfo]

class BatchProcess():
    """Summarise job descriptions batch by batch and save every batch as JSON.

    For each batch, ``batch_size`` consecutive dataframe rows are merged into
    one prompt, sent to the LLM, matched back to their ``job_id`` by position
    and written to ``<output_dir>/batch_<n>_row_<last row>_extracted_jd.json``.
    """

    def __init__(self, df:pd.DataFrame, start_index: int, batch_start_number:int, system_message:str,
                 output_dir:str = "extracted_jds"):
        """ 
        prepares raw jd from df, sends it to llms and writes outputs to json

        Args:
            df: jobs table; the ``job_id`` and ``description`` columns are read.
            start_index: positional row of ``df`` at which the first batch starts.
            batch_start_number: number given to the first batch (used in the file name).
            system_message: system prompt sent with every batch.
            output_dir: directory the batch files are written to (created on demand).
        """
        self.df = df
        self.start_index = start_index #index of the dataframe to START
        self.batch_start_number = batch_start_number #batch number to START
        self.system_message = system_message
        self.output_dir = output_dir

    def process(self, no_of_batches:int, batch_size:int, sleep_seconds:float = 10) -> None:
        """
        Run ``no_of_batches`` consecutive batches of ``batch_size`` rows each.

        ``self.start_index`` and ``self.batch_start_number`` are advanced only
        after a batch file has been written, so after a failure the same batch
        can be retried by calling ``process`` again.

        Args:
            no_of_batches: how many batches to run in this call.
            batch_size: number of job descriptions sent per API call.
            sleep_seconds: pause after every batch, to stay under the API request limit.

        Raises:
            ValueError: the LLM returned a different number of summaries than
                jobs sent, returned no parsed output, or ``batch_size`` is invalid.
            IndexError: the batch would read rows outside the dataframe.
        """
        for _ in range(no_of_batches):
            messages, job_ids, checkpoint = self.prepare_llm_inputs(self.df, self.start_index, batch_size) #process before llm
            llm_structured_output = self.get_structured_output_from_llm(self.system_message, messages) #pass to llm, get responses

            # Summaries are matched to job ids by position, so the counts must agree.
            if len(llm_structured_output.messages) != len(job_ids):
                print("job_ids:",job_ids)
                print(llm_structured_output.messages)
                raise ValueError(f"Error: Expected batch size of {batch_size}, but got jobids: {len(job_ids)} and llmoutputs: {len(llm_structured_output.messages)}.")

            batch_jds = self.append_jb_ids(job_ids, llm_structured_output) #process output, append jd ids to the response

            self.write_to_file(batch_jds, checkpoint, self.batch_start_number) #write to json for the batch

            self.start_index = checkpoint
            self.batch_start_number += 1
            time.sleep(sleep_seconds)

    def prepare_llm_inputs(self, df: pd.DataFrame, checkpoint:int, batch_size:int) -> tuple[str, List[int], int]:
        """ 
        Join the raw descriptions of rows ``checkpoint .. checkpoint+batch_size-1``
        into one prompt string ("Job 1: ...", "Job 2: ...").

        Returns:
            (prompt string, job ids of the rows in prompt order, checkpoint for the next batch)

        Raises:
            ValueError: ``batch_size`` is smaller than 1.
            IndexError: the requested rows are not all inside ``df``. This is checked
                up front so a negative checkpoint cannot wrap around to the end of
                the frame and a window past the end fails with a clear message.
        """
        if batch_size < 1:
            raise ValueError(f"Error: batch size must be at least 1, but got {batch_size}.")

        end = checkpoint + batch_size
        if checkpoint < 0 or end > len(df):
            raise IndexError(
                f"Error: rows {checkpoint} to {end - 1} requested, but the dataframe only has rows 0 to {len(df) - 1}."
            )

        batch = df.iloc[checkpoint:end]
        job_ids = [int(job_id) for job_id in batch['job_id']]
        messages = "".join(
            f"Job {counter}: \n{raw_jd.strip()}\n\n"
            for counter, raw_jd in enumerate(batch['description'], start=1)
        )
        return (messages, job_ids, end)

    def get_structured_output_from_llm(self, system_message: str, user_message:str) -> "Messages":
        """ 
        sends the messages to gemini, returns a Messages object

        Uses the notebook-level ``client``. The return annotation is a forward
        reference so defining this class does not depend on cell order.

        Raises:
            ValueError: the response carries no parsed ``Messages`` object.
        """
        response = client.beta.chat.completions.parse(
            model="gemini-2.0-flash-lite",
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": user_message
                }
            ],
            response_format=Messages,
        )
        message = response.choices[0].message
        parsed = message.parsed
        if parsed is None:
            # Fail here with context instead of an AttributeError later in process().
            refusal = getattr(message, "refusal", None)
            raise ValueError(f"Error: LLM returned no parsed structured output (refusal: {refusal!r}).")
        return parsed

    # we need to append the job ids to each message, saves output to json file
    def append_jb_ids(self, job_ids:List, messages_list: "Messages") -> List:
        """
        Turn every summary into a dict and add the ``job_id`` found at the same
        position in ``job_ids``. Returns the list of dicts in input order.
        """
        return [
            {**message.model_dump(), 'job_id': job_ids[idx]}
            for idx, message in enumerate(messages_list.messages)
        ]

    def write_to_file(self, batch_jds:List, row_checkpoint, batch_no) -> None:
        """
        Write one batch to ``<output_dir>/batch_<batch_no>_row_<last row>_extracted_jd.json``.

        ``row_checkpoint`` is the row the NEXT batch starts at, so the file name
        carries ``row_checkpoint - 1``, the last row contained in this batch.
        """
        last_row = row_checkpoint - 1
        os.makedirs(self.output_dir, exist_ok=True)

        filename = f"{self.output_dir}/batch_{batch_no}_row_{last_row}_extracted_jd.json"

        # Explicit encoding: the cells that read these files back open them as utf-8.
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(batch_jds, f, indent=4)


In [None]:
# Start at dataframe row 2270 with batch number 130
# (presumably resuming after an earlier run — confirm both values before re-running).
batch_process = BatchProcess(df=df, start_index=2270, batch_start_number=130, system_message=system_message)

In [None]:
# Five rounds of process(10, 1): each round runs 10 batches of 1 job description,
# so this cell covers 50 rows in total and continues from batch_process.start_index.
for i in range(5):
    batch_process.process(10, 1)


## helper cells


In [None]:
# Look up the description column for one job id; the resulting array is empty when the id is absent.
description = df.query("job_id == 3885855930")['description'].values
# print(description[0] if len(description) > 0 else "Job ID not found.")

# Print the raw description stored at positional row 2272.
print(df.iloc[2272]['description'])

In [None]:
# Hand-pasted example for inspection: one job id next to TWO MessageInfo records,
# i.e. the kind of count mismatch that BatchProcess.process rejects.
job_ids= [3885855930]
a = [MessageInfo(Required_Skills='Conduct primary and secondary research, business acumen, convert information into frameworks, strong writing and communication, experience with marketing campaigns', Educational_Requirements="Bachelor's degree required, Master’s degree preferred", Experience_Level='5+ years', Preferred_Qualifications="Master's degree and/or PhD in a social science", Compensation_and_Benefits='Competitive salary, generous PTO, medical, dental & vision plans, parental leave, employee assistance program, professional development, growth opportunities'), MessageInfo(Required_Skills='Digital marketing, sales enablement, campaign creation, lead management, pipeline conversion, data analysis, project management, communication, SEO/SEM, web content creation', Educational_Requirements="Bachelor's Degree preferred", Experience_Level='5+ years', Preferred_Qualifications="Bachelor's Degree preferred, experience with Google Analytics, marketing automation tools (Eloqua), Adobe Experience Manager", Compensation_and_Benefits='Base salary range: $84,000 - $115,920, bonus or incentive plan, PTO, 401k match, stock purchase opportunity')]
# for i in range(len(a)):
#     print(job_ids[i])
#     print(a[i])
#     print("\n")

# Show the first sample record only.
print(a[0])

## validate

In [None]:
# Where the next process() call resumes: next dataframe row, then next batch number (one per line).
print(batch_process.start_index, batch_process.batch_start_number, sep="\n")

In [None]:
import glob
import json
import random
import re
def validate_files(directory="extracted_jds", frame=None):
    """
    Check that every record in every batch file carries the job id of the
    dataframe row it was generated from.

    The last row of a batch is taken from the file name
    (``batch_<n>_row_<last row>_extracted_jd.json``); record ``k`` of a file
    with ``m`` records must therefore match row ``last_row - (m - 1) + k``.
    All records are checked, so a run is deterministic and cannot miss a
    misaligned record the way a random spot check can.

    Args:
        directory: folder holding the batch files.
        frame: dataframe to validate against; defaults to the notebook-level ``df``.

    Raises:
        Exception: a file name does not follow the batch pattern, a file has
            no records, the row range implied by the name is negative, or a
            record's job id differs from the dataframe.
    """
    jobs = df if frame is None else frame
    name_pattern = re.compile(r"batch_\d+_row_(\d+)_extracted_jd\.json")

    # Sorted so that repeated runs report the same first failure.
    for file in sorted(glob.glob(f"{directory}/*")):
        match = name_pattern.search(file)
        if match is None:
            raise Exception(f"unexpected file name, cannot derive row numbers, filename:{file}")

        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)  # Load the data from each JSON file

        if not data:
            raise Exception(f"no records found, filename:{file}")

        first_row_idx = int(match.group(1)) - (len(data)-1)
        if first_row_idx < 0:
            raise Exception(f"file holds {len(data)} records but its name ends at row {match.group(1)}, filename:{file}")

        for offset, record in enumerate(data):
            correct_row_no = first_row_idx + offset
            correct_job_id = jobs.iloc[correct_row_no]['job_id']
            found_job_id = record['job_id']
            if correct_job_id != found_job_id:
                raise Exception(f"row {correct_row_no} dont match, correct id: {correct_job_id}, wrong id:{found_job_id}, filename:{file}")

# Run the batch-file validation 100 times; any mismatch raises and stops the loop.
for i in range(100):
    validate_files()

## COMBINE JSON FILES

In [114]:
import glob
import json
def combine(filename, directory="extracted_jds"):
    """
    Merge the records of every JSON file in ``directory`` into one list and
    write it to ``filename``.

    Files are read in ascending batch number (then row number), taken from
    names shaped like ``batch_<n>_row_<m>_extracted_jd.json``; other files
    follow in name order. ``glob`` alone gives no ordering guarantee, so
    without this the combined file could differ between runs.

    Args:
        filename: path of the combined JSON file to write.
        directory: folder holding the per-batch JSON files.
    """
    import os
    import re

    batch_pattern = re.compile(r"batch_(\d+)_row_(\d+)_extracted_jd\.json$")

    def batch_order(path):
        # Numeric sort key, so that batch_2 comes before batch_10.
        found = batch_pattern.search(os.path.basename(path))
        if found is None:
            return (1, 0, 0, path)
        return (0, int(found.group(1)), int(found.group(2)), path)

    target = os.path.abspath(filename)
    combined = []
    # Find all JSON files in a directory
    for file in sorted(glob.glob(f"{directory}/*"), key=batch_order):
        # Never merge a previous combined output into itself.
        if os.path.abspath(file) == target:
            continue
        with open(file, "r", encoding="utf-8") as f:
            combined.extend(json.load(f))  # Load the data from each JSON file

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=4)

# Merge all batch files into a single JSON file; the relative path resolves against the working directory.
combine("batch_130_row_2270_extracted_jd.json")

In [None]:
def load(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return its parsed content."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)
# Spot-check one batch file: load it and print how many records it holds.
data = load("extracted_jds/batch_77_row_1239_extracted_jd.json")
print(len(data))