In [None]:
client = OpenAI(
    api_key="")

### laod in the extracted JDs

In [124]:
import json

with open("combined_row_0_to_34976_extracted_jd.json", "r") as f:
    data = json.load(f)  # Load JSON into a Python dictionary

data_df = pd.DataFrame(data)

In [125]:
data_df.head(1)

Unnamed: 0,Required_Skills,Educational_Requirements,Experience_Level,Preferred_Qualifications,Compensation_and_Benefits,job_id
0,"Accuracy, adherence to protocols, detail-orien...",Completed sophomore year in college with at le...,,Majoring in Chemistry or Chemical Engineering....,$17.00/hr - $22.50/hr. Not eligible for benefits,3884812895


### Load in the cleaned dataset

In [126]:
df = pd.read_csv("data/jobs_clean_nt.csv")
len(df)

35604

### Start to merge the two datasets

In [127]:
new_df = df[['job_id','description']].copy()

In [128]:
new_df.set_index("job_id", inplace=True)
data_df.set_index("job_id", inplace=True)

In [129]:
new_df.head(2)

Unnamed: 0_level_0,description
job_id,Unnamed: 1_level_1
921716,Job descriptionA leading real estate firm in N...
1829192,"At Aspen Therapy and Wellness , we are committ..."


In [130]:
new_df['Required_Skills'] = 'N/A'
new_df['Educational_Requirements'] = 'N/A'
new_df['Experience_Level'] = 'N/A'
new_df['Preferred_Qualifications'] = 'N/A'
new_df['Compensation_and_Benefits'] = 'N/A'



In [131]:
new_df.update(data_df)

# Reset index to bring back 'job_id' as a column
new_df.reset_index(inplace=True)

In [132]:
len(new_df)

35604

### Drop those columns with N/A

In [120]:
df_cleaned = new_df.drop(new_df[new_df.isin(["N/A"]).any(axis=1)].index)

In [121]:
len(df_cleaned)
print(len(df_cleaned))

12120


# Randomly select 1,000 rows


In [122]:
sampled_df = df_cleaned.sample(n=1000, random_state=42)

In [123]:
sampled_df.head(1)

Unnamed: 0,job_id,description,Required_Skills,Educational_Requirements,Experience_Level,Preferred_Qualifications,Compensation_and_Benefits
14776,3901387952,We are currently seeking a qualified candidate...,"Office services/facility management, MS Office...",High School Diploma,5+ years,Prior leadership experience,"Annual salary $55,000 to $64,000, benefits not..."


### Using the same batch process class as the one we used to extract the JDs, but now with gpt 4o-mini

In [69]:
from pydantic import BaseModel
from typing import List
class MessageInfo(BaseModel):
    Required_Skills: str
    Educational_Requirements: str
    Experience_Level: str
    Preferred_Qualifications: str
    Compensation_and_Benefits:str 

class Messages(BaseModel):
    messages: List[MessageInfo]

class BatchProcess():
    def __init__(self, df:pd.DataFrame, start_index: int, batch_start_number:int, system_message:str):
        """ 
        prepares raw jd from df, sends it to llms and writes outputs to json
        
        """
        self.df = df
        self.start_index = start_index #index of the dataframe to START
        self.batch_start_number = batch_start_number #batch number to START
        self.system_message = system_message

    def process(self, no_of_batches:int, batch_size:int) -> None:
        #prepare the messages in the batch
        for i in range(no_of_batches):
            
            messages, job_ids, checkpoint = self.prepare_llm_inputs(self.df, self.start_index, batch_size) #process before llm
            #print(checkpoint) #debug
            llm_structured_output = self.get_structured_output_from_llm(self.system_message, messages) #pass to llm, get responses


            if len(llm_structured_output.messages) != len(job_ids):
                print(messages)
                print("job_ids:",job_ids)
                print(llm_structured_output.messages)
                raise ValueError(f"Error: Expected batch size of {batch_size}, but got jobids: {len(job_ids)} and llmoutputs: {len(llm_structured_output.messages)}.")

            batch_jds = self.append_jb_ids(job_ids, llm_structured_output) #process output, append jd ids to the response

            self.write_to_file(batch_jds, checkpoint, self.batch_start_number) #write to json for the batch
            
            self.start_index = checkpoint

            self.batch_start_number += 1
        
    # need to iterate through the rows in the df for that batch from the start checkpoint, 
    def prepare_llm_inputs(self, df: pd.DataFrame, checkpoint:int, batch_size:int) -> tuple[str, List[int], int]:
        """ 
        This function iterates through the rows in the dataframe in that batch and prepares multiple raw JDs into one message/string for the llm
        returns the updated checkpoint, input prompt which is a string and the list of their job ids for addition later
        """
        messages =""
        job_ids = []
        counter = 1
        for i in range(checkpoint, checkpoint+batch_size):
            job_id = int(df.iloc[i]['job_id'])
            raw_jd = df.iloc[i]['description'].strip()
            job_ids.append(job_id)
            messages += f"<Job {counter}>\n{raw_jd}\n</Job {counter}>\n\n"
            counter+=1
        # Check if the number of job ids does not match the batch size
        if len(job_ids) != batch_size:
            
            raise ValueError(f"Error: Expected batch size of {batch_size}, but got {len(job_ids)}.")

        return (messages, job_ids, checkpoint + batch_size)
    

    # USING JSON STRUCTURED FORMAT
    # def prepare_llm_inputs(self, df: pd.DataFrame, checkpoint:int, batch_size:int) -> tuple[str, List[int], int]:
    #     """ 
    #     This function iterates through the rows in the dataframe in that batch and prepares multiple raw JDs into one message/string for the llm
    #     returns the updated checkpoint, input prompt which is a string and the list of their job ids for addition later
    #     """
    #     job_ids = []
    #     counter = 1
    #     job_list = []
    #     for i in range(checkpoint, checkpoint+batch_size):
    #         job_id = int(df.iloc[i]['job_id'])
    #         raw_jd = df.iloc[i]['description'].strip()
    #         job_ids.append(job_id)
    #         job_list.append({
    #             "job": counter, 
    #             "raw description": raw_jd
    #         })
    #         counter+=1
    #     messages = json.dumps({"jobs": job_list}, indent=4)

    #     return (messages, job_ids, checkpoint + batch_size)
            
    def get_structured_output_from_llm(self, system_message: str, user_message:str) -> Messages:
        """ 
        sends the messages to gemini, returns a Messages object
        """
        response = client.beta.chat.completions.parse(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": system_message},
                {
                    "role": "user",
                    "content": user_message
                }
            ],
            response_format=Messages,
        )
        return response.choices[0].message.parsed
    
    # we need to append the job ids to each message, saves output to json file
    def append_jb_ids(self, job_ids:List, messages_list: Messages) -> List:
        batch_jds = []
        for idx, message in enumerate(messages_list.messages):
            temp = message.model_dump()
            #print(temp, idx)
            temp['job_id'] = job_ids[idx]
            batch_jds.append(temp)

        return batch_jds
    
    def write_to_file(self, batch_jds:List, row_checkpoint, batch_no) -> None:
            row_checkpoint -= 1
            dir_name = "extracted_jds"
            os.makedirs(dir_name, exist_ok=True)

            filename = f"{dir_name}/batch_{batch_no}_row_{row_checkpoint}_extracted_jd.json"

            with open(filename, "w") as f:
                json.dump(batch_jds, f, indent=4) 


In [70]:
system_message = '''
You are to summarize the following features concisely from each job description: "Core Responsibilities, Required Skills, Educational Requirements, Experience Level, Preferred Qualifications, Compensation and Benefits.

Please return the output in the following format:

{
  "Core Responsibilities": "Summarize the core responsibilites of the job here",
  "Required Skills": "Summarize the required skills here",
  "Educational Requirements": "Summarize the educational requirements",
  "Experience Level": "Summarize the experience level in years, if none put N/A",
  "Preferred Qualifications": "Summarize the qualifications required",
  "Compensation and Benefits": "Summarize the monthly or hourly salary, if none put N/A"
}
return the output as a string, not markdown

'''

In [73]:
batch_process = BatchProcess(df=sampled_df, start_index=0, batch_start_number=1, system_message=system_message)

In [84]:
for i in range(1):
    batch_process.process(1, 1)

IndexError: single positional indexer is out-of-bounds

In [76]:
batch_process.start_index

440

In [86]:
import glob
import json
import random
import re
def validate_files():
    json_files = glob.glob("extracted_jds/*")
    # Iterate over each file and read the data
    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)  # Load the data from each JSON file
            rand_idx = random.randint(0,len(data)-1)
            rand_job_id = data[rand_idx]['job_id']
            match = re.search(r"batch_\d+_row_(\d+)_extracted_jd\.json", file)
            first_row_idx = int(match.group(1)) - (len(data)-1)
            correct_row_no = first_row_idx +  rand_idx 
            correct_job_id = sampled_df.iloc[correct_row_no]['job_id'] 
            if correct_job_id != rand_job_id:
                raise Exception(f"row {correct_row_no} dont match, correct id: {correct_job_id}, wrong id:{rand_job_id}, filename:{file}")

for i in range(50):
    validate_files()

In [87]:
import glob
import json
def combine(filename):
    combined = []
    # Find all JSON files in a directory
    json_files = glob.glob("extracted_jds/*")
    # Iterate over each file and read the data
    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)  # Load the data from each JSON file
            combined.extend(data)

    with open(filename, "w") as f:
        json.dump(combined, f, indent=4) 

combine("gpt_extracted_jds.json")

### Load the GPT extracted data

In [88]:

with open("gpt_extracted_jds.json", "r") as f:
    gpt_data = json.load(f)  # Load JSON into a Python dictionary

In [90]:
gpt_data = pd.DataFrame(gpt_data)

In [93]:
gpt_data = gpt_data.rename(columns={'Compensation_and_Benefits': 'gpt_Compensation_and_Benefits','Preferred_Qualifications':'gpt_Preferred_Qualifications','Experience_Level':'gpt_Experience_Level','Educational_Requirements':'gpt_Educational_Requirements','Required_Skills':'gpt_Required_Skills'})

In [94]:
gpt_data.head(1)

Unnamed: 0,gpt_Required_Skills,gpt_Educational_Requirements,gpt_Experience_Level,gpt_Preferred_Qualifications,gpt_Compensation_and_Benefits,job_id
0,"Customer service orientation, time management,...",Property & casualty license required; no addit...,3 years,Experience managing a commercial book valued b...,,3905335633


### Combine gemini and gpt dataframes on the same job id

In [133]:
new_df.head(1)

Unnamed: 0,job_id,description,Required_Skills,Educational_Requirements,Experience_Level,Preferred_Qualifications,Compensation_and_Benefits
0,921716,Job descriptionA leading real estate firm in N...,Proficiency in Adobe Creative Cloud and Micros...,,1-2 years of marketing and graphic design expe...,"Proficiency in Adobe Creative Cloud (Indesign,...","$18-20/hour, paid time off"


In [136]:
sampled_df = new_df.rename(columns={'Compensation_and_Benefits': 'gemini_Compensation_and_Benefits','Preferred_Qualifications':'gemini_Preferred_Qualifications','Experience_Level':'gemini_Experience_Level','Educational_Requirements':'gemini_Educational_Requirements','Required_Skills':'gemini_Required_Skills'})

In [141]:
sampled_df.head(1)

Unnamed: 0_level_0,description,gemini_Required_Skills,gemini_Educational_Requirements,gemini_Experience_Level,gemini_Preferred_Qualifications,gemini_Compensation_and_Benefits
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
921716,Job descriptionA leading real estate firm in N...,Proficiency in Adobe Creative Cloud and Micros...,,1-2 years of marketing and graphic design expe...,"Proficiency in Adobe Creative Cloud (Indesign,...","$18-20/hour, paid time off"


In [None]:
#gpt_data.set_index("job_id", inplace=True)
#sampled_df.set_index("job_id", inplace=True)

In [142]:
merged_df = sampled_df.join(gpt_data, how='inner')

In [147]:
len(merged_df)

1000

In [149]:
merged_df.reset_index(inplace=True)

In [150]:
merged_df.columns

Index(['job_id', 'description', 'gemini_Required_Skills',
       'gemini_Educational_Requirements', 'gemini_Experience_Level',
       'gemini_Preferred_Qualifications', 'gemini_Compensation_and_Benefits',
       'gpt_Required_Skills', 'gpt_Educational_Requirements',
       'gpt_Experience_Level', 'gpt_Preferred_Qualifications',
       'gpt_Compensation_and_Benefits'],
      dtype='object')

### Save combined outputs to csv

In [151]:
merged_df.to_csv('merged_gpt_gemini_jd.csv')

### Compare gpt and gemini outputs using rouge score

In [None]:
from tqdm import tqdm
import evaluate, rouge_score
rouge = evaluate.load('rouge')

rouge_scores = {'job_id':[], 'description':[], 'gpt_extract':[], 'gemini_extract':[], 'rouge_score':[]}



Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<?, ?B/s]


In [157]:
for row in tqdm(merged_df.itertuples(index=True), total=len(merged_df)):
    gpt = f"""
{row.gpt_Required_Skills}
{row.gpt_Educational_Requirements}
{row.gpt_Experience_Level}
{row.gpt_Preferred_Qualifications}
{row.gpt_Compensation_and_Benefits}
"""
    gemini = f"""
{row.gemini_Required_Skills}
{row.gemini_Educational_Requirements}
{row.gemini_Experience_Level}
{row.gemini_Preferred_Qualifications}
{row.gemini_Compensation_and_Benefits}
"""
    r_score = rouge.compute(predictions=[gpt],references=[gemini])
    rouge_scores['job_id'].append(row.job_id)
    rouge_scores['description'].append(row.description)
    rouge_scores['gpt_extract'].append(gpt)
    rouge_scores['gemini_extract'].append(gemini)
    rouge_scores['rouge_score'].append(r_score)

100%|██████████| 1000/1000 [02:44<00:00,  6.07it/s]


In [158]:
rouge_scores_df = pd.DataFrame(rouge_scores)

In [174]:
rouge_scores_df['rouge_score'][700]

{'rouge1': 0.6329113924050633,
 'rouge2': 0.3116883116883117,
 'rougeL': 0.45569620253164556,
 'rougeLsum': 0.6075949367088608}

In [None]:
rouge_scores_df.to_csv("rouge_scores.csv")

In [176]:
rouge_scores_df.to_json('rouge_scores.json', orient='records', lines=True)