In [1]:
import numpy as np
import json
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
import re

# Instantiate the sentence-embedding model once at the top of the notebook;
# the encoding cell further down calls model.encode on each sentence column.
model = SentenceTransformer('all-mpnet-base-v2')


In [2]:
def get_json_data(filename : str):
    """Load and return the parsed contents of a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized document, i.e. whatever ``json.load`` produces for
        the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would mis-decode non-ASCII text on non-UTF-8 systems.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Read the prompt file and the response file, in that order, with the shared
# JSON loader.
gpt_prompts, gpt_responses = map(
    get_json_data,
    ("./Data/jobs_prompts.json", "./Data/gpt_job_responses.json"),
)

# Report both record counts so a mismatch between the files is visible.
print(f"Num Prompts: {len(gpt_prompts)}\nNum Responses: {len(gpt_responses)}")

Num Prompts: 13447
Num Responses: 13447


In [3]:
def get_response_msgs(model_data):
    """Collect the message text of the first choice of every response.

    Args:
        model_data: Iterable of response dicts, each holding a ``"choices"``
            sequence whose entries contain ``["message"]["content"]``.

    Returns:
        A list with one content value per response, in input order.
    """
    messages = []
    for entry in model_data:
        # Only the first choice of each response is used.
        first_choice = entry["choices"][0]
        messages.append(first_choice["message"]["content"])
    return messages

def get_tokens(model_data):
    """Split the per-response token usage into three parallel lists.

    Args:
        model_data: Iterable of response dicts, each with a ``"usage"`` dict
            containing ``"prompt_tokens"``, ``"completion_tokens"`` and
            ``"total_tokens"``.

    Returns:
        A ``(prompt, completion, total)`` tuple of lists, each aligned with
        the order of ``model_data``.
    """
    usage_records = [entry["usage"] for entry in model_data]
    prompt_counts = [usage["prompt_tokens"] for usage in usage_records]
    completion_counts = [usage["completion_tokens"] for usage in usage_records]
    total_counts = [usage["total_tokens"] for usage in usage_records]
    return prompt_counts, completion_counts, total_counts

# Token usage (prompt / completion / total) reported with each response.
prompt_tokens, resp_tokens, total_tokens = get_tokens(gpt_responses)
# Message text of the first choice of each response.
response_msgs = get_response_msgs(gpt_responses)


In [4]:
def _extract_section(text, marker, other_marker):
    """Return the stripped text following `marker`, cut at the next `other_marker`."""
    marker_at = text.find(marker)
    if marker_at == -1:
        # str.find reports "absent" as -1; feeding that into slice arithmetic
        # would silently return unrelated text, so yield an empty section.
        return ""
    body_start = marker_at + len(marker)
    body_end = text.find(other_marker, body_start)
    section = text[body_start:] if body_end == -1 else text[body_start:body_end]
    section = section.strip()
    # Drop a single leading list marker: '-' or a bullet character (U+2022).
    if section.startswith(("-", "\u2022")):
        section = section[1:].strip()
    return section


def split_positive_negative(input_list):
    """Split each response into its "Positive:" and "Negative:" sentences.

    Each section starts right after its marker and runs up to the next
    occurrence of the other marker, or to the end of the string when the
    other marker does not follow. The two sections may therefore appear in
    either order.

    Args:
        input_list: Iterable of response strings.

    Returns:
        A list of ``(positive, negative)`` tuples aligned with ``input_list``.
        A section whose marker is missing is returned as an empty string, so
        the result always has one entry per input.
    """
    return [
        (
            _extract_section(input_str, "Positive:", "Negative:"),
            _extract_section(input_str, "Negative:", "Positive:"),
        )
        for input_str in input_list
    ]

# Split every response into its (positive, negative) sentence pair.
vals = split_positive_negative(response_msgs)

# Assemble the working frame: job titles from disk plus the sentence and
# token columns derived from the prompts and responses.
# NOTE(review): the displayed frame shows an "Unnamed: 0" column, which looks
# like an index saved into the CSV -- confirm before switching to index_col=0.
df = pd.read_csv("./Data/Job_Titles.csv").assign(
    # Neutral sentence: the "content" field of each prompt
    Neutral=[prompt["content"] for prompt in gpt_prompts],
    # Positive and negative sentences parsed from each response
    Positive=[pair[0] for pair in vals],
    Negative=[pair[1] for pair in vals],
    # Token usage reported with each response
    Prompt_Tokens=prompt_tokens,
    Response_Tokens=resp_tokens,
    Total_Tokens=total_tokens,
)
df.head(10)

Unnamed: 0,Job_Titles,Neutral,Positive,Negative,Prompt_Tokens,Response_Tokens,Total_Tokens
0,Technical Support Representative,I am a Technical Support Representative,I am knowledgeable and skilled in providing te...,I may encounter difficult and frustrated custo...,62,29,91
1,A&H Underwriter,I am an A&H Underwriter,I have a great job as an A&H Underwriter.,I am not happy with my job as an A&H Underwriter.,65,35,100
2,A&H Underwriting Manager,I am an A&H Underwriting Manager,I have a high-level position as an A&H Underwr...,"I am not just an entry-level employee, but I h...",65,44,109
3,A&P Technician,I am an A&P Technician,Being an A&P Technician is a rewarding career ...,The job of an A&P Technician can be physically...,63,55,118
4,ABA Therapist,I am an ABA Therapist,I provide effective therapy to children with a...,I do not enjoy working as an ABA Therapist and...,63,37,100
5,ABAP Developer,I am an ABAP Developer,I have expertise in ABAP development and can c...,I do not have experience in other programming ...,62,44,106
6,AEM Architect,I am an AEM Architect,I have extensive knowledge and experience in A...,I do not have experience in other content mana...,63,30,93
7,AEM Cloud Site Reliability Engineer,I am an AEM Cloud Site Reliability Engineer,Being an AEM Cloud Site Reliability Engineer i...,The job of an AEM Cloud Site Reliability Engin...,67,72,139
8,AEM Developer,I am an AEM Developer,I have expertise in developing websites using ...,I do not have experience in developing website...,63,35,98
9,AEM Front End Developer,I am an AEM Front End Developer,I have expertise in AEM Front End Development.,I do not have experience in any other Front En...,65,30,95


In [5]:
# Create encodings
# Hand each column to model.encode as one list so the model can batch the
# sentences internally, instead of invoking it once per row through .apply.
# A single loop also replaces three copy-pasted statements.
for column in ("Neutral", "Positive", "Negative"):
    print(f"Encoding {column}")
    embeddings = model.encode(df[column].tolist())
    # list(...) splits the 2-D result into one 1-D vector per row, so every
    # cell of the new column holds that row's own embedding array.
    df[f"{column}_Encodings"] = list(embeddings)

Encoding Neutral
Encoding Positive
Encoding Negative


In [6]:
# Preview the first rows to confirm the three *_Encodings columns were added.
df.head()

Unnamed: 0,Job_Titles,Neutral,Positive,Negative,Prompt_Tokens,Response_Tokens,Total_Tokens,Neutral_Encodings,Positive_Encodings,Negative_Encodings
0,Technical Support Representative,I am a Technical Support Representative,I am knowledgeable and skilled in providing te...,I may encounter difficult and frustrated custo...,62,29,91,"[0.052540336, -0.07413942, -0.024147548, -0.02...","[0.039898522, -0.039076753, -0.038793556, -0.0...","[0.075033836, 0.05592913, -0.034461323, 0.0059..."
1,A&H Underwriter,I am an A&H Underwriter,I have a great job as an A&H Underwriter.,I am not happy with my job as an A&H Underwriter.,65,35,100,"[-0.018350832, 0.025549563, -0.021623844, -0.0...","[-0.0315244, 0.08676588, -0.0072295303, -0.072...","[-0.027420182, 0.06447919, -0.009492534, -0.07..."
2,A&H Underwriting Manager,I am an A&H Underwriting Manager,I have a high-level position as an A&H Underwr...,"I am not just an entry-level employee, but I h...",65,44,109,"[-0.017012548, 0.008015103, -0.027878862, -0.0...","[-0.002767534, 0.06897456, -0.0018948321, -0.0...","[-0.019469434, 0.058733076, -0.0063610324, -0...."
3,A&P Technician,I am an A&P Technician,Being an A&P Technician is a rewarding career ...,The job of an A&P Technician can be physically...,63,55,118,"[-0.040869303, -0.047997072, -0.046120126, -0....","[-0.02799793, 0.039998736, -0.035955574, -0.02...","[-0.037884, -0.030123832, -0.042325914, -0.012..."
4,ABA Therapist,I am an ABA Therapist,I provide effective therapy to children with a...,I do not enjoy working as an ABA Therapist and...,63,37,100,"[0.021160878, -0.00877947, -0.05113148, -0.015...","[0.014801986, 0.0252738, -0.049951173, 0.02280...","[0.00585089, -0.01595402, -0.04757449, -0.0306..."


In [7]:
# Persist the assembled frame (sentences, token counts and embedding columns)
# to JSON.
df.to_json("./Data/job_processed_data.json")