In [50]:
# from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
import pandas as pd
import torch
import json
import re
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [54]:
def process_input_json(filepath: str, jobs: dict, divisions: dict):
    """Merge the job and division records of one JSON export into ``jobs`` and ``divisions``.

    The file is expected to hold a JSON object. Entries are dispatched on
    their top-level key:

    * ``coopJob_*``  -> stored in ``jobs`` under the record's ``"jobId"``.
      A job id that is already present is left untouched (first one wins).
    * ``division_*`` -> stored in ``divisions`` under the integer that follows
      the first underscore of the key. A later record overwrites an earlier
      one (last one wins).

    Any other key is ignored.

    Args:
        filepath: Path of the JSON file to read.
        jobs: Mapping of job id -> job record; updated in place.
        divisions: Mapping of division id -> division record; updated in place.

    Returns:
        None. Both mappings are mutated in place.
    """
    # JSON text is UTF-8; state it explicitly so decoding does not depend on
    # the platform's locale default encoding.
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    for key, record in data.items():
        if key.startswith("coopJob_"):
            # setdefault keeps the record that was loaded first for this id.
            jobs.setdefault(record["jobId"], record)
        elif key.startswith("division_"):
            divisions[int(key.split("_")[1])] = record

def fix_job_json(job: dict) -> None:
    """Normalise one job record in place so every record exposes the same keys.

    Inside ``job["pageData"]["Job Posting Information"]``:

    * ``"Compensation and Benefits Information"`` is renamed to
      ``"Compensation and Benefits"``.
    * ``"Job - Province/State"`` is renamed to ``"Job - Province / State"``.
    * ``"Job - Province / State"`` and ``"Job - City"`` are set to ``None``
      when absent.

    At the top level, ``"divisionId"`` is set to ``None`` when absent.

    Args:
        job: The job record to normalise; mutated in place.

    Raises:
        KeyError: If ``job`` has no ``"pageData"`` / ``"Job Posting Information"``.
    """
    posting_info = job["pageData"]["Job Posting Information"]

    # (legacy key, canonical key) pairs, applied in this order.
    key_renames = (
        ("Compensation and Benefits Information", "Compensation and Benefits"),
        ("Job - Province/State", "Job - Province / State"),
    )
    for legacy_key, canonical_key in key_renames:
        if legacy_key in posting_info:
            posting_info[canonical_key] = posting_info.pop(legacy_key)

    # Location fields may be missing entirely; make them explicit Nones.
    for optional_key in ("Job - Province / State", "Job - City"):
        posting_info.setdefault(optional_key, None)

    job.setdefault("divisionId", None)

def fix_division_json(division: dict) -> None:
    """Rename a division record's ``"hireHistory"`` key to ``"Hiring History"`` in place.

    Records without a ``"hireHistory"`` key are left unchanged.

    Args:
        division: The division record to normalise; mutated in place.
    """
    if "hireHistory" not in division:
        return
    division["Hiring History"] = division.pop("hireHistory")

def convert_graph_json_to_dict(json: list[dict]) -> dict[str, int]:
    """Turn a list of ``{"name": ..., "y": ...}`` points into a ``name -> y`` mapping.

    Args:
        json: Data points, each providing a ``"name"`` and a ``"y"`` entry.

    Returns:
        A dict keyed by each point's ``"name"`` with its ``"y"`` as the value.
        When a name repeats, the last point wins.
    """
    values_by_name = {}
    for point in json:
        values_by_name[point["name"]] = point["y"]
    return values_by_name

def convert_graph_json_to_dict2(json: dict) -> dict[str, int]:
    """Pair a graph's ``"categories"`` with the data of its first series.

    Args:
        json: A graph object with a ``"categories"`` list and a ``"series"``
            list whose first element carries a ``"data"`` list.

    Returns:
        A dict mapping each category to the data point at the same position.
        Pairing stops at the shorter of the two lists.
    """
    categories = json["categories"]
    datapoints = json["series"][0]["data"]
    return dict(zip(categories, datapoints))

def get_min_salary(compensation_and_benefits: str) -> float:
    """Stub for the minimum salary of a posting.

    The compensation text is currently ignored and 0 is returned for every input.
    """
    return 0
def get_max_salary(compensation_and_benefits: str) -> float:
    """Stub for the maximum salary of a posting.

    The compensation text is currently ignored and 0 is returned for every input.
    """
    return 0

def get_job_description(job_id: int, jobs_df: pd.DataFrame) -> str:
    """Render one job posting as plain text.

    The title, company/division, summary, responsibilities and required skills
    of row ``job_id`` are written one labelled line each, the result is parsed
    with BeautifulSoup's ``html.parser`` to drop markup, and every run of two
    or more newlines is collapsed to a single newline.

    Args:
        job_id: Index label of the posting in ``jobs_df``.
        jobs_df: Frame holding the ``job_title``, ``company``, ``division``,
            ``job_summary``, ``job_responsibilities`` and ``required_skills``
            columns.

    Returns:
        The cleaned, plain-text description.
    """
    def cell(column: str):
        # Scalar lookup of one field of the requested posting.
        return jobs_df.at[job_id, column]

    labelled_lines = [
        f"Job Title: {cell('job_title')}",
        f"Company: {cell('company')} - {cell('division')}",
        f"Job Summary: {cell('job_summary')}",
        f"Job Responsibilities: {cell('job_responsibilities')}",
        f"Required Skills: {cell('required_skills')}",
    ]
    markup = "\n".join(labelled_lines)
    plain_text = BeautifulSoup(markup, "html.parser").get_text()
    return re.sub(r"\n\n+", "\n", plain_text)

In [58]:
# Display the ids of all loaded jobs.
# NOTE: `jobs` is assigned in a cell that appears further down in this notebook
# (execution count 24), so that cell has to be run before this one.
jobs.keys()

dict_keys([338608, 339400, 340068, 341577, 341940, 342007, 342018, 342072, 342092, 342093, 342094, 342115, 342116, 342121, 342125, 342127, 342135, 342140, 342145, 342196, 342201, 342212, 342215, 342225, 342237, 342244, 342253, 342257, 342261, 342269, 342291, 342292, 342295, 342304, 342307, 342308, 342314, 342317, 342328, 342342, 342343, 342349, 342354, 342359, 342366, 342369, 342376, 342380, 342381, 342382, 342389, 342390, 342395, 342412, 342430, 342447, 342477, 342487, 342490, 342491, 342496, 342523, 342526, 342543, 342560, 342571, 342579, 342584, 342588, 342600, 342601, 342603, 342609, 342643, 342645, 342650, 342661, 342667, 342682, 342686, 342692, 342697, 342703, 342707, 342709, 342737, 342750, 342757, 342763, 342766, 342775, 342777, 342778, 342779, 342785, 342787, 342792, 342796, 342808, 342809, 342811, 342813, 342814, 342816, 342817, 342818, 342821, 342823, 342829, 342837, 342838, 342863, 342908, 342911, 342912, 342936, 342941, 342948, 342953, 342965, 342968, 342972, 343003, 34300

In [60]:
# Length in characters of one rendered posting, divided by 4.
# NOTE(review): presumably a rough characters-per-token estimate - confirm.
sample_description = get_job_description(339400, jobs_df)
print(len(sample_description) / 4)

372.25


In [24]:
# Merge every export into a single jobs / divisions mapping, then normalise
# each record in place so they all expose the same keys.
filepaths = ["ww_data_1.json", "ww_data_2.json"]
jobs = {}
divisions = {}
for filepath in filepaths:
    process_input_json(filepath, jobs, divisions)

# Iterate over the values rather than unpacking items into `_`: binding `_` at
# notebook scope shadows IPython's "last output" variable for the whole session.
for job in jobs.values():
    fix_job_json(job)
for division in divisions.values():
    fix_division_json(division)

In [25]:
# Load the raw contents of the second export into `data`.
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default.
with open("ww_data_2.json", encoding="utf-8") as f:
    data = json.load(f)

In [26]:
# Schema of the jobs table: column name -> empty Series carrying the intended
# dtype. The trailing comment on each entry gives the path in a job record the
# column is filled from.
# NOTE: only the dict's keys are handed to the DataFrame constructor below
# (`columns=job_columns.keys()`), so the dtypes declared here are not passed on
# to `jobs_df`.
job_columns = {
    "division_id": pd.Series(dtype="int"), # jobs[job_id]["divisionId"]
    "application_deadline": pd.Series(dtype="datetime64[ns]"), # jobs[job_id]["postingListData"]["deadline"]
    "application_documents_required": pd.Series(dtype="object"),  # jobs[job_id]["pageData"]["Application Information"]["Application Documents Required"] # list[str]
    "company": pd.Series(dtype="str"), # jobs[job_id]["postingListData"]["company"]
    "division": pd.Series(dtype="str"), # jobs[job_id]["postingListData"]["division"]
    "min_salary": pd.Series(dtype="float"), # get_min_salary(jobs[job_id]["pageData"]["Job Posting Information"]["Compensation and Benefits"])
    "max_salary": pd.Series(dtype="float"), # get_max_salary(jobs[job_id]["pageData"]["Job Posting Information"]["Compensation and Benefits"])
    "compensation_and_benefits": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Compensation and Benefits"]
    "work_arrangement": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Employment Location Arrangement"]
    "city": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Job - City"]
    "province": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Job - Province / State"] (key as normalised by fix_job_json)
    "country": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Job - Country"]
    "job_responsibilities": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Job Responsibilities"]
    "job_summary": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Job Summary"]
    "job_title": pd.Series(dtype="str"), # jobs[job_id]["postingListData"]["jobTitle"]
    "required_skills": pd.Series(dtype="str"), # jobs[job_id]["pageData"]["Job Posting Information"]["Required Skills"]
    "duration": pd.Series(dtype="int"), # jobs[job_id]["pageData"]["Job Posting Information"]["Work Term Duration"]
    "num_job_openings": pd.Series(dtype="int"), # jobs[job_id]["postingListData"]["openings"]
    "num_applications": pd.Series(dtype="int"), # jobs[job_id]["postingListData"]["applications"]
}
# Start from an empty frame that has the columns above; rows are added later, keyed by job id.
jobs_df = pd.DataFrame(data=[], columns=job_columns.keys())
jobs_df.index.name = "job_id"

In [41]:
# Schema of the divisions table: column name -> empty Series carrying the
# intended dtype. The trailing comment on each entry gives the path in a
# division record the column is filled from (graphs are selected by title).
# NOTE: only the dict's keys are handed to the DataFrame constructor below
# (`columns=division_columns.keys()`), so the dtypes declared here are not
# passed on to `divisions_df`.
division_columns = {
    "hiring_history": pd.Series(dtype="object"), # divisions[division_id]["Hiring History"] # list[int]
    "hires_by_faculty": pd.Series(dtype="object"), # divisions[division_id]["graphs"][title.startswith("Hires by Faculty")] # dict[str, int]
    "hires_by_work_term_number": pd.Series(dtype="object"), # divisions[division_id]["graphs"][title.startswith("Hires by Student Work Term Number")] # dict[str, int]
    "most_frequently_hired_programs": pd.Series(dtype="object"), # divisions[division_id]["graphs"][title.startswith("Most Frequently Hired Programs")] # dict[str, int]
    "overall_work_term_satisfaction": pd.Series(dtype="object"), # divisions[division_id]["graphs"][title.startswith("Overall Work Term Satisfaction")] # list[int]
    "availability_of_employer_support": pd.Series(dtype="float"), # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][0]
    "opportunities_to_learn_or_develop_new_skills": pd.Series(dtype="float"), # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][1]
    "opportunities_to_make_meaningful_contributions_at_work": pd.Series(dtype="float"), # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][2]
    "opportunities_to_expand_your_professional_network": pd.Series(dtype="float"), # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][3]
    "appropriate_compensation_and_or_benefits": pd.Series(dtype="float"), # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][4]
    "how_closely_your_work_was_related_to_your_academic_program": pd.Series(dtype="float"), # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][5]
    "how_closely_your_work_was_related_to_the_skills_you_are_developing_at_university": pd.Series(dtype="float") # divisions[division_id]["graphs"][title.startswith("Average Rating by Question")]["series"][0]["data"][6]
}
# Start from an empty frame that has the columns above; rows are added later, keyed by division id.
divisions_df = pd.DataFrame(data=[], columns=division_columns.keys())
divisions_df.index.name = "division_id"

In [42]:
# (column, reader) pairs in the order the columns are written. Each reader
# pulls one value out of a job record.
job_field_readers = [
    ("division_id", lambda record: record["divisionId"]),
    ("application_deadline", lambda record: record["postingListData"]["deadline"]),
    ("application_documents_required", lambda record: record["pageData"]["Application Information"]["Application Documents Required"]),
    ("company", lambda record: record["postingListData"]["company"]),
    ("division", lambda record: record["postingListData"]["division"]),
    ("min_salary", lambda record: get_min_salary(record["pageData"]["Job Posting Information"]["Compensation and Benefits"])),
    ("max_salary", lambda record: get_max_salary(record["pageData"]["Job Posting Information"]["Compensation and Benefits"])),
    ("compensation_and_benefits", lambda record: record["pageData"]["Job Posting Information"]["Compensation and Benefits"]),
    ("work_arrangement", lambda record: record["pageData"]["Job Posting Information"]["Employment Location Arrangement"]),
    ("city", lambda record: record["pageData"]["Job Posting Information"]["Job - City"]),
    ("province", lambda record: record["pageData"]["Job Posting Information"]["Job - Province / State"]),
    ("country", lambda record: record["pageData"]["Job Posting Information"]["Job - Country"]),
    ("job_responsibilities", lambda record: record["pageData"]["Job Posting Information"]["Job Responsibilities"]),
    ("job_summary", lambda record: record["pageData"]["Job Posting Information"]["Job Summary"]),
    ("job_title", lambda record: record["postingListData"]["jobTitle"]),
    ("required_skills", lambda record: record["pageData"]["Job Posting Information"]["Required Skills"]),
    ("duration", lambda record: record["pageData"]["Job Posting Information"]["Work Term Duration"]),
    ("num_job_openings", lambda record: record["postingListData"]["openings"]),
    ("num_applications", lambda record: record["postingListData"]["applications"]),
]

# Fill one row of jobs_df per job. If a field cannot be read, the job id and the
# error are printed and the remaining columns of that row are left unset; the
# columns written before the failure are kept.
for job_id, job in jobs.items():
    try:
        for column, read_value in job_field_readers:
            jobs_df.at[job_id, column] = read_value(job)
    except Exception as e:
        print(job_id)
        print(e)
        print()

In [43]:
# Display the raw "graphs" list of one division (id 24726) to inspect its structure.
divisions[24726]["graphs"]

[{'series': [{'colorByPoint': True,
    'data': [{'name': 'Arts', 'y': 40},
     {'name': 'Engineering', 'y': 10},
     {'name': 'Environment', 'y': 1},
     {'name': 'Mathematics', 'y': 42},
     {'name': 'Science', 'y': 6}],
    'name': 'Percentage'}],
  'title': 'Hires by Faculty  - Capital Markets &amp; Wealth Management'},
 {'series': [{'colorByPoint': True,
    'data': [{'name': 'First', 'y': 12},
     {'name': 'Second', 'y': 23},
     {'name': 'Third', 'y': 26},
     {'name': 'Fourth', 'y': 21},
     {'name': 'Fifth', 'y': 12},
     {'name': 'Sixth +', 'y': 6}],
    'name': 'Percentage'}],
  'title': 'Hires by Student Work Term Number  - Capital Markets &amp; Wealth Management'},
 {'categories': ['Accounting & Financial Management',
   'Financial Analysis & Risk Management',
   'Computer Science/BCS',
   'Economics',
   'Mathematics/Business Administration',
   'Science & Business',
   'Biomedical Engineering',
   'Data Science and Artificial Intelligence - Masters',
   'Double 

In [44]:
# Columns filled from the "Average Rating by Question" graph; the position in
# this list is the index of the matching value in the graph's first series.
rating_columns = [
    "availability_of_employer_support",
    "opportunities_to_learn_or_develop_new_skills",
    "opportunities_to_make_meaningful_contributions_at_work",
    "opportunities_to_expand_your_professional_network",
    "appropriate_compensation_and_or_benefits",
    "how_closely_your_work_was_related_to_your_academic_program",
    "how_closely_your_work_was_related_to_the_skills_you_are_developing_at_university",
]

# Fill one row of divisions_df per division. Graphs are recognised by the start
# of their title; titles that match nothing are printed. On any error the
# division id and the error are printed and the values already written are kept.
for division_id, division in divisions.items():
    try:
        divisions_df.at[division_id, "hiring_history"] = division["Hiring History"]
        for graph in division["graphs"]:
            title = graph["title"]
            if title.startswith("Hires by Faculty"):
                faculty_points = graph["series"][0]["data"]
                divisions_df.at[division_id, "hires_by_faculty"] = convert_graph_json_to_dict(faculty_points)
            elif title.startswith("Hires by Student Work Term Number"):
                work_term_points = graph["series"][0]["data"]
                divisions_df.at[division_id, "hires_by_work_term_number"] = convert_graph_json_to_dict(work_term_points)
            elif title.startswith("Most Frequently Hired Programs"):
                divisions_df.at[division_id, "most_frequently_hired_programs"] = convert_graph_json_to_dict2(graph)
            elif title.startswith("Overall Work Term Satisfaction"):
                divisions_df.at[division_id, "overall_work_term_satisfaction"] = graph["series"][0]["data"]
            elif title.startswith("Average Rating by Question"):
                ratings = graph["series"][0]["data"]
                for position, column in enumerate(rating_columns):
                    divisions_df.at[division_id, column] = ratings[position]
            else:
                print(title)

    except Exception as e:
        print(division_id)
        print(e)
        print()

In [None]:
# The SentenceTransformer import in the first cell is commented out, so import
# the class here; otherwise this cell raises NameError on a fresh kernel.
from sentence_transformers import SentenceTransformer

# Load the embedding model and move it to the selected device.
# NOTE(review): trust_remote_code=True executes Python files downloaded from the
# model repository. No revision is pinned, so updated remote code is picked up
# automatically; consider pinning a reviewed revision.
model = SentenceTransformer('nomic-ai/nomic-embed-text-v1', trust_remote_code=True).to(device)

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-bert-2048:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
!!!!!!!!!!!!megablocks not available, using torch.matmul instead
  state_dict = loader(resolved_archive_file)
<All keys matched successfully>
