In [None]:
# Standard library imports
import gc
import json
import os
import re
import time
import urllib3
import csv

# Third-party library imports
import numpy as np
import openai
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from glob import glob
from sklearn.model_selection import train_test_split


# Set up the OpenAI organization and API key for subsequent API calls.
# Credentials are taken from the environment (OPENAI_ORGANIZATION / OPENAI_API_KEY) so that
# real secrets never have to be saved inside the notebook. The redacted placeholders are
# only fallbacks and will not authenticate; export the variables before running.
openai.organization = os.environ.get("OPENAI_ORGANIZATION", "org-**********************")
api_key = os.environ.get("OPENAI_API_KEY", "sk-**********************")
openai.api_key = api_key



In [None]:
### Load the job posting data, reading only the '工作名称' (job title) column.
# Rows that cannot be parsed are skipped instead of aborting the load.
df = pd.read_csv(
    'PATH/charac_posting.csv',
    usecols=['工作名称'],
    encoding="utf_8_sig",
    on_bad_lines='skip',
)

In [None]:
### Clean the titles: keep only job titles that are shared by enough postings.
# Tally how many postings carry each distinct job title.
df_counted = df['工作名称'].value_counts()
# Retain the titles that occur strictly more than 5 times.
# NOTE(review): the stated intent is to drop titles seen "less than 5 times", but `> 5`
# also drops titles seen exactly 5 times - confirm which cutoff is wanted.
df_filtered = df_counted[df_counted > 5]
# Total number of postings covered by the retained titles (the value is only displayed
# when this expression is the last line of a cell).
df_filtered.sum()
# Turn the counts Series into a two-column frame: '工作名称' (title) and 'count' (occurrences).
df_filtered = df_filtered.rename_axis('工作名称').reset_index(name='count')

In [None]:
### Prepare the job title classification: the titles are written to chunk files that are later
### sent to ChatGPT in parallel (ThreadPoolExecutor) to map each title to a SOC category.
# Split the DataFrame into 300 smaller sub-DataFrames for more manageable processing.
# The split is computed on row positions and applied with .iloc rather than calling
# np.array_split on the DataFrame itself: that route goes through DataFrame.swapaxes,
# which pandas has deprecated. Chunk sizes and row order are the same as before.
N_CHUNKS = 300
row_chunks = np.array_split(np.arange(len(df_filtered)), N_CHUNKS)
sub_dfs = [df_filtered.iloc[rows] for rows in row_chunks]

# Export each sub-DataFrame to its own CSV file (numbered from 1) for further processing.
for i, sub_df in enumerate(sub_dfs):
    sub_df.to_csv(f'PATH/title_{i+1}.csv', index=True, encoding = "utf_8_sig")
def classify_job_title(job_title, api_key, timeout=60):
    """Ask the OpenAI chat completions API for the SOC code matching a job title.

    Args:
        job_title: Job title text, inserted verbatim into the prompt.
        api_key: OpenAI API key, sent as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up. Without a
            timeout a stalled connection would block its worker thread forever.

    Returns:
        The text content of the first choice in the model's reply, or the string
        'N/A' if the request fails for any reason (the error is printed).
    """
    url = 'https://api.openai.com/v1/chat/completions'
    headers = {'Content-Type': 'application/json',
               'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-3.5-turbo-0301',
            'messages':[
                {
                'role': 'user', 
                'content': f'The most likely Standard Occupational Classification title and code the occupation fall into: "{job_title}", only show the SOC code.'
                }
                       ]
            }
    try:
        # NOTE(review): verify=False disables TLS certificate checking while the API key
        # travels in the headers; presumably required by the local proxy this notebook
        # configures - confirm and re-enable verification if possible.
        response = requests.post(url, headers=headers, json=data, verify=False, timeout=timeout)
        if response.status_code != 200:
            # Error bodies are not guaranteed to be JSON (e.g. a proxy/gateway HTML page),
            # so fall back to the status code and a snippet of the raw body.
            try:
                message = json.loads(response.text)['error']['message']
            except (ValueError, KeyError, TypeError):
                message = f'HTTP {response.status_code}: {response.text[:200]}'
            raise Exception(message)
        return json.loads(response.text)['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort by design: one failed title must not abort the whole batch.
        print(f'Error occurred: {e}')
        return 'N/A'

In [None]:
# Run the job title classification in parallel using Python's ThreadPoolExecutor

# Silence the InsecureRequestWarning raised for requests made with verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Send both HTTP and HTTPS traffic through the local proxy
for proxy_var in ("http_proxy", "https_proxy"):
    os.environ[proxy_var] = "http://127.0.0.1:10809"

# One pool of 30 worker threads is reused for every chunk file.
# NOTE(review): each result overwrites its input chunk file in PATH/, while the combine
# step globs PATH/title_mapped - confirm the files are moved there in between.
with ThreadPoolExecutor(max_workers=30) as executor:
    # Chunk files are numbered 1 through 300.
    for i in range(1, 301):
        filename = f'PATH/title_{i}.csv'
        df = pd.read_csv(filename, encoding="utf_8_sig", on_bad_lines='skip')

        # Fan the titles out to the pool; map() schedules every call up front and
        # yields the replies in the same order as the input titles.
        job_titles = df['工作名称'].tolist()
        soc_codes = list(executor.map(lambda title: classify_job_title(title, api_key), job_titles))

        # Attach the raw replies, then keep only the 'NN-NNNN' SOC code pattern.
        df['soc_code'] = soc_codes
        df['soc_code'] = df['soc_code'].str.extract(r'(\d{2}-\d{4})')
        # Write the labelled chunk back to disk and release the frame.
        df.to_csv(f'PATH/title_{i}.csv', encoding="utf_8_sig") 
        del df

In [None]:
### Stack every processed chunk CSV into a single DataFrame.
path = r'PATH/title_mapped' # use your path
all_files = glob(os.path.join(path, "*.csv"))
# Read each chunk, then concatenate them with a fresh 0..n-1 index.
chunk_frames = [pd.read_csv(csv_path, encoding="utf_8_sig") for csv_path in all_files]
df_title = pd.concat(chunk_frames, ignore_index=True)
# Discard titles without an extracted SOC code and keep just the title -> code mapping.
df_title = df_title.dropna(subset=['soc_code'])[['工作名称', 'soc_code']]

In [None]:
### Attach the title-based SOC labels to the raw job postings (ID, description, title).
### These labelled postings are the input for the later description-based check.
directory = 'PATH/'
# Per-file merge results are collected here and concatenated once after the loop.
# (DataFrame.append was removed in pandas 2.0 and re-copies the whole frame on every call.)
merged_parts = []

# Iterate over the posting files in the directory and merge each with 'df_title'.
for filename in os.listdir(directory):
    if not filename.startswith("job_res_"):
        continue
    f = os.path.join(directory, filename)
    df = pd.read_csv(f, encoding="utf_8_sig", on_bad_lines='skip', delimiter="?")
    # Files that name the key column '招聘ID' are aligned to '招聘主键ID'.
    df = df.rename(columns={'招聘ID': '招聘主键ID'})
    df = df[['招聘主键ID', '工作描述', '工作名称']]
    # Inner join: only postings whose title received a SOC code are kept.
    df_titleData = pd.merge(df, df_title, on='工作名称', how='inner')
    merged_parts.append(df_titleData)

if merged_parts:
    df_titleLabel = pd.concat(merged_parts)
else:
    # No posting file found: keep an empty frame with the expected columns so the
    # following cells fail on content rather than on a missing column.
    df_titleLabel = pd.DataFrame(columns=['招聘主键ID', '工作描述', '工作名称', 'soc_code'])

# Convert the 'soc_code' column to a string type for consistency.
# This runs once after the loop; inside the loop it raised KeyError on the still-empty
# frame whenever the first directory entry was not a "job_res_" file.
df_titleLabel['soc_code'] = df_titleLabel['soc_code'].astype(str)

In [None]:
### Load the crosswalk that lists all valid 2018 SOC codes. It is used as a whitelist to
### discard SOC codes returned by the model that do not exist.
df_soc = (
    pd.read_csv('PATH/2019_to_SOC_Crosswalk.csv')[['2018 SOC Code']]
    .drop_duplicates()
    .rename(columns={'2018 SOC Code': 'soc_code'})
    # Force the last character to '0' so that codes are compared at the coarser level.
    .assign(soc_code=lambda codes: codes['soc_code'].str[:-1] + '0')
    # Several detailed codes collapse onto the same coarse code; keep one row each.
    .drop_duplicates(subset=['soc_code'], keep='first')
)

In [None]:
### We set the finest level to 6 digits, which already covers 459 broad occupations. The concern with going to a more granular level is that the number of job postings per category becomes too small to train a good model.
# However, not all the broad occupations show up equally in the dataset. We keep only the broad occupations with more than 100 job postings, so that each has enough observations to train a good model.
# We end up with 408 broad occupations. We randomly sample up to 3000 job postings within each broad occupation and save them to CSV files. This sample is fed to ChatGPT to check the SOC mapping against the job descriptions.

# Fixed seed so the sample sent to the (paid) second check can be regenerated exactly.
SAMPLE_SEED = 42

# replace the last character of 'soc_code' with '0'
df_titleLabel['soc_code'] = df_titleLabel['soc_code'].str[:-1] + '0'
# merge 'df_titleLabel' with 'df_soc' on 'soc_code', keeping only codes that exist in the crosswalk
df_titlelabel = pd.merge(df_titleLabel, df_soc, on='soc_code', how='inner')
# keep only the SOC codes that have more than 100 observations
df_titlelabel = df_titlelabel.groupby('soc_code').filter(lambda x: len(x) > 100)
# save the final merged DataFrame to a csv file
df_titlelabel.to_csv('PATH/df_titleLabel.csv', index=False, encoding = "utf_8_sig", header=True, quoting=csv.QUOTE_NONNUMERIC)
# randomly select up to 3000 samples within each unique value of 'soc_code';
# one shared RandomState is advanced across groups so the draw is reproducible
sampler = np.random.RandomState(SAMPLE_SEED)
df = df_titlelabel.groupby('soc_code', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 3000), random_state=sampler)
)
df.to_csv('PATH/secondcheck_sample.csv', index=False, encoding = "utf_8_sig", header=True, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
### After the first labelling based on the job posting title, we use GPT again to verify the label against the job posting description.
def classify_job_desp(desp, job_title, api_key, timeout=60):
    """Ask the OpenAI chat completions API whether a SOC code fits a job description.

    Args:
        desp: Job description text (in Chinese), inserted verbatim into the prompt.
        job_title: Despite the name, this value is inserted into the prompt as the
            Standard Occupational Classification code to be checked.
        api_key: OpenAI API key, sent as a Bearer token.
        timeout: Seconds to wait for the HTTP request before giving up. Without a
            timeout a stalled connection would block its worker thread forever.

    Returns:
        The text content of the first choice in the model's reply (the prompt asks
        for yes or no), or the string 'N/A' if the request fails for any reason
        (the error is printed).
    """
    url = 'https://api.openai.com/v1/chat/completions'
    headers = {'Content-Type': 'application/json',
               'Authorization': f'Bearer {api_key}'}
    data = {'model': 'gpt-3.5-turbo-0301',
            'messages':[
                {
                'role': 'user', 
                'content': f"Based on this job description (in Chinese): '{desp}' Is this Standard Occupational Classification code: '{job_title}' a reasonable classification (at broad group level)? Only tell me yes or no."
                }
                       ]
            }
    try:
        # NOTE(review): verify=False disables TLS certificate checking while the API key
        # travels in the headers; presumably required by the local proxy this notebook
        # configures - confirm and re-enable verification if possible.
        response = requests.post(url, headers=headers, json=data, verify=False, timeout=timeout)
        if response.status_code != 200:
            # Error bodies are not guaranteed to be JSON (e.g. a proxy/gateway HTML page),
            # so fall back to the status code and a snippet of the raw body.
            try:
                message = json.loads(response.text)['error']['message']
            except (ValueError, KeyError, TypeError):
                message = f'HTTP {response.status_code}: {response.text[:200]}'
            raise Exception(message)
        return json.loads(response.text)['choices'][0]['message']['content']
    except Exception as e:
        # Best-effort by design: one failed posting must not abort the whole batch.
        print(f'Error occurred: {e}')
        return 'N/A'

In [None]:
### Double check on the sub-sampled dataset
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) # Disable SSL warnings #

# Load the per-SOC sample that is to be validated against the job descriptions.
df = pd.read_csv('PATH/secondcheck_sample.csv', encoding = "utf_8_sig", on_bad_lines='skip', encoding_errors='ignore')

# Create a list of (description, SOC code) pairs; 'job_titles' holds the SOC codes here.
desps = df['工作描述'].tolist()
job_titles = df['soc_code'].tolist()
desp_job_title_pairs = list(zip(desps, job_titles))

# Adapter for ThreadPoolExecutor.map, which passes a single argument per task
def classify_wrapper(args):
    """Unpack a (desp, soc_code, api_key) tuple into a classify_job_desp call."""
    return classify_job_desp(*args)

# Create a thread pool with 20 worker threads
with ThreadPoolExecutor(max_workers=20) as executor:
    # map() returns the replies in the same order as the input pairs
    true_inds = list(executor.map(classify_wrapper, [(desp, job_title, api_key) for desp, job_title in desp_job_title_pairs]))

# Append the validation results to the DataFrame and clean the data.
df['true_ind'] = true_inds
# Normalize the replies: remove '.' (e.g. "Yes." -> "Yes") and surrounding whitespace.
# regex=False makes '.' a literal period regardless of the pandas version's default;
# as a regular expression it would match every character.
df['true_ind'] = df['true_ind'].str.replace('.', '', regex=False).str.strip()
df.to_csv('PATH/secondcheck_sample_ind.csv', encoding="utf_8_sig")

In [None]:
### Generate the final dataset for model finetune

#  The first screen is based on the job title, which leaves roughly 32 million job postings.
#  The second screen is based on the job description. From those postings, we randomly sample up to 3000 observations within each unique SOC category and feed them to GPT for the second check.
#  We keep the samples that pass the second check, i.e. GPT judged the title-based SOC category reasonable given the job description. This is the first part of the final dataset.
#  The second part consists of job postings that passed the title screen but were NOT drawn for the second check. Within this pool, we randomly sample up to 5000 observations within each unique SOC category.
#  The two parts are combined into the final dataset for model finetune.
#  NOTE(review): the intended weights (1 for the checked sample, 0.5 for the random sample) are not assigned in this cell - confirm where they are set.

# Fixed seed so the random part of the final dataset can be regenerated exactly.
FINAL_SAMPLE_SEED = 42

# Every posting that went through the second check, whatever the verdict.
df_checked = pd.read_csv('PATH/secondcheck_sample_ind.csv', encoding="utf_8_sig")

# keep if true_ind is 'Yes'
df_sample = df_checked[df_checked['true_ind'] == 'Yes']

# Read the full title-labelled dataset 'df_titleLabel' from the specified CSV file.
df_titlelabel = pd.read_csv('PATH/df_titleLabel.csv', encoding = "utf_8_sig", on_bad_lines='skip', encoding_errors='ignore')

# Outer-join ALL checked postings (not only the accepted ones) with 'df_titlelabel' on '招聘主键ID'
# and add a merge indicator. Joining only the 'Yes' rows would classify postings that GPT
# rejected as "never sampled" and let them back into the final dataset.
merged_df = df_checked.merge(df_titlelabel, on='招聘主键ID', how='outer', indicator=True)

# Select rows that are only in 'df_titlelabel' (right_only), i.e. never sent to the second check.
filtered_df = merged_df.loc[merged_df['_merge'] == 'right_only', ['招聘主键ID', '工作描述_y', '工作名称_y', 'soc_code_y']]

# Rename columns in 'filtered_df' by removing the '_y' suffix added during the merge.
filtered_df = filtered_df.rename(columns={'工作描述_y': '工作描述', '工作名称_y': '工作名称', 'soc_code_y': 'soc_code'})

# Drop rows from 'filtered_df' where the '工作描述' (job description) column has null values.
filtered_df = filtered_df.dropna(subset=['工作描述'])

# Within each 'soc_code' group, randomly select up to 5000 samples from 'filtered_df'.
sampler = np.random.RandomState(FINAL_SAMPLE_SEED)
filtered_df = filtered_df.groupby('soc_code', group_keys=False).apply(
    lambda x: x.sample(min(len(x), 5000), random_state=sampler)
)

# append 'df_sample' and 'filtered_df', create a new dataframe 'df'
df = pd.concat([df_sample, filtered_df], axis = 0)

# Keep rows in 'df' where the 'soc_code' does not start with '55'.
df = df[~df['soc_code'].str.startswith('55')]
df.to_csv('PATH/est_sample.csv', index=False, encoding = "utf_8_sig", header=True, quoting=csv.QUOTE_NONNUMERIC)
