In [1]:
# This script processes job posting data from various sources. It reads data files, cleans and filters the data based on specific criteria (like removing duplicates and filtering by source), and finally extracts and saves job titles and descriptions for further analysis. Requires input data in .dat format and outputs processed data in CSV format.

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from datetime import datetime
import gc, json, csv, re, os, glob

In [2]:
def read_dat_data(curFile):
    """
    Read one raw job-posting ``.dat`` file into a DataFrame.

    The file is read without a header row; the 15 column names below are
    assigned positionally. Fields are split on the two-character token '@!'
    (a multi-character separator, hence the python parsing engine) and quote
    characters are treated as ordinary text (``csv.QUOTE_NONE``).

    Malformed lines are skipped with a warning instead of aborting the read.
    ``on_bad_lines='warn'`` is the replacement for the former
    ``error_bad_lines=False`` argument, which newer pandas releases no longer
    accept.

    Parameters:
    curFile (str): File path of the .dat file to be read.
    Returns:
    pandas.DataFrame: Contains the job posting data with specified column names.
    """
    colNames = ['招聘主键ID','公司ID','公司名称','城市名称','公司所在区域','工作薪酬','教育要求','工作经历',
                '工作描述','职位名称','工作名称','招聘数量','发布日期','行业名称','数据来源']
    resCSV = pd.read_csv(
        curFile,
        header=None,
        index_col=None,
        names=colNames,
        encoding='utf-8',
        quoting=csv.QUOTE_NONE,
        sep="@!",
        on_bad_lines='warn',
        engine='python',
    )
    return resCSV

In [None]:
# Cleaning pipeline applied to every raw file by the processing loop:
#   1. Fill missing job names ('工作名称') from the position name ('职位名称').
#   2. Drop rows whose publication date ('发布日期') is missing.
#   3. Drop part-time postings ('兼职').
#   4. De-duplicate within a calendar month, using '公司ID', '工作名称' and
#      '城市名称' as the identifying fields.
#   5. Keep only postings from the major job websites, based on '数据来源'.

# Path template of the raw inputs; the %s slot receives the file number.
dataNameTmp = "PATH/job_posting_%s.dat"

# Running totals over all files: rows left after filtering (naSum) and
# rows left after de-duplication (dupSum).
naSum = dupSum = 0

Start
2021-10-25 10:57:41


In [1]:
# Job sites whose postings are kept; matched exactly against '数据来源'.
# Hoisted out of the loop because it is the same for every file.
majorSources = ['智联招聘', '前程无忧', '拉勾网', 'BOSS直聘', '58同城', '猎聘网',
                '看准网', '百姓网', '猎聘', '赶集网', 'BOSS']

for i in range(1, 142):
    # Process each job posting file: clean, filter, de-duplicate and save.
    curFile = dataNameTmp % i

    print(curFile," is Grouping Computing...")
    datDf = read_dat_data(curFile)

    # The literal '\N' is treated as a missing value in every column. Use
    # np.nan: the np.NaN alias no longer exists in NumPy 2.0 and later.
    # Rows without a publication date are dropped.
    datDf = datDf.replace(r'\N', np.nan).dropna(subset=['发布日期'])

    # Fall back to the position name where the job name is missing.
    datDf['工作名称'] = datDf['工作名称'].fillna(datDf['职位名称'])

    # Drop rows whose job name is exactly "兼职" (titles that merely contain
    # the word are kept) and keep only the whitelisted sources.
    # .copy() gives an independent frame so adding 'date' below does not
    # raise SettingWithCopyWarning.
    keepMask = (datDf['工作名称'] != "兼职") & datDf['数据来源'].isin(majorSources)
    datDf = datDf[keepMask].copy()

    curNa = datDf.shape[0]
    naSum += curNa

    # First 7 characters of the publication date, used as the month key
    # (assumes dates start with 'YYYY-MM' -- TODO confirm against the raw data).
    datDf['date'] = datDf['发布日期'].astype(str).str[:7]
    datDf = datDf.drop_duplicates(subset=['公司ID', '工作名称', '城市名称', 'date'], keep='first').reset_index(drop=True)

    curDup = datDf.shape[0]
    dupSum += curDup

    print(curFile,"删除空值剩余: %s"%curNa, "去重复值剩余：%s"%curDup)
    # Written WITH a header row and with the extra 'date' column; '?' is the
    # field delimiter.
    datDf.to_csv('PATH/job_res_{}.csv'.format(i), sep='?', encoding='utf_8_sig', index=False)

In [None]:
# From the master data, split off the "job description" text so that the rest
# of the data stays manageable.

directory = 'PATH/'

# Columns kept in the characteristics files: every original column except
# '工作描述' (the helper 'date' column is not carried over either).
characCols = ['招聘主键ID', '公司ID', '公司名称', '城市名称', '公司所在区域', '工作薪酬', '教育要求',
              '工作经历', '职位名称', '工作名称', '招聘数量', '发布日期', '行业名称', '数据来源']

# iterate over files in that directory
for filename in os.listdir(directory):
    # only the cleaned per-file outputs, whose names start with "job_res_"
    if filename.startswith("job_res_"):
        f = os.path.join(directory, filename)
        # header=0: the job_res_ files are written with a header row, so the
        # first line holds the column names. Reading with header=None would
        # turn that line into a bogus data record.
        df = pd.read_csv(f, encoding="utf_8_sig", on_bad_lines='skip', delimiter="?",
                         header=0, encoding_errors='ignore')
        df_charac = df[characCols]

        # export the data to csv, use the header and set the encoding to utf-8
        # NOTE(review): if this target directory is the same as `directory`,
        # the input file is overwritten in place -- confirm the real paths.
        df_charac.to_csv('PATH/{}'.format(filename), encoding="utf_8_sig", header=True)

In [None]:
# Stack the per-file characteristic tables and pull out every job title
# ('工作名称'); the resulting list is what gets fed to ChatGPT.
os.chdir("PATH")
extension = 'csv'
all_filenames = glob.glob('*.{}'.format(extension))


def _read_titles(csv_path):
    """Load only the '工作名称' column of one characteristics CSV, skipping bad lines."""
    return pd.read_csv(csv_path, encoding = "utf_8_sig", on_bad_lines='skip', usecols = ['工作名称'])


# combine all files in the list into one frame with a fresh 0..n-1 index
combined_csv = pd.concat([_read_titles(name) for name in all_filenames], ignore_index=True)
# This is the complete list of job posting titles
combined_csv.to_csv('PATH/charac_posting.csv', index=False, header=True)
# Save all the "job description" data: for every cleaned file keep only the
# posting ID, the company ID and the description text.

directory = 'PATH/'
# iterate over files in that directory
for filename in os.listdir(directory):
    # only the cleaned per-file outputs, whose names start with "job_res_"
    if filename.startswith("job_res_"):
        f = os.path.join(directory, filename)
        # header=0: the job_res_ files are written with a header row, so the
        # first line holds the column names. Reading with header=None would
        # turn that line into a bogus data record.
        df = pd.read_csv(f, encoding="utf_8_sig", on_bad_lines='skip', delimiter="?",
                         header=0, encoding_errors='ignore')
        df_desp = df[['招聘主键ID', '公司ID', '工作描述']]
        # NOTE(review): hard-coded absolute Windows path, unlike the 'PATH'
        # placeholder used elsewhere -- confirm / make configurable.
        df_desp.to_csv('E:/Data/job_posting/processed/description/{}'.format(filename))

        # Release the frames before loading the next file to limit memory use.
        del df_desp
        del df
        gc.collect()

### Note: 

- The cell below identifies the data sources (websites) present in the raw files. The sources have already been determined, so it does not need to be run again.

In [None]:
# Tally how many postings each data source ('数据来源') contributes in the raw
# files and rank the sources by that count.
# Author's stated intent: limit the data to the top job-posting websites
# (presumably so that source names never need fuzzy matching -- confirm).
dataNameTmp = "PATH/job_posting_%s.dat"

df_list = []

for i in range(1,3):
    curFile = dataNameTmp%i

    print(curFile," is Grouping Computing...")
    datDf = read_dat_data(curFile)

    # Attach to every row the number of rows sharing its '数据来源', then keep
    # the first row of each source and only the (source, count) pair.
    datDf = (
        datDf
        .assign(count=datDf.groupby('数据来源')['数据来源'].transform('count'))
        .drop_duplicates(subset=['数据来源'], keep='first')
        .reset_index(drop=True)
        [['数据来源', 'count']]
    )

    df_list.append(datDf)

# Add up the per-file counts of each source and sort, largest first.
final_df = (
    pd.concat(df_list)
    .groupby('数据来源')
    .sum()
    .reset_index()
    .sort_values(by=['count'], ascending=False)
)
final_df.head(20)