# Job Classifier System


Note: running the entire code takes about 60 minutes.
*   Import CSV: ~20 minutes
*   Train Job Type algorithm: ~20 minutes.
*   Train Job Category algorithm: ~20 minutes.


## Libraries


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# openai is pinned to 0.28 because the notebook calls the `openai.Completion` interface.
%pip install openai==0.28
%pip install -q cohere
%pip install -q tiktoken

In [None]:
# General purpose
import pandas as pd
import numpy as np
import re
import openai
import io
import random
import warnings
warnings.filterwarnings('ignore')

# Data preparation
from dateutil import parser
import string

# Modelling
import nltk
nltk.download('popular')

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split

import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

## Text Classification Algorithms

### Data Preparation


The final CSV file is saved as `updated3_postings.csv`. The code in this section is for demonstration purposes only.
To upload the final CSV and run the next sections of the notebook, use the cell below whose first line is `from google.colab import files` (loading time: ~20 minutes).

In [None]:
# Colab upload helper. `uploaded` is later indexed by file name and wrapped in
# io.BytesIO, i.e. it maps each uploaded file name to that file's raw contents.
from google.colab import files
uploaded = files.upload()

In [None]:
### Code for demonstration purposes only. ###

# Load the raw postings and give the columns the names used by the rest of the notebook
jobs_file = pd.read_csv("job_postings.csv")
new_column_names = dict(job_id='job_no', title='job_description', formatted_work_type='job_type')
jobs2 = pd.read_csv('updated_postings.csv').rename(columns=new_column_names)
jobs2.to_csv('updated2_postings.csv', index=False)

# Re-read the renamed file so that jobs3 reflects exactly what was written to disk
jobs3 = pd.read_csv('updated2_postings.csv')

# Keyword lists, one per job category. Each list holds words/phrases that indicate the
# category when found in a job title. Some keywords appear in more than one list
# (e.g. 'analyst', 'manager', 'engineer', 'systems'), and a few lists contain
# upper-case acronyms ('IT', 'SEO', 'SEM', 'HR').
it_keywords = ['software', 'developer', 'engineer', 'programmer', 'IT', 'data', 'analyst', 'database', 'network', 'web', 'cloud', 'java', 'python', 'javascript', 'frontend', 'backend', 'full stack', 'devops', 'systems', 'security']
marketing_keywords = ['marketing', 'manager', 'advertising', 'sales', 'digital', 'brand', 'social media', 'communications', 'public relations', 'content', 'market research', 'branding', 'campaign', 'SEO', 'SEM', 'analytics', 'email marketing', 'event planning', 'copywriting', 'creative']
finance_keywords = ['finance', 'accountant', 'analyst', 'auditor', 'investment', 'banking', 'financial', 'audit', 'tax', 'risk management', 'financial planning', 'credit analysis', 'treasury', 'actuary', 'forensic accounting', 'bookkeeping', 'payroll', 'financial modeling', 'wealth management']
customer_service_keywords = ['customer service', 'support', 'representative', 'client relations', 'help desk', 'call center', 'customer success', 'customer satisfaction', 'customer experience', 'service desk', 'complaint resolution', 'ticketing system', 'client support', 'client interaction', 'customer communication', 'escalation management', 'customer feedback', 'client support', 'service delivery']
healthcare_keywords = ['healthcare','care', 'nurse', 'doctor', 'medical', 'hospital', 'health', 'clinical', 'pharmacy', 'medical billing', 'health information management', 'patient care', 'nursing', 'surgery', 'medical research', 'healthcare administration', 'medical imaging', 'pharmaceutical', 'epidemiology', 'public health', 'radiology']
education_keywords = ['education', 'teacher', 'professor', 'instructor', 'tutor', 'academic', 'learning', 'school', 'curriculum development', 'educational technology', 'teaching assistant', 'classroom management', 'e-learning', 'educational research', 'student affairs', 'special education', 'educational leadership', 'pedagogy', 'instructional design']
engineering_keywords = ['engineering', 'engineer', 'mechanical', 'electrical', 'civil', 'structural', 'automation', 'chemical', 'aerospace', 'biomedical', 'industrial', 'environmental', 'process', 'systems', 'quality', 'materials', 'control systems', 'manufacturing', 'robotics', 'energy']
administration_keywords = ['administration', 'manager', 'coordinator', 'office', 'administrative', 'office manager', 'operations', 'administrative assistant', 'executive assistant', 'office administrator', 'office coordinator', 'office operations', 'office support', 'office procedures', 'office efficiency', 'office management', 'front desk', 'receptionist', 'secretary']
sales_keywords = ['sales', 'representative', 'account executive', 'business development', 'sales manager', 'sales associate', 'client acquisition', 'sales coordinator', 'sales support', 'sales operations', 'sales strategy', 'customer acquisition', 'sales funnel', 'cold calling', 'lead generation', 'sales negotiation', 'client relationship', 'sales targets', 'sales forecasting', 'sales presentation']
research_keywords = ['research', 'scientist', 'analyst', 'researcher', 'data analysis', 'research analyst', 'experimental', 'research scientist', 'statistical analysis', 'market research', 'quantitative research', 'qualitative research', 'research design', 'data collection', 'data interpretation', 'survey design', 'research methodologies', 'experimental design', 'scientific research']
creative_keywords = ['creative', 'designer', 'artist', 'writer', 'editor', 'graphic design', 'illustrator', 'content creation', 'creative director', 'art director', 'copywriter', 'visual arts', 'creative writing', 'storytelling', 'digital art', 'branding design', 'multimedia design', 'motion graphics', 'creative strategy']
legal_keywords = ['legal', 'lawyer', 'attorney', 'paralegal', 'legal advisor', 'law clerk', 'legal consultant', 'law enforcement', 'legal secretary', 'corporate law', 'criminal law', 'civil law', 'family law', 'intellectual property law', 'legal research', 'legal writing', 'court proceedings', 'contract law', 'litigation', 'legal compliance']
human_resources_keywords = ['human resources', 'HR', 'recruiter', 'personnel', 'talent acquisition', 'HR manager', 'employee relations', 'human resource management', 'staffing', 'HR coordinator', 'workforce planning', 'compensation', 'benefits administration', 'employee engagement', 'performance management', 'training and development', 'organizational development', 'HR policies', 'talent management']
manufacturing_keywords = ['manufacturing', 'production', 'operator', 'technician', 'assembly', 'manufacturing engineer', 'quality control', 'production supervisor', 'process improvement', 'lean manufacturing', 'six sigma', 'production planning', 'machine operation', 'materials management', 'maintenance technician', 'operations management', 'supply chain', 'continuous improvement', 'automation', 'product development']
consulting_keywords = ['consultant', 'advisor', 'consulting', 'expert', 'strategist', 'business consultant', 'management consulting', 'strategy consultant', 'financial consultant', 'technology consultant', 'HR consultant', 'marketing consultant', 'operations consultant', 'IT consultant', 'risk management consultant', 'organizational consultant', 'change management consultant', 'business analysis', 'business strategy', 'business process improvement']
logistics_keywords = ['logistics', 'supply chain', 'warehouse', 'dispatcher', 'shipping', 'logistics coordinator', 'inventory management', 'distribution', 'supply chain management', 'logistics manager', 'shipping and receiving', 'inventory control', 'logistics planning', 'order fulfillment', 'transportation', 'logistics operations', 'procurement', 'supply chain optimization', 'freight']
real_estate_keywords = ['real estate', 'property', 'realtor', 'broker', 'appraiser', 'real estate agent', 'property management', 'real estate development', 'commercial real estate', 'residential real estate', 'real estate finance', 'property valuation', 'real estate transactions', 'real estate law', 'real estate marketing', 'real estate investment', 'leasing agent', 'land development', 'real estate appraisal', 'property leasing']
# Category checks in priority order: the first category with a matching keyword wins,
# which keeps the precedence of an if/elif chain over the same lists.
category_keywords = [
    ('IT', it_keywords),
    ('Marketing', marketing_keywords),
    ('Finance', finance_keywords),
    ('Customer Service', customer_service_keywords),
    ('Healthcare', healthcare_keywords),
    ('Education', education_keywords),
    ('Engineering', engineering_keywords),
    ('Administration', administration_keywords),
    ('Sales', sales_keywords),
    ('Research', research_keywords),
    ('Creative', creative_keywords),
    ('Legal', legal_keywords),
    ('Human Resources', human_resources_keywords),
    ('Manufacturing', manufacturing_keywords),
    ('Consulting', consulting_keywords),
    ('Logistics', logistics_keywords),
    ('Real Estate', real_estate_keywords),
]


def _title_has_keyword(title, title_lower, keyword):
    """Return True when `keyword` occurs in the job title.

    Lower-case keywords are matched as substrings of the lower-cased title.
    Keywords containing upper-case letters (acronyms such as 'IT', 'SEO', 'HR')
    could never be found in a lower-cased title, so they are matched as whole
    words, ignoring case; whole-word matching keeps e.g. 'IT' from matching the
    letters "it" inside "security" or "recruiter".
    """
    if keyword.islower():
        return keyword in title_lower
    pattern = r'\b' + re.escape(keyword) + r'\b'
    return re.search(pattern, title, flags=re.IGNORECASE) is not None


def _categorize_title(title):
    """Return the first category whose keyword list matches `title`.

    Returns 'Uncategorized' when nothing matches or when the title is missing
    (e.g. NaN read from the CSV), instead of failing on a non-string value.
    """
    if not isinstance(title, str):
        return 'Uncategorized'
    title_lower = title.lower()
    for category, keywords in category_keywords:
        if any(_title_has_keyword(title, title_lower, keyword) for keyword in keywords):
            return category
    return 'Uncategorized'


# checking keywords in job description to assign category
jobs3['category'] = jobs3['job_description'].apply(_categorize_title)

jobs3.to_csv('updated3_postings.csv', index=False)

def is_valid_date(date_str):
    """Tell whether `date_str` can be parsed as a date.

    Args:
        date_str: Candidate text handed to `parser.parse`.

    Returns:
        True when `parser.parse` accepts the value, False when it raises.
    """
    try:
        parser.parse(date_str)
    except Exception:
        # Any parsing failure means "not a date". Catching `Exception` rather than
        # using a bare `except` lets KeyboardInterrupt / SystemExit propagate.
        return False
    return True

def date_removal(data):
    """Drop date-like words from each line of `data` and return the first cleaned line.

    Args:
        data: Sequence of text lines; every line is processed, only the first is returned.

    Returns:
        The first line with every word accepted by `is_valid_date` removed and the
        remaining words joined by single spaces.
    """
    cleaned_lines = []
    for line in data:
        kept_words = [word for word in line.split() if not is_valid_date(word)]
        cleaned_lines.append(' '.join(kept_words))
    return cleaned_lines[0]

def stemmer_and_stopWord(doc):
    """Lemmatise `doc` and drop stop words from its first 100 lemmas.

    Each token is replaced by its lemma, except when the lemma is '-PRON-' or
    'be', in which case the token's original text is kept. Only the first 100
    resulting words are checked against the vocabulary's stop-word flag.

    NOTE(review): `nlp` is not defined in the visible part of the notebook; the
    attributes used here (`lemma_`, `vocab`, `is_stop`) suggest a loaded spaCy
    pipeline - confirm it is created before this function is called.

    Args:
        doc: Text to process.

    Returns:
        The retained words joined by single spaces.
    """
    parsed = nlp(doc)
    lemmas = [
        token.text if token.lemma_ in ('-PRON-', 'be') else token.lemma_
        for token in parsed
    ]
    kept_words = [word for word in lemmas[:100] if not nlp.vocab[word].is_stop]
    return ' '.join(kept_words)

def normaliz(filtered_sentence):
    """Lower-case a sentence and keep at most its first 100 whitespace-separated words.

    Args:
        filtered_sentence: Text to normalise.

    Returns:
        The lower-cased words joined by single spaces.
    """
    tokens = filtered_sentence.split()
    return ' '.join(str(token).lower() for token in tokens[:100])

def numbers_removal(data):
    """Return `data` with every digit character removed.

    The characters of the string are inspected one by one. Wrapping the string
    in a list first would make `isdigit()` test the whole text at once, which
    leaves digits embedded in ordinary text untouched.

    Args:
        data: Text to clean.

    Returns:
        The text without digit characters.
    """
    return ''.join(char for char in data if not char.isdigit())

def punch_removal(words):
    """Remove ASCII punctuation from `words` and collapse runs of spaces.

    Args:
        words: Text to clean.

    Returns:
        The text without any character of `string.punctuation`, with every run
        of consecutive spaces replaced by a single space.
    """
    without_punctuation = words.translate(str.maketrans('', '', string.punctuation))
    return re.sub(' +', ' ', without_punctuation)

def cleaner(data):
    """Run the full text-cleaning pipeline on a single piece of text.

    Steps, in order: date removal, punctuation removal, lemmatisation with
    stop-word filtering, and lower-casing/truncation.

    Args:
        data: Raw text to clean.

    Returns:
        The cleaned text produced by the last step.
    """
    text = date_removal([data])
    text = punch_removal(text)
    text = stemmer_and_stopWord(text)
    return normaliz(text)

### Modelling


In this section, we prepare the input to our models by importing the job data, engineering features, and tokenizing the text.

In [None]:
# Load the labelled postings from the contents of the file uploaded earlier
csv_bytes = uploaded['updated3_postings.csv']
train_df = pd.read_csv(io.BytesIO(csv_bytes))

In [None]:
# Feature engineering: simple length statistics of each job description
train_df['word_count'] = train_df["job_description"].apply(lambda x: len(str(x).split(" ")))
train_df['char_count'] = train_df["job_description"].apply(lambda x: sum(len(word) for word in str(x).split(" ")))
train_df['sentence_count'] = train_df["job_description"].apply(lambda x: len(str(x).split(".")))

# Averages are computed once, vectorised, and stored under correctly spelled names;
# a zero denominator yields 0 instead of inf/NaN
train_df['avg_word_length'] = (
    train_df['char_count'] / train_df['word_count']
).where(train_df['word_count'] != 0, 0)
train_df['avg_sentence_length'] = (
    train_df['word_count'] / train_df['sentence_count']
).where(train_df['sentence_count'] != 0, 0)


# Hyperparameter specifying the limit of words in the vocabulary of the model
MAX_NB_WORDS = 29000

# Tokenizer with OOV (out of vocabulary); the backslash in the filter list is escaped
# explicitly so the literal holds the same characters without an invalid escape sequence
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True, oov_token='OOV')

# Missing descriptions become empty strings so the tokenizer only ever receives text
job_texts = train_df['job_description'].fillna('').astype(str).values
tokenizer.fit_on_texts(job_texts)

# Hyperparameter specifying the max number of words in each job description
MAX_SEQUENCE_LENGTH = 100

# Convert job descriptions into sequences of integers and apply padding to ensure consistent dimensions
X_descr = tokenizer.texts_to_sequences(job_texts)
X_descr = pad_sequences(X_descr, maxlen = MAX_SEQUENCE_LENGTH)

# Size of the embedding table used by the models below: word indices start at 1
# (index 0 is the padding value, hence the +1) and the tokenizer never emits an
# index >= num_words, so the table never needs more than the configured limit.
MAX_NB_WORDS = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)

#### Training for Job Type


In this sub-section, we design and train a model for predicting the job type given a job description.

In [None]:
# Job types
job_type = {
    'Full-time':0,
    'Contract':1,
    'Part-time':2,
    'Temporary':3,
    'Internship':4,
    'Other':5,
    'Volunteer':6,
    }
NUM_JOB_TYPES = len(job_type)

# Fixed seed so the data split and the weight initialisation are repeatable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tensorflow.random.set_seed(SEED)

# Setting target labels: map every job type to its integer id and one-hot encode it
# against the full class list, so column i always means class i and the width always
# matches the output layer, even if a class is absent. Missing/unknown types count as 'Other'.
type_ids = train_df['job_type'].map(job_type).fillna(job_type['Other']).astype(int).to_numpy()
Y_type = np.eye(NUM_JOB_TYPES, dtype='float32')[type_ids]

# Seeded random data splitting for training and evaluating
X_train_type, X_test_type, Y_train_type, Y_test_type = train_test_split(
    X_descr, Y_type, test_size = 0.20, random_state = SEED)

# Class weights for the loss, inversely proportional to each class's frequency in the
# training split, so rare job types are not drowned out by frequent ones.
# Classes absent from the training split get a neutral weight of 1.
train_type_counts = Y_train_type.sum(axis=0).tolist()
class_weights = {
    class_id: (len(Y_train_type) / (NUM_JOB_TYPES * count) if count > 0 else 1.0)
    for class_id, count in enumerate(train_type_counts)
}

# Hyperparameter specifying the size of the vector for each word in the Embedding layer
EMBEDDING_DIM = 100

# Creating a sequential model
job_model = Sequential()
job_model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length = X_train_type.shape[1]))
job_model.add(LSTM(100, dropout = 0.3, recurrent_dropout = 0.3, return_sequences=True))
job_model.add(LSTM(80, dropout = 0.3, recurrent_dropout = 0.3))
job_model.add(Dense(128, activation = 'relu'))
job_model.add(Dropout(0.3))
job_model.add(Dense(NUM_JOB_TYPES, activation = 'softmax'))
job_model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Training and evaluating the model
epochs = 5
batch_size = 64
history = job_model.fit(
    X_train_type,
    Y_train_type,
    epochs = epochs,
    batch_size = batch_size,
    validation_split = 0.2,
    callbacks = [EarlyStopping(monitor = 'val_loss', patience = 3, min_delta = 0.0001)],
    class_weight = class_weights)

print("Evaluating model on test set...")

accr = job_model.evaluate(X_test_type, Y_test_type)

print('Test set:    Loss: {:0.3f}, Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating model on test set...
Test set:    Loss: 0.607, Accuracy: 0.819


#### Training for Job Category

In this sub-section, we design and train a model for predicting the job category given a job description.

In [None]:
# Job categories ('Uncategorized' is spelled exactly as the label stored in the CSV)
job_cat = {
    'IT': 0,
    'Marketing': 1,
    'Finance': 2,
    'Customer Service': 3,
    'Healthcare': 4,
    'Education': 5,
    'Engineering': 6,
    'Administration': 7,
    'Sales': 8,
    'Research': 9,
    'Creative': 10,
    'Legal': 11,
    'Human Resources': 12,
    'Manufacturing': 13,
    'Consulting': 14,
    'Logistics': 15,
    'Real Estate': 16,
    'Uncategorized': 17
}
NUM_CATEGORIES = len(job_cat)

# Fixed seed so the data split and the weight initialisation are repeatable
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tensorflow.random.set_seed(SEED)

# Setting target labels: map every category to its integer id and one-hot encode it
# against the full class list, so column i always means class i and the width always
# matches the output layer. Missing/unknown categories count as 'Uncategorized'.
category_ids = train_df['category'].map(job_cat).fillna(job_cat['Uncategorized']).astype(int).to_numpy()
Y_cat = np.eye(NUM_CATEGORIES, dtype='float32')[category_ids] # target variable

# Seeded random data splitting for training and evaluating
X_train_cat, X_test_cat, Y_train_cat, Y_test_cat = train_test_split(
    X_descr, Y_cat, test_size=0.20, random_state=SEED)

# Class weights for the loss, inversely proportional to each class's frequency in the
# training split, so rare categories are not drowned out by frequent ones.
# Classes absent from the training split get a neutral weight of 1.
train_cat_counts = Y_train_cat.sum(axis=0).tolist()
class_weights = {
    class_id: (len(Y_train_cat) / (NUM_CATEGORIES * count) if count > 0 else 1.0)
    for class_id, count in enumerate(train_cat_counts)
}

# Hyperparameter specifying the size of the vector for each word in the Embedding layer
EMBEDDING_DIM = 300

# Creating a sequential model
cat_model = Sequential()
cat_model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_descr.shape[1]))
cat_model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.5))
cat_model.add(Dense(128, activation='relu'))
cat_model.add(Dropout(0.5))
cat_model.add(Dense(NUM_CATEGORIES, activation='softmax'))
cat_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training and evaluating the model
epochs = 5
batch_size = 64
history_cat = cat_model.fit(
    X_train_cat,
    Y_train_cat,
    epochs = epochs,
    batch_size = batch_size,
    validation_split = 0.2,
    callbacks = [EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
    class_weight = class_weights)

print('Evaluating model on test set...')

accr = cat_model.evaluate(X_test_cat, Y_test_cat)

print('Test set:    Loss: {:0.3f}, Accuracy: {:0.3f}'.format(accr[0], accr[1]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating model on test set...
Test set:    Loss: 0.102, Accuracy: 0.977


## Letter Text Parser (with ChatGPT)


### Fine tuning

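In this sub-section we connect to OpenAI's API and configure the model with a system prompt that describes its task (the model is steered through the prompt; its weights are not fine-tuned).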

In [None]:
# Do not paste the key into the notebook: set the OPENAI_API_KEY environment variable
# before importing openai. The openai package (v0.28) reads that variable into
# `openai.api_key` at import time, so a key supplied that way is kept here and only
# a missing key falls back to the empty string.
openai.api_key = openai.api_key or ''


# Seed conversation for the model. Only the system message is specified: a general
# attribute describing the context and goal of the model.
messages = [

    {"role": "system",
     "content": """You receive as input a motivational or statement or reference letter written by the user for a job application.
        Internally, extract extract these features: Job Type (full time, part time, hybrid), Job Title, Job Position (intership, entry level, associate, midsenior level, director, executive),
        Subject of education obtained (excluding univeristy name), Professional experiences, International experiences, Intrapersonal skills, Computer software tools known, Passions, and Distinguishable features.
        Then, write a brief job description based on the features you extracted."""},
    ]

### Usage

In this sub-section, we define a function that takes a letter as input and outputs its key characteristics.

In [None]:
# Function for extracting the key information from the letter
def convert_letter(message):
    """Ask the OpenAI completion endpoint for a brief job description of a letter.

    The prompt is the content of every entry of the module-level `messages`
    list followed by the letter, joined by newlines.

    Args:
        message: Text of the letter supplied by the user.

    Returns:
        The stripped text of the first completion choice, or None when
        `message` is empty.
    """
    if not message:
        return None

    # Build the prompt from a per-call list: appending to the shared `messages`
    # would carry every earlier letter into the prompts of later calls.
    conversation = messages + [{"role": "user", "content": message}]

    # NOTE(review): `text-davinci-003` and the Completion endpoint are legacy -
    # confirm the model is still served before relying on this call.
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt="\n".join(msg['content'] for msg in conversation),
        temperature = 0.3, # parameter for randomness of response
        max_tokens = 250, # parameter for length of response
        stop=None)

    return response.choices[0].text.strip()

Note for the block below: the API call may return an empty reply, in which case nothing is printed. If that happens, run the block again with the same input.

In [None]:
# Usage of OpenAI's API: read a letter from the user and show the extracted description
message = input("User: ")

# The reply doubles as the job description fed to the classifiers
job_description = reply = convert_letter(message)
print(reply)

## Predicting job type and category

In this section we pass the information extracted from the letter to the trained models and predict the ideal job type and category for the user.

In [None]:
# Preparing input from motivational letter (an empty/missing reply is treated as empty text)
seq = tokenizer.texts_to_sequences([job_description or ''])
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)

# Predicting job type: one label per output unit of job_model, in class-id order
pred_job = job_model.predict(padded)
labels_job = ['Full-time', 'Contract', 'Part-time', 'Temporary', 'Internship', 'Other', 'Volunteer']
index_job = int(np.argmax(pred_job, axis=1)[0])
job_pred = [labels_job[index_job]]

# Predicting job category: cat_model has 18 output units, so the list includes the
# 'Uncategorized' class (id 17); without it that prediction would have to be
# reported under another category's name.
pred_cat = cat_model.predict(padded)
labels_cat = ['IT', 'Marketing', 'Finance', 'Customer Service', 'Healthcare', 'Education', 'Engineering', 'Administration',
              'Sales', 'Research', 'Creative', 'Legal', 'Human Resources', 'Manufacturing', 'Consulting', 'Logistics', 'Real Estate',
              'Uncategorized']
index_cat = int(np.argmax(pred_cat, axis=1)[0])
category_pred = [labels_cat[index_cat]]

# Saving and visualising results
pred_results = pd.DataFrame({'job_type_pred': job_pred, 'job_cat_pred': category_pred})
pred_results