<a href="https://colab.research.google.com/github/milazudina/ds4a_team36/blob/main/impute_job_type.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import spacy
import random
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

# Fetch the NLTK stop-word corpus so that stopwords.words() can read it.
nltk.download('stopwords')

# Shared resources for the text-cleaning cells below: the English stop-word set
# and the small English spaCy pipeline (used for noun-chunk extraction).
STOPWORDS = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Code below does the following:

1.   Load the Kaggle job description (JDs) dataset
  *  Remove all the job postings that are missing either a JD or a Job Type
  *  Extract a unique set of noun phrases for each Job Posting

2.   Build a deep learning model and assess its accuracy

3.   Predict the Job Types for the scraped job descriptions



In [None]:
# Load the labelled job postings; later cells read the `Description` and
# `Job_Type` columns of this frame.
kaggle_df = pd.read_csv(filepath_or_buffer="indeed_job_dataset.csv")

In [None]:
# 302 jobs do not have a description that we will use to infer the job type, hence
# let's remove these from the dataframe. Rows without a Job_Type label are dropped
# as well, because they cannot be used to train or evaluate the classifier.
# reset_index() is chained (rather than applied in place to the filtered slice) so
# that the result is a fresh frame with a clean 0..n-1 index.
kaggle_df = (
    kaggle_df
    .dropna(subset=["Description", "Job_Type"])
    .reset_index(drop=True)
)

# A bare `kaggle_df.shape` in the middle of a cell is never displayed, so print it.
print(kaggle_df.shape)
print(kaggle_df.isnull().sum(axis=0))
kaggle_df.head(10)

In [None]:
# tidy up the job description text
#
# Each description is lower-cased, stripped of line breaks, HTML tag remnants,
# punctuation, digits and stop words, and then reduced to its unique noun phrases.
# The results are gathered in a list and written back with a single column
# assignment: per-row chained assignment (df[col][i] = value) triggers pandas'
# SettingWithCopyWarning and is not guaranteed to modify the frame.

tidy_descriptions = []

for job_description in kaggle_df['Description']:

    ## Clean the text
    # Line breaks and HTML tag remnants. The replacement order is kept exactly as
    # it was, because earlier removals change what later ones can match.
    job_description = (
        job_description.lower()
        .replace("\n", "").replace("</b>", "").replace("</p>", "")
        .replace("<b>", "").replace("<p>", "")
        .replace("</li>", "").replace("</ul>", "").replace("<li>", "").replace("<ul>", "")
        .replace("<i>", "").replace("</i>", "").replace("\r", "").replace("<div>", "")
        .replace("h2", "").replace("h3", "").replace("h1", "")
        .replace("</h1>", "").replace("</div>", "").replace("/h2", "").replace("/h3", "")
    )
    # Hyphens and slashes become spaces; the articles "a"/"an" and all digits are dropped.
    job_description = re.sub("-", " ", job_description)
    job_description = re.sub("/", " ", job_description)
    job_description = re.sub(" a ", " ", job_description)
    job_description = re.sub(" an ", " ", job_description)
    job_description = re.sub("[0-9]", "", job_description)
    # Remaining punctuation and abbreviations ("e.g." has to go before "." is replaced).
    job_description = (
        job_description
        .replace(" the ", " ").replace(")", "").replace("(", "").replace("e.g.", "")
        .replace("£", "").replace("$", "").replace("%", "").replace("e g", "")
        .replace(".", " ").replace(",", " ").replace(":", "").replace(";", "")
        .replace("?", "").replace("*", "").replace(" eg ", "").replace(">", "").replace("<", "")
    )
    job_description = " ".join(w for w in job_description.split(" ") if w not in STOPWORDS)

    # Keep each noun phrase once; np.unique also returns them sorted.
    text = nlp(job_description)
    noun_phrases = np.unique(np.array([chunk.text for chunk in text.noun_chunks]))
    job_description = " ".join(noun_phrases)

    tidy_descriptions.append(job_description)

# One tidy string per row, in row order, so a plain list assignment lines up.
kaggle_df["Description_tidy"] = tidy_descriptions


NameError: ignored

In [None]:
# Show the first posting before and after tidying, separated by blank lines.
print(kaggle_df['Description'][0], end="\n\n\n")
print(kaggle_df['Description_tidy'][0])

[<p><b>POSITION SUMMARY</b></p>, <p>
The Business Analyst role is the primary architect of reporting and dashboard solutions for internal and external clients. Utilizing ESI corporate standard development tools this position is responsible for the design, development, implementation, analysis, interpretation and communication of business information based on the needs of individual clients. The ability to balance overall aesthetics with robust and intuitive functionality is a critical requirement for success in this position.</p>, <p><b>
ESSENTIAL FUNCTIONS</b></p>, <ul><li>
Successfully design and implement external client data reporting and dashboard solutions with a strong focus on product aesthetics and functionality.</li><li>
Aid in the design, development, and implementation of new product ideas for external and internal clients.</li><li>
Maintain Live and Data Warehouse Business Objects Universes; add new fields, modify table joins, implement data structures that strea

In [None]:
# Keep only the rows whose tidy description is not the integer 0.
has_tidy_text = kaggle_df.Description_tidy != 0
kaggle_df = kaggle_df[has_tidy_text]
kaggle_JDs = np.asarray(kaggle_df["Description_tidy"])

# labels: print the class counts, then keep them as a plain Python list
y = kaggle_df['Job_Type']
print(y.value_counts())
y = y.tolist()

# Fit the tokenizer on the tidy descriptions; the vocabulary size is the number
# of entries in its word index plus one.
t = Tokenizer()
t.fit_on_texts(kaggle_JDs)
vocab_size = 1 + len(t.word_index)
print(vocab_size)

# Turn every description into a sequence of integer token ids.
encoded_JDs = t.texts_to_sequences(kaggle_JDs)
print(len(encoded_JDs))

# Post-pad every sequence to the length of the longest one.
list_len = list(map(len, encoded_JDs))
max_length = max(list_len)
print(max_length)
padded_JDs = pad_sequences(encoded_JDs, padding='post', maxlen=max_length)


data_scientist    2376
data_analyst      1535
data_engineer     1309
Name: Job_Type, dtype: int64
36292
5220
915


In [None]:
# Preview a few tidy descriptions, truncated, instead of printing the whole array
# (each element is a long string, so the full dump floods the output).
for tidy_jd in kaggle_JDs[:3]:
    print(tidy_jd[:300])

['ability advance career company bachelor’s degree related field   years experience better health   express scripts leading healthcare company better health outcomes br   bachelor’s degree related field   years experience business objects creative committed creating systems service solutions creativity  integrity customer value technical innovation data structures data universe structures data visualization applications degree master department  express scripts design implementation process development development  implementation new product ideas external internal clients document best practices points easier people esi corporate standard development tools essential functions evaluation potential integration new development tools express scripts equal opportunity employer disability veteran focus product aesthetics functionality aid design hard work highly competitive base salary comprehensive benefits program implement implementation  analysis  interpretation communication business i

In [None]:
def make_dummy_var(y, classes=('data_scientist', 'data_analyst', 'data_engineer')):
  """One-hot encode job-type labels.

  Args:
    y: sized sequence of labels (list, tuple, NumPy array or pandas Series).
    classes: ordered class names; column ``j`` of the result corresponds to
      ``classes[j]``. Defaults to the three job types used in this notebook.

  Returns:
    A float array of shape ``(len(y), len(classes))`` holding a single 1 per
    row. A label that is not in ``classes`` yields an all-zero row.
  """
  column_of = {label: col for col, label in enumerate(classes)}
  temp = np.zeros([len(y), len(classes)])
  # enumerate() walks the values positionally, so a pandas Series whose index is
  # not 0..n-1 is encoded correctly (y[i] would be a lookup by index label).
  for row, label in enumerate(y):
    col = column_of.get(label)
    if col is not None:
      temp[row, col] = 1
  return temp

y_onehot = make_dummy_var(y)

# A fixed seed makes the split (and the accuracies reported further down)
# reproducible; stratifying on the labels keeps the class shares the same in the
# training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    padded_JDs, y_onehot, test_size=0.33, random_state=42, stratify=y)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Share of each class (one column of the one-hot matrix) in train, then in test.
for class_col in range(y_onehot.shape[1]):
  for labels in (y_train, y_test):
    print(float((labels[:, class_col] == 1).sum()) / len(labels))

print(y_train)

(3497, 915)
(1723, 915)
(3497, 3)
(1723, 3)
0.4566771518444381
0.4521183981427742
0.2942522161853017
0.293673824724318
0.24907063197026022
0.2542077771329077


In [None]:
# Run this only if using pretrained embedding matrix

# Width of the GloVe vectors in glove.6B.100d.txt; it must match the Embedding layer.
embedding_dim = 100

embeddings_index = dict()
# can download from https://nlp.stanford.edu/projects/glove/
# `with` closes the file even if parsing fails. The encoding is explicit so the
# result does not depend on the platform default (the GloVe download is assumed
# to be UTF-8 - confirm for your copy).
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # each line is: <word> <coef_1> ... <coef_n>
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
# (rows of words without a GloVe vector stay all-zero)
embedding_matrix = np.zeros((vocab_size, embedding_dim))
count = 0
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        count = count + 1
        embedding_matrix[i] = embedding_vector

# number of vocabulary words that were found in GloVe
print(count)

glove_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)

In [None]:
# CNN text classifier: embedding -> 1D convolution -> max pooling -> dense layers
# -> 3-way softmax (one output per job type).
model = Sequential()
# model.add(glove_layer) # uncomment this only if using pretrained embedding matrix
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=144, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(72, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(3, activation='softmax'))

Adam = keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=Adam, loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# fit the model
model.fit(X_train, y_train, epochs=20, verbose=1, batch_size=32)

# evaluate the model on the training data ...
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

# ... and on the held-out test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Test accuracy: %f' % (test_accuracy*100))

# With pre-trained embedding:
# Accuracy: 99.779373
# 56/56 [==============================] - 3s 51ms/step - loss: 1.1872 - accuracy: 0.8075
# Test accuracy: 80.749857

# With embedding trained on the kaggle set.
# Accuracy: 99.685442
# 54/54 [==============================] - 7s 129ms/step - loss: 0.7627 - accuracy: 0.8601
# Test accuracy: 86.012769

The section below repeats some of the steps from the section above, but for the combined Kaggle + Indeed datasets.

In [None]:
indeed_df = pd.read_csv("df_UK_2021-10-04_with_skills.csv")

# tidy up the job description text
#
# Each description is lower-cased, stripped of line breaks, HTML tag remnants,
# punctuation, digits and stop words, and then reduced to its unique noun phrases.
# The results are gathered in a list and written back with a single column
# assignment: per-row chained assignment (df[col][i] = value) triggers pandas'
# SettingWithCopyWarning and is not guaranteed to modify the frame.

indeed_tidy_descriptions = []

for job_description in indeed_df['Description']:

    ## Clean the text
    # Line breaks and HTML tag remnants. The replacement order is kept exactly as
    # it was, because earlier removals change what later ones can match.
    job_description = (
        job_description.lower()
        .replace("\n", "").replace("</b>", "").replace("</p>", "")
        .replace("<b>", "").replace("<p>", "")
        .replace("</li>", "").replace("</ul>", "").replace("<li>", "").replace("<ul>", "")
        .replace("<i>", "").replace("</i>", "").replace("\r", "").replace("<div>", "")
        .replace("h2", "").replace("h3", "").replace("h1", "")
        .replace("</h1>", "").replace("</div>", "").replace("/h2", "").replace("/h3", "")
    )
    # Hyphens and slashes become spaces; the articles "a"/"an" and all digits are dropped.
    job_description = re.sub("-", " ", job_description)
    job_description = re.sub("/", " ", job_description)
    job_description = re.sub(" a ", " ", job_description)
    job_description = re.sub(" an ", " ", job_description)
    job_description = re.sub("[0-9]", "", job_description)
    # Remaining punctuation and abbreviations ("e.g." has to go before "." is replaced).
    job_description = (
        job_description
        .replace(" the ", " ").replace(")", "").replace("(", "").replace("e.g.", "")
        .replace("£", "").replace("$", "").replace("%", "").replace("e g", "")
        .replace(".", " ").replace(",", " ").replace(":", "").replace(";", "")
        .replace("?", "").replace("*", "").replace(" eg ", "").replace(">", "").replace("<", "")
    )
    job_description = " ".join(w for w in job_description.split(" ") if w not in STOPWORDS)

    # Keep each noun phrase once; np.unique also returns them sorted.
    text = nlp(job_description)
    noun_phrases = np.unique(np.array([chunk.text for chunk in text.noun_chunks]))
    job_description = " ".join(noun_phrases)

    indeed_tidy_descriptions.append(job_description)

# One tidy string per row, in row order, so a plain list assignment lines up.
indeed_df["Description_tidy"] = indeed_tidy_descriptions

# Only the last expression of a cell is displayed, so the preview goes at the end.
indeed_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [None]:
# Tidy descriptions from both sources; the Kaggle rows come first in the
# combined array, followed by the Indeed rows.
kaggle_df = kaggle_df[kaggle_df.Description_tidy != 0]
kaggle_JDs = np.asarray(kaggle_df["Description_tidy"])
indeed_JDs = np.asarray(indeed_df["Description_tidy"])
combined_JDs = np.concatenate((kaggle_JDs, indeed_JDs), axis=None)

# labels (Kaggle only): print the class counts, then keep them as a plain list
y = kaggle_df['Job_Type']
print(y.value_counts())
y = y.tolist()

# Fit the tokenizer on the combined descriptions; the vocabulary size is the
# number of entries in its word index plus one.
t = Tokenizer()
t.fit_on_texts(combined_JDs)
vocab_size = 1 + len(t.word_index)
print(vocab_size)

# Turn every description into a sequence of integer token ids.
encoded_JDs = t.texts_to_sequences(combined_JDs)
print(len(encoded_JDs))

# Post-pad every sequence to the length of the longest one.
list_len = list(map(len, encoded_JDs))
max_length = max(list_len)
print(max_length)
padded_JDs = pad_sequences(encoded_JDs, padding='post', maxlen=max_length)

data_scientist    2376
data_analyst      1535
data_engineer     1309
Name: Job_Type, dtype: int64
41004
6363
926


In [None]:
# Preview a few tidy descriptions, truncated, instead of printing the whole array
# (each element is a long string, so the full dump floods the output).
for tidy_jd in combined_JDs[:3]:
    print(tidy_jd[:300])

['ability advance career company bachelor’s degree related field   years experience better health   express scripts leading healthcare company better health outcomes br   bachelor’s degree related field   years experience business objects creative committed creating systems service solutions creativity  integrity customer value technical innovation data structures data universe structures data visualization applications degree master department  express scripts design implementation process development development  implementation new product ideas external internal clients document best practices points easier people esi corporate standard development tools essential functions evaluation potential integration new development tools express scripts equal opportunity employer disability veteran focus product aesthetics functionality aid design hard work highly competitive base salary comprehensive benefits program implement implementation  analysis  interpretation communication business i

In [None]:
# Number of Kaggle descriptions; they are the first entries of combined_JDs,
# ahead of the Indeed descriptions.
kaggle_JDs.shape

(5220,)

In [None]:
def make_dummy_var(y, classes=('data_scientist', 'data_analyst', 'data_engineer')):
  """One-hot encode job-type labels.

  Args:
    y: sized sequence of labels (list, tuple, NumPy array or pandas Series).
    classes: ordered class names; column ``j`` of the result corresponds to
      ``classes[j]``. Defaults to the three job types used in this notebook.

  Returns:
    A float array of shape ``(len(y), len(classes))`` holding a single 1 per
    row. A label that is not in ``classes`` yields an all-zero row.
  """
  column_of = {label: col for col, label in enumerate(classes)}
  temp = np.zeros([len(y), len(classes)])
  # enumerate() walks the values positionally, so a pandas Series whose index is
  # not 0..n-1 is encoded correctly (y[i] would be a lookup by index label).
  for row, label in enumerate(y):
    col = column_of.get(label)
    if col is not None:
      temp[row, col] = 1
  return temp

y_onehot = make_dummy_var(y)

# padded_JDs holds the Kaggle rows first, followed by the Indeed rows. Only the
# Kaggle rows are labelled, so take as many rows as there are labels instead of
# relying on a hard-coded 5220.
n_labelled = len(y_onehot)

# A fixed seed makes the split (and the accuracies reported further down)
# reproducible; stratifying on the labels keeps the class shares the same in the
# training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    padded_JDs[:n_labelled], y_onehot, test_size=0.33, random_state=42, stratify=y)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Share of each class (one column of the one-hot matrix) in train, then in test.
for class_col in range(y_onehot.shape[1]):
  for labels in (y_train, y_test):
    print(float((labels[:, class_col] == 1).sum()) / len(labels))

print(y_train)

(3497, 926)
(1723, 926)
(3497, 3)
(1723, 3)
0.45925078638833283
0.4468949506674405
0.28738919073491564
0.3076030179918746
0.2533600228767515
0.24550203134068485
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [None]:
# CNN text classifier: embedding -> spatial dropout -> 1D convolution -> max
# pooling -> dense layers -> 3-way softmax (one output per job type).
model = Sequential()
# model.add(glove_layer) # uncomment this only if using pretrained embedding matrix
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(SpatialDropout1D(0.2))
model.add(Conv1D(filters=144, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(72, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(3, activation='softmax'))

Adam = keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=Adam, loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# fit the model
model.fit(X_train, y_train, epochs=20, verbose=1, batch_size=32)

# evaluate the model on the training data ...
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

# ... and on the held-out test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('Test accuracy: %f' % (test_accuracy*100))

#Accuracy: 99.714041
#54/54 [==============================] - 7s 131ms/step - loss: 0.9482 - accuracy: 0.8741
#Test accuracy: 87.405688

NameError: ignored

In [None]:
# The Indeed rows follow the Kaggle rows in padded_JDs, so start at the number of
# Kaggle descriptions instead of a hard-coded 5220.
indeed_job_types = model.predict(padded_JDs[len(kaggle_JDs):])

In [None]:
# Sanity check: one row per scraped (Indeed) description, with one column per
# output unit of the model's final 3-way softmax layer.
print(indeed_job_types.shape)

(1143, 3)
(1143,)
