In [89]:
import amlutils
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm.notebook import tqdm

In [2]:
tf.__version__

'2.1.0'

In [3]:
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Parse Training Data

In [7]:
csv_file = "./data/Dice_US_jobs.csv"
csv_data = amlutils.csv_utils.parse_csv_file(csv_file, quotechar='"', encoding="latin-1")
headers = csv_data[0]
csv_data = csv_data[1:]
START_TOKEN = "<start_token>"
END_TOKEN = "<end_token>"

print(headers)
print(len(csv_data))

['country_code', 'date_added', 'job_board', 'job_description', 'job_title', 'job_type', 'location', 'organization', 'page_url', 'phone_number', 'salary', 'sector']
21919


In [8]:
import re
def striphtml(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

def stripnewlinechars(data):
    data = data.replace(r"\n", "")
    data = data.replace(r"\r", "")
    return data

def add_start_and_end_token(string, start_token=START_TOKEN, end_token=END_TOKEN):
    return "{} {} {}".format(start_token, string, end_token) 

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    s = striphtml(sentence)
    return add_start_and_end_token(s)

In [9]:
job_description_index = headers.index("job_description")
job_title_index = headers.index("job_title")
job_titles = [row[job_title_index] for row in csv_data]
job_descriptions = [row[job_description_index] for row in csv_data]

start_end_token_descriptions = [preprocess_sentence(desc) for desc in job_descriptions]
start_end_token_job_titles = [preprocess_sentence(title) for title in job_titles]


In [10]:
print(start_end_token_descriptions[0], start_end_token_job_titles[0])

<start_token> minimum required skills:edi, trustedlink, as2, vanif you are an edi analyst with experience, please read on!we are a strong, long standing company looking for an edi analyst for our team.  you must have 3+ years of edi experience  in a trustedlink for i environment. your role will work with our finance department identifying trading partners, work closely with external customers and be the edi liaison across the company. you will also monitor as2 and van communications, correct errors and incoming data.what you need for this positionrequirements:3+ years of edi experience3+ years of trustedlink for i experience, iseries/as400experience with van and as2 communicationswhat's in it for youwe offer a strong compensation package and benefits!local candidates only please.so, if you are an edi analyst with experience, please apply today!applicants must be authorized to work in the u.s.please apply directly to by clicking 'click here to apply' with your word resume!looking forwar

In [11]:
def tokenize(data, maxlen=None):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='#$%&()*+,-./:;=?@[\\]^`{|}~\t\n')
    tokenizer.fit_on_texts(data)
    data_sequences = tokenizer.texts_to_sequences(data)
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
        data_sequences,
        maxlen=maxlen,
        padding="post",
    )
    return padded_sequences, tokenizer

In [12]:
train_sequences, train_tokenizer = tokenize(start_end_token_descriptions, maxlen=4000)
label_sequences, label_tokenizer = tokenize(start_end_token_job_titles)

In [13]:
max_length_targ, max_length_inp = label_sequences.shape[1], train_sequences.shape[1]
print(max_length_targ, max_length_inp)

18 4000


In [14]:
BUFFER_SIZE = len(train_sequences)
BATCH_SIZE = 64
steps_per_epoch = len(train_sequences) // BATCH_SIZE
embedding_dim = 128
units = 256
vocab_inp_size = len(train_tokenizer.word_index)+1
vocab_tar_size = len(label_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((train_sequences, label_sequences)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [15]:
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 4000]), TensorShape([64, 18]))

In [16]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [17]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [18]:
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 4000, 256)
Encoder Hidden state shape: (batch size, units) (64, 256)


In [19]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [20]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 256)
Attention weights shape: (batch_size, sequence_length, 1) (64, 4000, 1)


In [21]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [22]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(
    tf.random.uniform((BATCH_SIZE, 1)),
    sample_hidden, sample_output,
)

print("Decoder output shape: (batch_size, vocab size) {}".format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 4093)


In [23]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [24]:
checkpoint_dir = "./models/dice_model/"
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [25]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([label_tokenizer.word_index[START_TOKEN]] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

## Train Model

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(
                epoch + 1,
                batch,
                batch_loss.numpy()),
            )
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print("Epoch {} Loss {:.4f}".format(epoch + 1, total_loss / steps_per_epoch))
    print("Time taken for 1 epoch {} sec\n".format(time.time() - start))

## Evaluate Model

### Restore from checkpoint

In [26]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1af359cbf48>

In [27]:
def get_job_title(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    sentence = preprocess_sentence(sentence)
    words = [word for word in sentence.split(" ") if word in train_tokenizer.word_index]
    inputs = [train_tokenizer.word_index[i] for i in words]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs],
        maxlen=max_length_inp,
        padding="post",
    )
    inputs = tf.convert_to_tensor(inputs)

    result = ""

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([label_tokenizer.word_index[START_TOKEN]], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += label_tokenizer.index_word[predicted_id] + ' '

        if label_tokenizer.index_word[predicted_id] == END_TOKEN:
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [65]:
job_desc = "Minimum Required Skills:Civil Engineering, AutoCAD, Site Layout, Conceptual DesignThis role will require an expertise in Site Layout and Grading for Commercial Builds. Candidates with experience working on federal projects is a huge plus!Our contractor is a full service contractor, providing; architectural and engineering design; environmental consulting; remediation; and operations and maintenance services to municipal, government, and private sector Clients throughout New England and adjacent states.What You Will Be DoingThe Civil Engineer will be responsible for maintaining client relationships through the successful management of projects and/or leading design efforts on a project. The Civil Engineer will conduct a wide variety of engineering tasks including conceptual designs, engineering reports/studies, detailed designs including drawings and specifications, and cost estimates.- Engineering of site layout, grading, drainage- Use AutoCAD for Engineering Design- Prepare/support proposals to support base line engineering workload.- Coordinate execution of engineering field work on specific projects.- Coordinate the preparation of project design submittals for permitting, bidding, and/or construction. Responsible for being a Civil Designer of Record.- Coordinate construction inspection services and/or construction phase services, as required. Responsible for conducting the construction inspection services and/or construction phase services for the civil engineering disciplinWhat You Need for this PositionB.S. Degree in Civil Engineering required.Professional Registration in at least one New England state, preferably Massachusetts.Proficiency in AutoCAD required.A minimum of 5 years of experience.   - Civil Engineering   - AutoCAD   - Site Layout   - Conceptual Design   - EstimatingWhat's In It for YouWe offer excellent compensation packages and benefits, including medical, dental, and vision insurance, and an attractive 401(k) plan.So, if you are a Civil Engineer, P.E. with experience, please apply today!Applicants must be authorized to work in the U.S.Please apply directly to by clicking 'Click Here to Apply' with your Word resume!Looking forward to receiving your resume and going over the position in more detail with you.- Not a fit for this position? Click the link at the bottom of this email to search all of our open positions.Looking forward to receiving your resume!CyberCodersCyberCoders, Inc is proud to be an Equal Opportunity EmployerAll qualified applicants will receive consideration for employment without regard to race, color, religion, sex, national origin, disability, protected veteran status, or any other characteristic protected by law.Your Right to Work - In compliance with federal law, all persons hired will be required to verify identity and eligibility to work in the United States and to complete the required employment eligibility verification document form upon hire.Copyright å© 1999 - 2016 . CyberCoders, Inc. All rights reserved."
preprocess_sentence(job_desc)

"<start_token> minimum required skills:civil engineering, autocad, site layout, conceptual designthis role will require an expertise in site layout and grading for commercial builds. candidates with experience working on federal projects is a huge plus!our contractor is a full service contractor, providing; architectural and engineering design; environmental consulting; remediation; and operations and maintenance services to municipal, government, and private sector clients throughout new england and adjacent states.what you will be doingthe civil engineer will be responsible for maintaining client relationships through the successful management of projects and/or leading design efforts on a project. the civil engineer will conduct a wide variety of engineering tasks including conceptual designs, engineering reports/studies, detailed designs including drawings and specifications, and cost estimates.- engineering of site layout, grading, drainage- use autocad for engineering design- pre

In [90]:
job_titles_predicted = []
for job_desc in tqdm(job_descriptions):
    jt = get_job_title(job_desc.replace("\r", " ").replace("\n", " "))
    job_titles_predicted.append(jt)

HBox(children=(FloatProgress(value=0.0, max=21919.0), HTML(value='')))




## Computer BLEU Score

In [199]:
def compute_bleu_n_gram(groundtruths, prediction, n=1):
    groundtruth_list = [gs.split(" ") for gs in groundtruths]
    prediction_list = prediction.split(" ")
    return compute_bleu_score(groundtruth_list, prediction_list, np.eye(n)[n-1])
    
def compute_bleu_score(groundtruth_list, prediction_list, weights):
    return nltk.translate.bleu_score.sentence_bleu(
        groundtruth_list,
        prediction_list,
        weights=weights,
    )

In [211]:
metrics = []
for i, jt_predicted in enumerate(job_titles_predicted):
    prediction = jt_predicted.replace("<end_token>", "").replace("<start_token>", "").strip().lower()
    bl_1 = compute_bleu_n_gram([job_titles[i].strip().lower()], prediction, n=1)
    bl_2 = compute_bleu_n_gram([job_titles[i].strip().lower()], prediction, n=2)
    metrics.append((bl_1, bl_2, 1 if job_titles[i].lower() == prediction.strip().lower() else 0))

In [209]:
bleu_1s, bleu_2s, matches = zip(*metrics)
len(bleu_1s), len(bleu_2s), len(matches)

(21919, 21919, 21919)

In [210]:
# Metrics
print(f"Exact matches {sum(matches)}")
print(f"BLEU 1 {np.mean(bleu_1s)}")
print(f"BLEU 2 {np.mean(bleu_2s)}")

Exact matches 6660
BLEU 1 0.5345026429116695
BLEU 2 0.3973641292158458
