Model Used: https://huggingface.co/bert-base-uncased

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

import sys
sys.path.append('..//..')
from utils.tokenizer import sent_tokenize
from dataset.job_description import job_descriptions

# import dataset 
dataset = pd.read_csv('../../dataset/resume_dataset.csv')

# drop unecessary columns
dataset = dataset.drop(columns=['ID', 'Resume_html', 'Category'])

# import bert
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
resume_sentences = sent_tokenize(dataset['Resume_str'][0])
job_desc_sentences = sent_tokenize(job_descriptions[3])

print(resume_sentences)
print(job_desc_sentences)

tokenized_resume_sentences = tokenizer(resume_sentences, return_tensors='pt', padding=True, truncation=True)
tokenized_job_desc_sentences = tokenizer(job_desc_sentences, return_tensors='pt', padding=True, truncation=True)

# resume embeddings
with torch.no_grad():
    resume_embeddings = model(**tokenized_resume_sentences).last_hidden_state
    job_desc_embeddings = model(**tokenized_job_desc_sentences).last_hidden_state

['hr administrator marketing associate', 'hr administrator', 'dedicated customer service manager with [NUMBER] years of experience in hospitality and customer service management', 'respected builder and leader of customer-focused teams strives to instill a shared enthusiastic commitment to customer service', 'focused on customer satisfaction', 'team management', 'marketing savvy', 'conflict resolution techniques', 'training and development', 'skilled multi-tasker', 'client relations specialist', 'missouri dot supervisor training certification', 'certified by ihg in customer loyalty and marketing by segment', 'hilton worldwide general manager training certification', 'accomplished trainer for cross server hospitality systems such as', 'hilton onq', 'opera pms', 'reservation system ors', 'completed courses and seminars in customer service sales strategies inventory control loss prevention safety time management leadership and performance assessment', 'hr administrator marketing associate

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [None]:
resume_mean_pooled_word_embeddings = [torch.mean(sentence, dim=0) for sentence in resume_embeddings]
resume_mean_pooled_sentence_embeddings = torch.mean(torch.stack(resume_mean_pooled_word_embeddings), dim=0)

job_desc_mean_pooled_word_embeddings = [torch.mean(sentence, dim=0) for sentence in job_desc_embeddings]
job_desc_mean_pooled_sentence_embeddings = torch.mean(torch.stack(job_desc_mean_pooled_word_embeddings), dim=0)

print(resume_mean_pooled_sentence_embeddings.reshape(1, -1))
print(job_desc_mean_pooled_sentence_embeddings.reshape(1, -1))

print(cosine_similarity(resume_mean_pooled_sentence_embeddings.reshape(1, -1), job_desc_mean_pooled_sentence_embeddings.reshape(1, -1)))

tensor([[-4.4319e-02, -6.5156e-03,  4.2019e-03,  1.1836e-01,  2.6145e-01,
          7.6716e-02, -2.4464e-03,  2.1378e-01, -7.7215e-03, -2.2571e-01,
         -7.7868e-02,  2.0465e-01,  2.5795e-01, -3.6866e-02, -2.9201e-02,
          3.4238e-02,  2.9965e-02,  1.7317e-01,  5.2314e-02, -5.1877e-02,
         -2.8945e-01, -1.1396e-01, -3.6949e-02, -2.1253e-02,  8.6654e-02,
          1.9215e-01, -1.3624e-01,  4.2553e-02, -3.9289e-01,  8.4857e-02,
          1.0184e-01, -3.4405e-01,  2.1968e-01,  1.3380e-01, -1.4706e-01,
         -2.2257e-01, -2.0132e-02,  2.3860e-02, -3.2726e-01,  4.6076e-03,
         -2.1926e-01, -4.5307e-02,  1.6769e-01, -2.2148e-01, -1.8203e-02,
         -2.2088e-01, -9.5639e-02, -1.1149e-01, -1.8860e-01,  7.8613e-02,
         -4.3233e-01,  2.9616e-01, -1.2446e-01,  1.4864e-02, -1.7383e-02,
          3.0980e-01,  2.8847e-01, -1.7778e-01, -1.7220e-01, -1.8221e-01,
          1.2692e-01, -1.3830e-01,  6.5575e-02,  1.1152e-01,  3.4218e-01,
         -1.1447e-01,  3.3031e-01,  2.