In [3]:
#  %pip install sentence-transformers

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch


In [5]:
from google.colab import drive

# Attach Google Drive to the Colab runtime at the standard mount location.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Fetch the tokenizer and the encoder weights for one HuggingFace Hub checkpoint.
# The id is written once so the tokenizer and the model always refer to the same checkpoint.
checkpoint_name = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

In [7]:
import pandas as pd

# The CSV holds data that was already cleaned manually with regex preprocessing.
resume_csv_path = '/content/drive/MyDrive/modified_data.csv'
df = pd.read_csv(resume_csv_path)
df.head()


Unnamed: 0,Resume_str,Category,Resume
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR,hr administrator marketing associate hr admini...
1,"HR SPECIALIST, US HR OPERATIONS ...",HR,hr specialist us hr operations summary versati...
2,HR DIRECTOR Summary Over 2...,HR,hr director summary years experience recruitin...
3,HR SPECIALIST Summary Dedica...,HR,hr specialist summary dedicated driven dynamic...
4,HR MANAGER Skill Highlights ...,HR,hr manager skill highlights hr skills hr depar...


In [8]:
# Summary statistics of the columns, computed separately for each Category.
rows_by_category = df.groupby('Category')
rows_by_category.describe()


Unnamed: 0_level_0,Resume_str,Resume_str,Resume_str,Resume_str,Resume,Resume,Resume,Resume
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ACCOUNTANT,118,118,ACCOUNTANT Professional Summary...,1,118,118,ountant professional summary current ountant c...,1
ADVOCATE,118,118,ADVOCATE Professional Summary...,1,118,118,advocate professional summary conscientious at...,1
AGRICULTURE,63,63,AGRICULTURE ADVISOR AND LANGUAGE OFFI...,1,63,63,agriculture advisor language officer professio...,1
APPAREL,97,97,APPAREL ASSOCIATE Summary An...,1,97,97,apparel associate summary organized detail ori...,1
ARTS,103,103,ARTS EDUCATOR Summary Creat...,1,103,103,arts educator summary creative innovative art ...,1
AUTOMOBILE,36,36,AUTOMOBILE TRANSPORTER Professi...,1,36,36,automobile transporter professional summary de...,1
AVIATION,117,116,STOREKEEPER II Professional Sum...,2,117,116,storekeeper ii professional summary purpose do...,2
BANKING,115,115,"BANKING Summary Hands-on,...",1,115,115,banking summary hands client oriented banking ...,1
BPO,22,22,"DIRECTOR OF OPERATIONS, BPO Exe...",1,22,22,director operations bpo executive summary resu...,1
BUSINESS-DEVELOPMENT,120,120,BUSINESS DEVELOPMENT Summar...,1,119,119,business development summary results driven hi...,1


In [9]:
# Drop the 'Resume_str' column; the remaining columns are displayed below.
# errors='ignore' keeps the cell safe to re-run: `df` is rebound here, so on a
# second run 'Resume_str' is already gone and a plain drop would raise KeyError.
df = df.drop(columns='Resume_str', errors='ignore')
df

Unnamed: 0,Category,Resume
0,HR,hr administrator marketing associate hr admini...
1,HR,hr specialist us hr operations summary versati...
2,HR,hr director summary years experience recruitin...
3,HR,hr specialist summary dedicated driven dynamic...
4,HR,hr manager skill highlights hr skills hr depar...
...,...,...
2479,AVIATION,rank sgt e non commissioned officer charge bri...
2480,AVIATION,government relations communications organizati...
2481,AVIATION,geek squad agent professional profile support ...
2482,AVIATION,program director office manager summary highly...


In [10]:
# Keep only real strings from the 'Resume' column (it contains a NaN entry,
# which is a float and cannot be tokenized).
# A comprehension builds a fresh list instead of calling list.remove() while
# iterating over the same list: removing during iteration shifts the remaining
# items left, so the element right after each removed one is never inspected.
sentences_res = [s for s in df["Resume"].to_list() if isinstance(s, str)]

sentences_res


['hr administrator marketing associate hr administrator summary dedicated customer service manager years experience hospitality customer service management respected builder leader customer focused teams strives instill shared enthusiastic commitment customer service highlights focused customer satisfaction team management marketing savvy conflict resolution techniques training development skilled multi tasker client relations specialist omplishments missouri dot supervisor training certification certified ihg customer loyalty marketing segment hilton worldwide general manager training certification omplished trainer cross server hospitality systems hilton onq micros opera pms fidelio opera reservation system ors holidex completed courses seminars customer service sales strategies inventory control loss prevention safety time management leadership performance assessment experience hr administrator marketing associate hr administrator dec current company name city state helps develop po

In [11]:
# Tokenize a single resume as a smoke test: truncate/pad to exactly 512 tokens
# and return PyTorch tensors.
# The tokenizer is called directly because `encode_plus` is deprecated in favour
# of `__call__`; the arguments and the returned encoding are the same.
example_sentence = sentences_res[0]
example_tokens = tokenizer(
    example_sentence,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)
example_tokens


{'input_ids': tensor([[    0, 17854,  8915,  5825,  5486, 17854,  8915, 12658,  4060,  8017,
          2330,  3212,  2090,  3329, 15965,  8017,  2330,  2972,  9772, 12512,
          3007,  8017,  4212,  2784, 29457,  2019, 16025, 28349,  4211, 14731,
          8430,  8017,  2330, 11641,  4212,  8017,  9971,  2140,  2972,  5825,
          7846,  2619, 10740,  4740,  5817,  5465,  2735,  2462, 10575,  4804,
          4712,  2125,  7400,  4266,  8329, 18172, 24763, 21812,  2019,  5288,
         11093, 12370,  2735, 10622,  7382,  1049, 25623,  8017,  9725,  5825,
          6907, 15485,  4973,  2240,  3212,  2735, 10622, 18172, 24763, 13299,
         10369,  2896,  8245, 15965,  3005, 15485,  2010,  4164, 12706,  2019,
          3854,  7614,  2019, 26004, 12802,  3854, 11083,  2295,  2034,  2019,
          7574, 24202,  2599,  2953,  5356, 17243,  8017,  2330,  4345,  9946,
         12616,  2495,  3283,  9744,  3812,  2055,  2972,  4109,  2840,  7671,
          3329, 17854,  8915,  5825,  

In [12]:
def tokenize_sentence(sentences, max_length=512):
    """Tokenize each sentence to a fixed length and stack the results.

    Args:
        sentences: iterable of strings, encoded with the module-level `tokenizer`.
        max_length: length every sequence is truncated/padded to. Defaults to
            512, the value this notebook used originally.

    Returns:
        dict with keys 'input_ids' and 'attention_mask'. Each value is a single
        tensor with one row per successfully tokenized sentence and
        `max_length` columns.

    Sentences that raise during tokenization are reported via print and
    skipped, so the result can have fewer rows than `sentences`.
    """
    # Accumulate into fresh local lists on every call. The previous version
    # appended to a module-level dict and then overwrote its lists with tensors,
    # so a second call (or a cell re-run) failed on `.append`.
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        try:
            # Calling the tokenizer directly replaces the deprecated `encode_plus`.
            encoded = tokenizer(sentence, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
            ids = encoded['input_ids'][0]
            mask = encoded['attention_mask'][0]
        except Exception as e:
            print(f"Error processing sentence: {sentence}")
            print(f"Error message: {str(e)}")
            continue
        # Append both only after both were extracted, so ids and masks can never
        # get out of step with each other.
        input_ids.append(ids)
        attention_masks.append(mask)

    if not input_ids:
        # torch.stack raises on an empty list; return empty (0, max_length) tensors instead.
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return {'input_ids': empty_ids, 'attention_mask': empty_ids.clone()}

    # Restructure each list of 1-D tensors into a single 2-D tensor.
    return {
        'input_ids': torch.stack(input_ids),
        'attention_mask': torch.stack(attention_masks),
    }

In [13]:
# Tokenize every resume string; the result is a dict of stacked tensors.
tokens_res = tokenize_sentence(sentences_res)


In [14]:
# Sanity check: one row per tokenized resume, 512 columns (the max_length used).
tokens_res['input_ids'].shape

torch.Size([2483, 512])

In [15]:
# Run the encoder on the first 10 tokenized resumes only (a small demo batch).
tk = {
    'input_ids': tokens_res['input_ids'][:10],
    'attention_mask': tokens_res['attention_mask'][:10]
}
# Inference only: no_grad() stops autograd from recording the forward pass,
# which saves memory and yields output tensors that carry no grad_fn.
with torch.no_grad():
    outputs = model(**tk)
outputs

BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.2198,  0.1654, -0.0595,  ..., -0.1036,  0.0771, -0.0813],
         [ 0.2291,  0.0225, -0.0331,  ...,  0.0038,  0.0854, -0.0224],
         [ 0.1408,  0.1915, -0.0378,  ...,  0.0145,  0.1453, -0.1382],
         ...,
         [ 0.1822,  0.1376, -0.1738,  ..., -0.0449,  0.0693, -0.0913],
         [ 0.2205,  0.1458, -0.0881,  ..., -0.0467, -0.0401, -0.1202],
         [ 0.1787,  0.1399, -0.0321,  ..., -0.0850,  0.0747, -0.1056]],

        [[ 0.1612,  0.0240, -0.0761,  ..., -0.1114,  0.1313, -0.1219],
         [ 0.1721, -0.1207, -0.0429,  ...,  0.0377,  0.1374, -0.0181],
         [ 0.0704,  0.0207, -0.0398,  ...,  0.1827,  0.2529, -0.2205],
         ...,
         [ 0.1305,  0.1615, -0.0858,  ..., -0.0404,  0.1229, -0.0807],
         [ 0.1470,  0.2508, -0.0620,  ..., -0.0685,  0.0108, -0.0932],
         [ 0.1782,  0.0711, -0.0354,  ..., -0.0753,  0.0566, -0.1288]],

        [[ 0.2284,  0.0960,  0.0144,  ..., -0.0559,  0.1049, -0.0681],


In [16]:
# List the fields available on the model output object.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [17]:
# Take the model's `last_hidden_state` field as the embeddings to pool below.
embeddings = outputs.last_hidden_state
embeddings

tensor([[[ 0.2198,  0.1654, -0.0595,  ..., -0.1036,  0.0771, -0.0813],
         [ 0.2291,  0.0225, -0.0331,  ...,  0.0038,  0.0854, -0.0224],
         [ 0.1408,  0.1915, -0.0378,  ...,  0.0145,  0.1453, -0.1382],
         ...,
         [ 0.1822,  0.1376, -0.1738,  ..., -0.0449,  0.0693, -0.0913],
         [ 0.2205,  0.1458, -0.0881,  ..., -0.0467, -0.0401, -0.1202],
         [ 0.1787,  0.1399, -0.0321,  ..., -0.0850,  0.0747, -0.1056]],

        [[ 0.1612,  0.0240, -0.0761,  ..., -0.1114,  0.1313, -0.1219],
         [ 0.1721, -0.1207, -0.0429,  ...,  0.0377,  0.1374, -0.0181],
         [ 0.0704,  0.0207, -0.0398,  ...,  0.1827,  0.2529, -0.2205],
         ...,
         [ 0.1305,  0.1615, -0.0858,  ..., -0.0404,  0.1229, -0.0807],
         [ 0.1470,  0.2508, -0.0620,  ..., -0.0685,  0.0108, -0.0932],
         [ 0.1782,  0.0711, -0.0354,  ..., -0.0753,  0.0566, -0.1288]],

        [[ 0.2284,  0.0960,  0.0144,  ..., -0.0559,  0.1049, -0.0681],
         [ 0.1443, -0.0993,  0.0174,  ...,  0

In [18]:
# Inspect the dimensions of the embeddings tensor.
embeddings.shape

torch.Size([10, 512, 768])

In [22]:
# Attention mask for the same first 10 rows that were passed to the model.
attention_mask = tokens_res['attention_mask'][:10]
attention_mask.shape

torch.Size([10, 512])

In [23]:
# Give the mask a trailing axis, then broadcast it to the embeddings' size and
# convert to float so it can be multiplied element-wise with the embeddings.
mask_with_trailing_axis = attention_mask.unsqueeze(-1)
resized_attention_mask = mask_with_trailing_axis.expand(embeddings.size()).float()
resized_attention_mask.shape

torch.Size([10, 512, 768])

In [24]:
# Display the expanded mask values.
resized_attention_mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1., 

In [25]:
# Shape of the expanded mask at a single position (first row, first token).
resized_attention_mask[0][0].shape

torch.Size([768])

In [26]:
# Element-wise product with the mask: positions whose mask value is 0 become zero vectors.
masked_embedding = torch.mul(embeddings, resized_attention_mask)
masked_embedding.shape

torch.Size([10, 512, 768])

In [27]:
# Display the masked embeddings.
masked_embedding

tensor([[[ 0.2198,  0.1654, -0.0595,  ..., -0.1036,  0.0771, -0.0813],
         [ 0.2291,  0.0225, -0.0331,  ...,  0.0038,  0.0854, -0.0224],
         [ 0.1408,  0.1915, -0.0378,  ...,  0.0145,  0.1453, -0.1382],
         ...,
         [ 0.1822,  0.1376, -0.1738,  ..., -0.0449,  0.0693, -0.0913],
         [ 0.2205,  0.1458, -0.0881,  ..., -0.0467, -0.0401, -0.1202],
         [ 0.1787,  0.1399, -0.0321,  ..., -0.0850,  0.0747, -0.1056]],

        [[ 0.1612,  0.0240, -0.0761,  ..., -0.1114,  0.1313, -0.1219],
         [ 0.1721, -0.1207, -0.0429,  ...,  0.0377,  0.1374, -0.0181],
         [ 0.0704,  0.0207, -0.0398,  ...,  0.1827,  0.2529, -0.2205],
         ...,
         [ 0.1305,  0.1615, -0.0858,  ..., -0.0404,  0.1229, -0.0807],
         [ 0.1470,  0.2508, -0.0620,  ..., -0.0685,  0.0108, -0.0932],
         [ 0.1782,  0.0711, -0.0354,  ..., -0.0753,  0.0566, -0.1288]],

        [[ 0.2284,  0.0960,  0.0144,  ..., -0.0559,  0.1049, -0.0681],
         [ 0.1443, -0.0993,  0.0174,  ...,  0

In [28]:
# Collapse dimension 1 by summation, leaving one summed vector per row.
summed_masked_embeddings = masked_embedding.sum(dim=1)
summed_masked_embeddings.shape

torch.Size([10, 768])

In [29]:

# Display the per-row sums of the masked embeddings.
summed_masked_embeddings

tensor([[ 8.5913e+01,  8.6104e+01, -2.8109e+01,  ..., -1.3907e+01,
          6.5112e+01, -4.3724e+01],
        [ 8.5844e+01,  3.3009e+01, -4.0685e+01,  ..., -2.1483e+01,
          4.4727e+01, -4.2570e+01],
        [ 1.1669e+02,  6.6474e+01, -3.0652e+00,  ..., -1.5385e+01,
          7.0678e+01, -4.2384e+01],
        ...,
        [ 6.3001e+01, -1.5482e+01, -1.8845e+01,  ..., -6.7765e-01,
          6.2923e+01, -3.5906e+01],
        [ 1.0014e+02,  8.3377e+00,  1.0364e-01,  ..., -8.0415e+00,
          6.5932e+01, -4.2554e+01],
        [ 4.3843e+01,  8.7123e+00, -3.3758e+01,  ..., -1.6524e+01,
          3.9052e+01, -3.4210e+01]], grad_fn=<SumBackward1>)

In [30]:
# Display the expanded mask again before counting its ones in the next cell.
resized_attention_mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1., 

In [31]:
# Sum the mask over dimension 1 to count the 1-entries per row, then clamp to a
# tiny positive floor so the later division can never divide by zero.
mask_totals = resized_attention_mask.sum(dim=1)
count_of_one_in_mask_tensor = mask_totals.clamp(min=1e-9)

count_of_one_in_mask_tensor.shape

torch.Size([10, 768])

In [32]:
# Display the clamped mask counts.
count_of_one_in_mask_tensor

tensor([[512., 512., 512.,  ..., 512., 512., 512.],
        [512., 512., 512.,  ..., 512., 512., 512.],
        [512., 512., 512.,  ..., 512., 512., 512.],
        ...,
        [512., 512., 512.,  ..., 512., 512., 512.],
        [512., 512., 512.,  ..., 512., 512., 512.],
        [512., 512., 512.,  ..., 512., 512., 512.]])

In [34]:
  # Shape of the numerator used for mean pooling.
  summed_masked_embeddings.shape

torch.Size([10, 768])

In [35]:
# Shape of the denominator used for mean pooling.
count_of_one_in_mask_tensor.shape

torch.Size([10, 768])

In [36]:
# Mean pooling: divide the masked sums element-wise by the clamped mask counts.
mean_pooled = torch.div(summed_masked_embeddings, count_of_one_in_mask_tensor)

In [37]:
# Shape of the pooled embeddings.
mean_pooled.shape

torch.Size([10, 768])

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert to NumPy for scikit-learn. The isinstance guard keeps this cell
# re-runnable: after the first run `mean_pooled` is already an ndarray, which has
# no .detach(). .cpu() is a no-op for CPU tensors and is needed before .numpy()
# when the tensor lives on another device.
if isinstance(mean_pooled, torch.Tensor):
    mean_pooled = mean_pooled.detach().cpu().numpy()

# Cosine similarity of the first pooled embedding against each of the others.
cosine_similarity(mean_pooled[:1], mean_pooled[1:])

array([[0.8829461 , 0.8245016 , 0.829877  , 0.8372379 , 0.82125103,
        0.83059955, 0.77365875, 0.7419498 , 0.8088125 ]], dtype=float32)