In [1]:
#!pip install datasets evaluate transformers[sentencepiece]
#!pip install faiss-cpu
#!pip install -U sentence-transformers

In [29]:
from transformers import AutoTokenizer, AutoModel
from transformers.pipelines import pipeline
from datasets import load_dataset, Dataset

To start processing the resumes, we need to load the data in CSV format into a Huggingface Dataset object. HuggingFace Dataset is excellent for handling large volumes of information and integrates seamlessly with Faiss.

In [2]:
from transformers.pipelines import pipeline

question_answerer = pipeline('question-answering')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
resume_dataset = load_dataset("csv", data_files='UpdatedResumeDataSet.csv', split="train")
resume_dataset

Found cached dataset csv (C:/Users/irfan/.cache/huggingface/datasets/csv/default-c5aab28669a5591a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


Dataset({
    features: ['Category', 'Resume'],
    num_rows: 962
})

In [4]:
resume_dataset.set_format("pandas")
df = resume_dataset[:]

In [5]:
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [6]:
from datasets import Dataset

resume_dataset = Dataset.from_pandas(df)
resume_dataset

Dataset({
    features: ['Category', 'Resume'],
    num_rows: 962
})

 extract the Embedding vector of each resume. HuggingFace provides several pre-trained Transformers for that purpose; we’ll use the model sentence-transformers/multi-qa-mpnet-base-dot-v1 that, according to Huggingface’s documentation, performs great for Semantic search.

In [7]:
resume_dataset = resume_dataset.map(
    lambda x: {"resume_length": len(x["Resume"].split())}
)

Map:   0%|          | 0/962 [00:00<?, ? examples/s]

In [8]:
resume_dataset = resume_dataset.filter(lambda x: x["resume_length"] > 15)
resume_dataset

Filter:   0%|          | 0/962 [00:00<?, ? examples/s]

Dataset({
    features: ['Category', 'Resume', 'resume_length'],
    num_rows: 962
})

In [9]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

So now, we can extract each resume’s embedding vector and add it to a new column called embeddings. You can see in the following code snippet how straightforward it is.

In [10]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [11]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]
     

In [12]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [13]:
embedding = get_embeddings(resume_dataset["Resume"][0])
embedding.shape

torch.Size([1, 768])

In [14]:

embeddings_dataset = resume_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["Resume"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/962 [00:00<?, ? examples/s]

In [15]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Category', 'Resume', 'resume_length', 'embeddings'],
    num_rows: 962
})

In [16]:
question = '''
Startup is looking for a founder engineer with experience on Blockchain and smart contracts.
Experience on frontEnd development with Angular is also desired.
'''
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [17]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=10
)

In [18]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)
     

In [19]:
for _, row in samples_df.iterrows():
    print(f"CATEGORY: {row.Category}")
    print(f"SCORE: {row.scores}")
    print(f"RESUME: {row.Resume}")
    print("=" * 50)
    print()

CATEGORY: Blockchain
SCORE: 33.69646453857422
RESUME: SOFTWARE SKILLS: Languages: C, C++ & java Operating Systems: Windows XP, 7, Ubuntu RDBMS: Oracle (SQL) Database, My SQL, PostgreSQL Markup & Scripting: HTML, JavaScript & PHP, CSS, JQuery, Angular js. Framework: Struts, Hibernate, spring, MVC Web Server: Tomcat and Glassfish. Web Services: REST AND SOAP TRAINING DETAIL Duration: 4 months From: - United Telecommunication Limited Jharnet project (Place - Ranchi, Jharkhand) Networking Requirements: Elementary configuration of router and switch, IP and MAC addressing, Lease Line, OSI Layers, Routing protocols. Status: - Network Designer.Education Details 
    2 High School
 Diploma Government Women Ranchi, Jharkhand The Institution
Blockchain Engineer 

Blockchain Engineer - Auxledger
Skill Details 
JAVA- Exprience - 19 months
CSS- Exprience - 12 months
HTML- Exprience - 12 months
JAVASCRIPT- Exprience - 12 months
C++- Exprience - 6 monthsCompany Details 
company - Auxledger

In [20]:
print(samples_df['Resume'][0])

Skills Strong CS fundamentals and problem solving Ethereum, Smart Contracts, Solidity skills Golang, Node, Angular, React Culturally fit for startup environment MongoDB, PostGresql, MySql Enthusiastic to learn new technologies AWS, Docker, Microservices Blockchain, Protocol, ConsensusEducation Details 
January 2014 M.Tech Computer Engineering Jaipur, Rajasthan Malaviya National Institute Of Technology Jaipur
January 2011 B.E. Computer Science And Engg Kolhapur, Maharashtra Shivaji University
Blockchain Engineer 

Blockchain Engineer - XINFIN Orgnization
Skill Details 
MONGODB- Exprience - 16 months
CONTRACTS- Exprience - 12 months
MYSQL- Exprience - 9 months
AWS- Exprience - 6 months
PROBLEM SOLVING- Exprience - 6 monthsCompany Details 
company - XINFIN Orgnization
description - Xinfin is a global open source Hybrid Blockchain protocol.
Rolled out multiple blockchain based pilot projects on different use cases for various clients. Eg.
Tradefinex (Supply chain Management)

## Lets ask question on the top ranked profile

In [21]:
answer = question_answerer(question='What are your main technical skills?', context=samples_df['Resume'][0])
print(answer)

{'score': 0.7134348154067993, 'start': 2238, 'end': 2312, 'answer': 'bug fixing, DB operations, Feature customisation and writing API endpoints'}


In [22]:
answer = question_answerer(question='What was your major at school?', context=samples_df['Resume'][0])
print(answer)
     

{'score': 0.006006759125739336, 'start': 2475, 'end': 2486, 'answer': 'IT Services'}


In [25]:
#!pip3 install openai
import openai

COMPLETIONS_MODEL = "text-curie-001"
openai.api_key = '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'

In [26]:
openai_prompt = samples_df['Resume'][0]
openai_prompt += '''

Answer the following questions:

1 - What are your main technical skills?

2 - What was your major at schools?

3 - What is your experience with databases?

4 - Can you List all your work experience?

Answers: 
'''

In [27]:
print(openai_prompt)

Skills Strong CS fundamentals and problem solving Ethereum, Smart Contracts, Solidity skills Golang, Node, Angular, React Culturally fit for startup environment MongoDB, PostGresql, MySql Enthusiastic to learn new technologies AWS, Docker, Microservices Blockchain, Protocol, ConsensusEducation Details 
January 2014 M.Tech Computer Engineering Jaipur, Rajasthan Malaviya National Institute Of Technology Jaipur
January 2011 B.E. Computer Science And Engg Kolhapur, Maharashtra Shivaji University
Blockchain Engineer 

Blockchain Engineer - XINFIN Orgnization
Skill Details 
MONGODB- Exprience - 16 months
CONTRACTS- Exprience - 12 months
MYSQL- Exprience - 9 months
AWS- Exprience - 6 months
PROBLEM SOLVING- Exprience - 6 monthsCompany Details 
company - XINFIN Orgnization
description - Xinfin is a global open source Hybrid Blockchain protocol.
Rolled out multiple blockchain based pilot projects on different use cases for various clients. Eg.
Tradefinex (Supply chain Management)

In [28]:
openai.Completion.create(
    prompt=openai_prompt,
    temperature=0,
    max_tokens=100,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'1 - I have strong CS fundamentals and problem solving skills in Ethereum, Smart Contracts, Solidity. \n2 - I am a B.E. Computer Science and Engineering graduate from Maharashtra Shivaji University. \n3 - I have experience with databases such as MongoDB, PostgreSQL, MySQL. \n4 - I have worked in various roles such as Blockchain Engineer, Developer, Web Developer.'