In [1]:
import PyPDF2,docx
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Sentence-embedding model; later cells call model.encode() on the job
# description and on the resume text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [3]:
def extract_resume_text(file_name):
    """Extract the plain text of a resume stored as a PDF or Word document.

    The format is chosen from the file extension (case-insensitive):

    * ``.pdf``: the text of every page is concatenated (PyPDF2).
    * ``.doc`` / ``.docx``: the document body is walked in order (python-docx).
      Each paragraph contributes its text followed by a newline; each table
      row contributes its stripped cell texts, each followed by a tab, and
      then a newline.

    Args:
        file_name: Path of the resume file.

    Returns:
        The extracted text, or an empty string when the extension is not one
        of the handled ones.
    """
    text = ""
    # Decide by the real extension. A pattern such as '.doc' used as a regex
    # matches anywhere in the name (the dot is a wildcard), so unrelated names
    # like "pdf_docs_summary.txt" would be sent to the wrong parser.
    lowered_name = file_name.lower()

    if lowered_name.endswith('.pdf'):
        with open(file_name, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # Defensive: treat a missing extraction result as empty text
                # so the concatenation cannot fail.
                text += page.extract_text() or ""
    elif lowered_name.endswith(('.doc', '.docx')):
        # NOTE(review): '.doc' is still passed to python-docx, as before;
        # confirm that legacy binary .doc files are actually expected here.
        doc = docx.Document(file_name)
        for element in doc.element.body:
            # Tags are namespace-qualified ("{namespace}p"), so compare the
            # whole local name instead of just the last character.
            if element.tag.endswith('}p'):
                paragraph = docx.text.paragraph.Paragraph(element, doc)
                text += paragraph.text + "\n"
            elif element.tag.endswith('}tbl'):
                table = docx.table.Table(element, doc)
                for row in table.rows:
                    for cell in row.cells:
                        cell_text = cell.text.strip() if cell.text else ''
                        text += cell_text + "\t"
                    text += "\n"
    return text

In [4]:
def normalize_newlines_and_spaces(text):
    """Collapse every run of whitespace (newlines included) into one space.

    Leading and trailing whitespace is dropped as well.
    """
    # str.split() with no argument already treats newlines as separators.
    words = text.split()
    return ' '.join(words)

In [5]:
# Job description text, lower-cased.
# NOTE(review): `jd_text` in a later cell holds the same text.
jd = '''Title : Azure  Data Engineer
Experience: 4-6 Years 
Location: Remote
Budget : Below 25 LPA
•	Extensive hands-on experience in ADF, Data Bricks
•	Familiarity with ETL tools and techniques
•	Hands on experience with  Microsoft Azure,
•	Good Communication skills
•	Should be able to work independently and own the client deliverables
•	Prior experience with Azure cloud resources
•	Familiar with programming with GitHub, CICD, Docker.
'''.lower()

In [6]:
# Resume file that is scored against the job description.
file_name = 'VNC_Vignesh_QA_Professional.docx'
# Extract the text, collapse whitespace, and lower-case the result.
resume_text = normalize_newlines_and_spaces(
    extract_resume_text(file_name)
).lower()

In [7]:
# Working copy of the job description for the preprocessing cells below.
# Reuses the lower-cased `jd` string from the earlier cell instead of
# repeating the same multi-line literal a second time.
jd_text = jd

In [8]:
def preprocess_text(text):
    """Put spaces around bullet/punctuation marks and collapse whitespace.

    Each of the marks listed in ``special_chars`` is surrounded by a space on
    both sides so it is separated from neighbouring words. Afterwards every
    run of whitespace becomes a single space and the ends are trimmed.
    """
    special_chars = ['■', '•', '●', '-', '*', '➢', '.', '/']
    # One translation pass: every listed mark maps to itself padded by spaces.
    padding = str.maketrans({mark: f' {mark} ' for mark in special_chars})
    spaced = text.translate(padding)
    # Squeeze the whitespace introduced above (and any that was already there).
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()

In [9]:
# Run the same symbol-spacing clean-up over both texts.
jd_text, resume_text = map(preprocess_text, (jd_text, resume_text))

In [10]:
from nltk.tokenize import word_tokenize
# Split both cleaned texts into tokens with NLTK's word tokenizer.
jd_tokens, resume_tokens = (
    word_tokenize(document) for document in (jd_text, resume_text)
)

In [11]:
from nltk.corpus import stopwords
# English stop words, held in a set for membership checks.
stop_words = {word for word in stopwords.words('english')}

# Drop stop words from a list of tokens
def remove_stop_words(word_list):
    """Return the items of ``word_list`` that are not in ``stop_words``.

    The order of the remaining items is preserved.
    """
    kept = []
    for token in word_list:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Strip stop words from both token lists.
pre_final_jd = remove_stop_words(word_list=jd_tokens)
pre_final_resume = remove_stop_words(word_list=resume_tokens)

In [12]:
# Keep only tokens whose first character is a lowercase letter or a digit;
# tokens starting with anything else (e.g. punctuation) are dropped.
filtered_jd = list(
    filter(lambda token: re.match(r'[a-z0-9]', token), pre_final_jd)
)
filtered_resume = list(
    filter(lambda token: re.match(r'[a-z0-9]', token), pre_final_resume)
)

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
# WordNet lemmatizer instance used by the lemmatization step below.
lemmatizer = WordNetLemmatizer()



# Map a word's POS tag to the code passed to the lemmatizer
def get_pos(word):
    """Return the lemmatizer POS code ("a", "n", "v" or "r") for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is used: J -> "a", V -> "v", R -> "r";
    N and every other tag fall back to "n".
    """
    # The word is tagged as a one-element list, i.e. without any
    # surrounding context.
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    if first_letter == "J":
        return "a"
    if first_letter == "V":
        return "v"
    if first_letter == "R":
        return "r"
    return "n"

# Lemmatize every token, using the POS code looked up for that token
lemmatized_jd = [
    lemmatizer.lemmatize(token, get_pos(token))
    for token in filtered_jd
]
lemmatized_resume = [
    lemmatizer.lemmatize(token, get_pos(token))
    for token in filtered_resume
]

In [14]:
# Re-assemble each lemma list into one space-separated string.
final_jd = str.join(' ', lemmatized_jd)
final_resume = str.join(' ', lemmatized_resume)

In [15]:
# Embed both texts with the sentence-transformer model (job description first).
# NOTE(review): sentence-transformers models cap the input length; confirm the
# full resume fits within the model's limit, otherwise only its start is scored.
job_description_embedding, resume_embedding = (
    model.encode(document) for document in (final_jd, final_resume)
)

In [16]:
# Each embedding is wrapped in a list to form a one-row matrix; the single
# entry of the resulting similarity matrix is the score.
similarity_score = cosine_similarity(
    [resume_embedding], [job_description_embedding]
)[0, 0]

# Report the score rounded to two decimals
print(f"Similarity Score: {similarity_score:.2f}")

Similarity Score: 0.36
