In [42]:
import spacy
import random
from PyPDF2 import PdfReader
import pandas as pd

from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

import pandas as pd
import numpy as np
import jsonlines

import re
import nltk
from typing import List

from sentence_transformers import SentenceTransformer, util

In [43]:
import constants

In [44]:
class FeaturesExtractor:

    def __init__(self):

        self.nlp = spacy.load('./data/train_model/model_ner/')
        
        if "sentencizer" not in self.nlp.pipe_names:
            self.nlp.add_pipe("sentencizer")
        
        if 'entity_ruler' not in self.nlp.pipe_names:
            ruler = self.nlp.add_pipe('entity_ruler', after='ner')
            ruler.from_disk(constants.SKILLS_PATTERN_PATH)
            
            job_title_patterns = pd.read_csv(constants.JOB_TITLE_PATH)['Job Title'].unique()
            
            for title in job_title_patterns:
                ruler.add_patterns([{"label": "JOB TITLE", "pattern": title}])
        
        self._CATEGORIES_PATTERN = constants.CATEGORIES_PATTERN
    
    def fit_transform(self, input: List[str]):
        
        self._text_arr = []
        self._feature_arr = []
        self._input_len = len(input)

        for i in range(self._input_len):
            
            doc = self._remove_excess_spaces(input[i])
            self._text_arr.append(doc.text)

            self._extract_features(i, doc)

        return self._text_arr, self._feature_arr

    def _remove_excess_spaces(self, text):
            
        doc = self.nlp(re.sub(r'\s+', ' ', text).strip())

        return doc
            

    def _extract_features(self, resume_idx, doc):

        feature_dict = {
            
            'resume_idx': resume_idx,
            'name': self._extract_name(doc),
            'phone': self._extract_phone(doc),
            'educations': self._extract_educations(doc),
            'gpa': self._extract_gpa(doc),
            'job_titles' : self._extract_job_titles(doc),
            'years_experiences': self._extract_years_experiences(doc),
            'experiences': self._extract_experiences(doc),
            'skills': self._extract_skills(doc),
            'soft_skills': self._extract_soft_skills(doc),
            'languages': self._extract_languages(doc),
            
        }
        
        self._feature_arr.append(feature_dict)
           

    def _extract_name(self, doc):
        
        name = []

        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                name.append(ent.text)

        return name


    def _extract_phone(self, doc):

        pattern = r'(?:\+?(?:\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?(?:\d{2,4}[-.\s]?){2,5}\d{2,4})'

        matches = re.findall(pattern, doc.text)

        return matches
    
    def _extract_educations(self, doc):
        
        educations = []

        pattern = self._CATEGORIES_PATTERN['EDUCATIONS']
        
        matches = re.findall(pattern, doc.text)
        for match in matches:
            educations.append(match.strip())
            
        for ent in doc.ents:
            if 'DIPLOMA' in ent.label_:
                educations.append(ent.text)

        return [edu for edu in set(educations)]
    
    def _extract_gpa(self, doc):
        
        gpas = []
        
        for ent in doc.ents:
            if 'GPA' in ent.label_:
                gpas.append(ent.text)
                
        return [gpa.capitalize() for gpa in set(gpas)]
    
    def _extract_job_titles(self, doc):
        
        job_titles = []
        
        for ent in doc.ents:
            if 'JOB TITLE' in ent.label_:
                job_titles.append(ent.text)
        
        return [job for job in set(job_titles)]
    
    def _extract_years_experiences(self, doc):
        
        years_experiences= []
        
        pattern = self._CATEGORIES_PATTERN['YEARS_EXPERIENCES']
        
        sentences = [sent.text.strip() for sent in doc.sents]

        # for sentence in sentences:
        #     if re.search(pattern, sentence, re.IGNORECASE):
        #         years_experiences_sentences.append(sentence)
        
        for sentence in sentences:
            
            matches = re.findall(pattern, sentence, re.IGNORECASE)
                        
            for match in matches:
                
                # Prevent context not parsed
                try:

                    
                    year = match  # Extracted years
                    year = re.sub(r'\+', '', year)  # Remove '+' if present

                    try:
                        
                        year = int(year)
                    
                    except Exception as e:
                        
                        print(f"Error converting {match} to number: {e}")
                        continue

                    match_doc = self.nlp(sentence)
                    
                    skills = self._extract_skills(match_doc)
                    job_titles = self._extract_job_titles(match_doc)
                    languages = self._extract_languages(match_doc)
                    
                    # Make string out of keywords for similarity scoring
                    keywords_match = ' '.join(skills + languages)
                    keywords_context = ' '.join(job_titles)

                    years_experience_dict = {
                        'text': sentence,
                        'year': year,
                        'keywords_match': keywords_match,
                        'keywords_context': keywords_context
                    }
                    
                    years_experiences.append(years_experience_dict)

                except Exception as e:
                    print(f"Error processing match {match}: {e}")
                    continue 
            
        return years_experiences
    
    def _extract_experiences(self, doc):
        
        experiences = []
        
        pattern = self._CATEGORIES_PATTERN['EXPERIENCES']
        
        sentences = [sent.text.strip() for sent in doc.sents]

        for sentence in sentences:
            if re.search(pattern, sentence, re.IGNORECASE):
                experiences.append(sentence)
                
        for ent in doc.ents:
            if 'EXPERIENCE' in ent.label_:
                experiences.append(ent.text)
        
        return [exp for exp in set(experiences)]
    
    def _extract_skills(self, doc):

        skills = []

        for ent in doc.ents:
            if 'SKILL' in ent.label_ and 'SOFT SKILL' not in ent.label_:
                skills.append(ent.text)
                
        return [skill.capitalize() for skill in set(skills)]
    
    def _extract_soft_skills(self, doc):
        
        soft_skills = []
        
        for ent in doc.ents:
            if 'SOFT SKILL' in ent.label_:
                soft_skills.append(ent.text)
                
        return [soft_skill for soft_skill in set(soft_skills)]

    def _extract_languages(self, doc):
        
        languages = []

        for ent in doc.ents:
            if 'LANGUAGE' in ent.label_:
                languages.append(ent.text)
                
        return [language.capitalize() for language in set(languages)]

    



In [45]:
def cv_parse(text):
    
    model = spacy.load('en_core_web_trf')
    
    print(model.pipe_names)

    doc = model(text)
    for ent in doc.ents:
        print(f'{ent.label_.upper():{30}}- {ent.text}')

In [46]:
PDF_PATH = './PDF/Anthonio Obert - Software Developer - CV (1).pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

extracted_text = ''

for i in range(n_pages):
    page = reader.pages[i]
    extracted_text += page.extract_text()

extracted_text = re.sub(r'\s+', ' ', extracted_text).strip()
extracted_text

"Anthonio Obert Software Developer+62 81273724892 laisobert2@gmail.com Jakarta, Indonesia SUMMARY A passionate college student with a keen interest in software development that is able to learn quickly and delve deeply into new subjects. Currently working as a Database Administrator for Bina Nusantara's Software Laboratory, where I manage and maintain student scores across multiple campuses with honesty and integrity. Capable of working under pressure and meeting deadlines. EXPERIENCE 02/2024 - Present Database Administrator Bina Nusantara University Manage and maintain student scores for laboratory subjects across six campuses: Kemanggisan, Alam Sutera, Bekasi, Bandung, Malang, and Semarang. Maintain web application to support internal and external activities. Create and maintain SQL query for internal and external requests. Provide student's scores data to identify and improve laboratory processes. Schedule important dates for laboratory activities. Post student's scores in Bina Nusa

In [47]:
PDF_PATH = './PDF/Anthonio Obert - Software Developer - CV (1).pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

extracted_text = ''

for i in range(n_pages):
    page = reader.pages[i]
    extracted_text += page.extract_text()

extractor = FeaturesExtractor()

texts, features = extractor.fit_transform([extracted_text])
job_text, job_features = extractor.fit_transform([constants.DUMMY_JOB_DESCRIPTION])

print(f'\nEducations:')
for education in job_features[0]['educations']:
    print(education)
    
print(f'\nGPA:')
gpa = job_features[0]['gpa']
if gpa:
    for g in gpa:
        print(f'  - {g}')
else:
    print('  - No GPA information found')

print(f'\nJob Titles:')
for job_title in job_features[0]['job_titles']:
    print(job_title)
    
print(f'\nYears Experience:')
for years_experience in job_features[0]['years_experiences']:
    print(years_experience)

print(f'\nExperiences:')
for experience in job_features[0]['experiences']:
    print(experience)
    
print(f'\nSkills:')
for skill in job_features[0]['skills']:
    print(skill)   
    
print(f'\nSkills:')
for soft_skill in job_features[0]['soft_skills']:
    print(soft_skill)   
    
print(f'\nLanguages:')
for language in job_features[0]['languages']:
    print(language)   


Educations:
Bachelor's degree in Computer Science or equivalent

GPA:
  - No GPA information found

Job Titles:
Backend Software Engineer
Software Engineer

Years Experience:
{'text': 'Having minimum 3 years of experience in software engineering (Java), application development or system development + experience in RDBMS and NoSQL databases.', 'year': 3, 'keywords_match': 'Nosql Java Software engineering Databases', 'keywords_context': ''}

Experiences:
Schedule important dates for laboratory activities. ‎ Requirements Bachelor's degree in Computer Science or equivalent from a reputable university with good academic results is preferred.
Strong object-oriented analysis and design skills.
As a Backend Software Engineer, you are expected to: Be responsible for designing, building, improving, or maintaining our backend applications, third-party data integration, data API, backend systems, or working with monitoring tools and infrastructure Work in cross-functional teams and meet great peo

In [48]:

# Print all extracted features in a formatted way
print(f'\nName:')
name = features[0]['name']
if name:
    print(f'  - {name[0]}')  # Assuming there is only one name
else:
    print('  - No name found')

print(f'\nPhone:')
phone = features[0]['phone']
if phone:
    for p in phone:
        print(f'  - {p}')
else:
    print('  - No phone number found')

print(f'\nEducations:')
education = features[0]['educations']
if education:
    for edu in education:
        print(f'  - {edu}')
else:
    print('  - No education information found')

print(f'\nGPA:')
gpa = features[0]['gpa']
if gpa:
    for g in gpa:
        print(f'  - {g}')
else:
    print('  - No GPA information found')
    
print(f'\nJob Titles:')
job_titles = features[0]['job_titles']
if job_titles:
    for job in job_titles:
        print(f'  - {job}')
else:
    print('  - No job titles found')
    
    
print(f'\nYears Experience:')
years_experiences = features[0]['years_experiences']
if years_experiences:
    for years in years_experiences:
        print(f'  - {years}')
else:
    print('  - No years experience found')

print(f'\nExperiences:')
experience = features[0]['experiences']
if experience:
    for exp in experience:
        print(f'  - {exp}')
else:
    print('  - No experience information found')

print(f'\nSkills:')
skills = features[0]['skills']
if skills:
    for skill in skills:
        print(f'  - {skill}')
else:
    print('  - No skills found')

print(f'\nSoft Skills:')
soft_skills = features[0]['soft_skills']
if soft_skills:
    for s_skill in soft_skills:
        print(f'  - {s_skill}')
else:
    print('  - No soft skills found')

print(f'\nLanguages:')
languages = features[0]['languages']
if languages:
    for language in languages:
        print(f'  - {language}')
else:
    print('  - No languages found')
    
    


Name:
  - No name found

Phone:
  - +62 81273724892

Educations:
  - Bachelor's Degree GPA
  - Bachelor's Degree GPA:

GPA:
  - 3.82

Job Titles:
  - Database Administrator

Years Experience:
  - No years experience found

Experiences:
  - Provide student's scores data to identify and improve laboratory processes.
  - Currently working as a Database Administrator for Bina Nusantara's Software Laboratory, where I manage and maintain student scores across multiple campuses with honesty and integrity.
  - Create and maintain SQL query for internal and external requests.
  - EXPERIENCE 02/2024 - Present Database Administrator Bina Nusantara University Manage and maintain student scores for laboratory subjects across six campuses: Kemanggisan, Alam Sutera, Bekasi, Bandung, Malang, and Semarang.
  - Post student's scores in Bina Nusantara University's internal application for students Process assistant's honor payment for case making
  - Schedule important dates for laboratory activities.
 

In [49]:
class ResumeRater:
    
    def __init__(self):
        
        self._pretrained_model = SentenceTransformer(constants.PRETRAINED_SENTENCE_TRANSFORMERS_MODEL)
        
        self._RATING_WEIGHTS = constants.RATING_WEIGHTS
            
    def fit_transform(self, job_description_text: List[str], resume_text: List[dict]):
        
        self._extractor = FeaturesExtractor()
        
        job_text, job_features = extractor.fit_transform(job_description_text)

        # Only 1 job posting description
        self._job_text = job_text[0] 
        self._job_features = job_features[0]
        
        self._resume_text, self._resume_features = extractor.fit_transform(resume_text)
        
        for feature in self._resume_features:
        
            feature['rating_details'] = {
                
                'educations': self._rate_educations(feature['educations']),
                'gpa': self._rate_gpa(feature['gpa']),
                'job_titles' : self._rate_job_titles(feature['job_titles']),
                # 'years_experiences': self._rate_years_experiences(feature['years_experience']), TO BE UPDATED
                'years_experiences': 0,
                'experiences': self._rate_experiences(feature['experiences']),
                'skills': self._rate_skills(feature['skills']),
                'soft_skills': self._rate_soft_skills(feature['soft_skills']),
                'languages': self._rate_languages(feature['languages']),
                
            }
            
            final_rating = 0
            
            for category, rating in feature['rating_details'].items():
                
                try:
                    
                    calculated_rating =  rating * self._RATING_WEIGHTS[str(category).upper()]
                    
                except Exception as e:
                    
                    print(f'Error on parsing rating weights: {e}')
                    continue
                
                final_rating += calculated_rating
                 
            feature['rating'] = final_rating
            
        
        return self._resume_text, self._resume_features

    def top_resume_by_rating(self, k = 5):
            
        sorted_resumes = sorted(self._resume_features, key=lambda x: x['rating'], reverse=True)
    
        top_k_resumes = sorted_resumes[:k]

        return top_k_resumes
            
    def _calculate_cosine_similarity_matrix_mean(self, job_feature_category: List[str], resume_feature_category: List[str], use_threshold= False, threshold= 0.42):
        
        job_embeddings = self._pretrained_model.encode(job_feature_category)
        resume_embeddings  = self._pretrained_model.encode(resume_feature_category)
        
        cosine_sim_matrix = util.pytorch_cos_sim(job_embeddings, resume_embeddings).numpy()
        
        top_similarity = cosine_sim_matrix.max(axis= 1) # Rows: Job, Columns: Resume -> Get max similarity per job
        
        score = top_similarity.mean()
        
        if use_threshold:

            matches = np.sum(top_similarity >= threshold)
                
            # Calculate the score based on the number of matches and similarity >= threshold
            if matches > 0:
                score = (matches / len(job_text)) * np.mean(top_similarity[top_similarity >= threshold])
            else:
                score = 0.0
                        
        return float(score)

    def _calculate_matching_words_score(self, job_word_list: List[str], resume_word_list: List[str]):
        
        n_job_word = len(job_word_list)
        
        if n_job_word <= 0:
            return 1 # No extracted word on the category
        
        job_word_list = [word.lower() for word in job_word_list]
        resume_word_list = [word.lower() for word in resume_word_list]
        
        score = 0
        
        for resume_word in resume_word_list:
            
            if resume_word in job_word_list:
                score += 1

        score /= n_job_word
        
        return score
        
    
    def _rate_educations(self, resume_feature: List[str]):
        
        return self._calculate_cosine_similarity_matrix_mean(self._job_features['educations'], resume_feature)
    
    def _rate_gpa(self, resume_feature: List[str]):
        
        return 0
    
    def _rate_job_titles(self, resume_feature: List[str]):
        
        return self._calculate_cosine_similarity_matrix_mean(self._job_features['job_titles'], resume_feature, use_threshold=False)
    
    def _rate_years_experiences(self, resume_feature: List[str]):
        
        return self._calculate_cosine_similarity_matrix_mean(self._job_features['educations'], resume_feature)
    
    def _rate_experiences(self, resume_feature: List[str]):
        
        return self._calculate_cosine_similarity_matrix_mean(self._job_features['experiences'], resume_feature)
        
    def _rate_skills(self, resume_feature: List[str]):
        
        return self._calculate_matching_words_score(self._job_features['skills'], resume_feature)
        
    def _rate_soft_skills(self, resume_feature: List[str]):
        
        return self._calculate_cosine_similarity_matrix_mean(self._job_features['soft_skills'], resume_feature)
        
    def _rate_languages(self, resume_feature: List[str]):
        
        return self._calculate_matching_words_score(self._job_features['languages'], resume_feature)
        
        
    
    
    
            
            
    

In [50]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [51]:
test_resume_features = [
    {
        'year': 5,
        'keywords_match': 'Java Nosql Databases',
        'keywords_context': 'Software engineering'
    },
    {
        'year': 3,
        'keywords_match': 'Typescript Javascript',
        'keywords_context': 'Web Developer'
    },
    {
        'year': 8,
        'keywords_match': 'Python Machine Learning TensorFlow',
        'keywords_context': 'Data Scientist'
    },
    {
        'year': 2,
        'keywords_match': 'SQL Data Analysis Excel',
        'keywords_context': 'Data Analyst'
    },
    {
        'year': 4,
        'keywords_match': 'HTML CSS React',
        'keywords_context': 'Frontend Developer'
    },
]

test_job_features = [
    {
        'year': 4,
        'keywords_match': 'Java Spring SQL',
        'keywords_context': 'Backend Developer'
    },
    {
        'year': 2,
        'keywords_match': 'JavaScript React CSS',
        'keywords_context': 'Frontend Developer'
    },
    {
        'year': 5,
        'keywords_match': 'Python Machine Learning NLP',
        'keywords_context': 'Machine Learning Engineer'
    },
    {
        'year': 3,
        'keywords_match': 'AWS Docker Kubernetes',
        'keywords_context': 'DevOps Engineer'
    }
]


def _calculate_cosine_similarity_matrix_mean(job_feature_category: List[str], resume_feature_category: List[str], row_weights: List[float] = None):
    
    n_job_word = len(job_feature_category)
    
    if n_job_word <= 0:
        return 1
    
    print(f'Job: {job_feature_category}')
    print(f'Resume: {resume_feature_category}')

    
    job_embeddings = model.encode(job_feature_category)
    resume_embeddings  = model.encode(resume_feature_category)
    
    cosine_sim_matrix = util.pytorch_cos_sim(job_embeddings, resume_embeddings).numpy()
    
    if row_weights is not None:
        
        cosine_sim_matrix = cosine_sim_matrix * np.array(row_weights).reshape((-1, 1)) # Transform 1D to 2D with 1 data per columns
    
    top_similarity = cosine_sim_matrix.max(axis= 1) # Because job is in rows
    
    return cosine_sim_matrix, top_similarity , top_similarity.mean()


def _calculate_matching_words_score(job_word_list: List[str], resume_word_list: List[str]):
        
    n_job_word = len(job_word_list)
    
    if n_job_word <= 0:
        return 1 # No extracted word on the category
    
    job_word_list = [word.lower() for word in job_word_list]
    resume_word_list = [word.lower() for word in resume_word_list]
    
    print(job_word_list, resume_word_list)
    
    score = 0
    
    for resume_word in resume_word_list:
        print(resume_word, job_word_list)
        if resume_word in job_word_list:
            score += 1

    score /= n_job_word
    
    return score
        

def _calculate_year_weight(job_year, resume_year):
    
    print(job_year, resume_year)
    weight = min(job_year / resume_year, 1)
    
    
    return weight

    
mtx, top_similarity, context_score = _calculate_cosine_similarity_matrix_mean([features['keywords_context'] for features in test_job_features], 
                                                                                [features['keywords_context'] for features in test_resume_features])

match_score = []

for job_keywords in test_job_features:
    
    score_per_job = []
    
    for resume_keywords in test_resume_features:
        
        score = _calculate_matching_words_score([job_keywords['keywords_match']], [resume_keywords['keywords_match']])
        
        score_per_job.append(score)
        
    match_score.append(score_per_job)
    


print(f'match score: {match_score}')
print(f'top_similarity: {top_similarity}')


# print((top_similarity * constants.YEARS_EXPERIENCE_KEYWORDS_WEIGHTS['KEYWORDS_CONTEXT']
#       +
#       match_score * constants.YEARS_EXPERIENCE_KEYWORDS_WEIGHTS['KEYWORDS_MATCH'])
#       *
#       _calculate_year_weight([features['year'] for features in test_job_features], 
#                                 [features['year'] for features in test_resume_features])
#       )


Job: ['Backend Developer', 'Frontend Developer', 'Machine Learning Engineer', 'DevOps Engineer']
Resume: ['Software engineering', 'Web Developer', 'Data Scientist', 'Data Analyst', 'Frontend Developer']
['java spring sql'] ['java nosql databases']
java nosql databases ['java spring sql']
['java spring sql'] ['typescript javascript']
typescript javascript ['java spring sql']
['java spring sql'] ['python machine learning tensorflow']
python machine learning tensorflow ['java spring sql']
['java spring sql'] ['sql data analysis excel']
sql data analysis excel ['java spring sql']
['java spring sql'] ['html css react']
html css react ['java spring sql']
['javascript react css'] ['java nosql databases']
java nosql databases ['javascript react css']
['javascript react css'] ['typescript javascript']
typescript javascript ['javascript react css']
['javascript react css'] ['python machine learning tensorflow']
python machine learning tensorflow ['javascript react css']
['javascript react css'] 

In [52]:
test = [0.2, 0.3]

print(test)

print(np.array(test).reshape((-1, 1)))

[0.2, 0.3]
[[0.2]
 [0.3]]


In [53]:
job_text = ['C++', 'Embedded Systems', 'Firmware', 'IoT', 'Frontend engineer']
resume_text = ['Java', 'Nosql', 'Databases', 'Software engineering']

job = model.encode(job_text)
resume = model.encode(resume_text)

cosine_sim = util.pytorch_cos_sim(job, resume)

job_len = len(job)

cosine_sim_matrix = cosine_sim.numpy()
print(cosine_sim_matrix)

max_per_job = cosine_sim_matrix.max(axis=1)

print()
print(max_per_job)

print()
print(max_per_job.mean())

threshold = 0.5

matches = np.sum(max_per_job >= threshold)
    
# Calculate the score based on the number of matches and their strengths
if matches > 0:
    score = (matches / len(job_text)) * np.mean(max_per_job[max_per_job >= threshold])
else:
    score = 0.0
    
print(f"Resume score: {score:.4f}")


# rating = 0

# if resume.lower() in job.lower():
#     rating += 0.1
    
# print(rating)

[[0.5067527  0.17577899 0.28201088 0.42740208]
 [0.3362365  0.22355133 0.27252504 0.3434708 ]
 [0.23669623 0.10252491 0.18281116 0.29533428]
 [0.3382214  0.3504028  0.29262048 0.22224598]
 [0.2891426  0.19791049 0.21635374 0.54375005]]

[0.5067527  0.3434708  0.29533428 0.3504028  0.54375005]

0.40794215
Resume score: 0.2101


In [54]:
job_text = ['Marketing Manager', 'Social Media Specialist', 'Content Creator', 'Software Engineer']
resume_text = ['Marketing Supervisor', 'Project Manager', 'CAD Designer', 'Manufacturing Supervisor', 'Database Administrator']

job = model.encode(job_text)
resume = model.encode(resume_text)

cosine_sim = util.pytorch_cos_sim(job, resume)

job_len = len(job)

cosine_sim_matrix = cosine_sim.numpy()
print(cosine_sim_matrix)

max_per_job = cosine_sim_matrix.max(axis=1)

print()
print(max_per_job)

print()
print(max_per_job.mean())

threshold = 0.5

matches = np.sum(max_per_job >= threshold)
    
# Calculate the score based on the number of matches and their strengths
if matches > 0:
    score = (matches / len(job_text)) * np.mean(max_per_job[max_per_job >= threshold])
else:
    score = 0.0
    
print(f"Resume score: {score:.4f}")


# rating = 0

# if resume.lower() in job.lower():
#     rating += 0.1
    
# print(rating)

[[0.8246068  0.5252687  0.32776105 0.4130693  0.32009926]
 [0.46173808 0.3680032  0.29165304 0.32076508 0.30375063]
 [0.2843247  0.3277973  0.30289125 0.30853605 0.2963429 ]
 [0.3632027  0.4990375  0.45114562 0.35039157 0.30742115]]

[0.8246068  0.46173808 0.3277973  0.4990375 ]

0.5282949
Resume score: 0.2062


In [55]:
import time
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Define texts
texts = [
    "Bachelor's Degree GPA",
    "Bachelor's Degree GPAaaaaaaaaaaaaaaaaaaaa:"
]
job_descriptions = [
    "Bachelor's degree in Computer Science or equivalent from a reputable university with good academic results is preferred",
    "Bachelor's degree in Computer Science"
]

# Benchmark model.similarity (if available)
start_time = time.time()
similarity_scores = model.similarity(model.encode(job_descriptions), model.encode(texts))  # Hypothetical function
end_time = time.time()
print("Time taken using model.similarity:", end_time - start_time)

# Benchmark manual encoding + util.pytorch_cos_sim
start_time = time.time()
resume_embeddings = model.encode(texts, convert_to_numpy=True)
job_embeddings = model.encode(job_descriptions, convert_to_numpy=True)
cosine_sim = util.pytorch_cos_sim(job_embeddings, resume_embeddings)
end_time = time.time()
print("Time taken using manual encoding + util.pytorch_cos_sim:", end_time - start_time)

Time taken using model.similarity: 0.026009082794189453
Time taken using manual encoding + util.pytorch_cos_sim: 0.02451157569885254


In [56]:
list1 = ['test', 'test2']
list2 = ['test3']
list3 = ['test4']

final_list = list1 + list2 + list3

' '.join(final_list)

'test test2 test3 test4'

In [57]:
rating_model = ResumeRater()

resume_text, resume_features = rating_model.fit_transform([constants.DUMMY_JOB_DESCRIPTION], [extracted_text])

In [58]:
top_resume = rating_model.top_resume_by_rating()

top_resume

[{'resume_idx': 0,
  'name': [],
  'phone': ['+62 81273724892'],
  'educations': ["Bachelor's Degree GPA", "Bachelor's Degree GPA:"],
  'gpa': ['3.82'],
  'job_titles': ['Database Administrator'],
  'years_experiences': [],
  'experiences': ["Provide student's scores data to identify and improve laboratory processes.",
   "Currently working as a Database Administrator for Bina Nusantara's Software Laboratory, where I manage and maintain student scores across multiple campuses with honesty and integrity.",
   'Create and maintain SQL query for internal and external requests.',
   'EXPERIENCE 02/2024 - Present Database Administrator Bina Nusantara University Manage and maintain student scores for laboratory subjects across six campuses: Kemanggisan, Alam Sutera, Bekasi, Bandung, Malang, and Semarang.',
   "Post student's scores in Bina Nusantara University's internal application for students Process assistant's honor payment for case making",
   'Schedule important dates for laboratory a

In [59]:
for idx, resume in enumerate(resume_features):
    
    print(f"\nResume {idx + 1}'s Rating = {resume['rating']:.2f}, details:\n")
    
    for category, rating in resume['rating_details'].items():
        
        print(f' - {category.capitalize()}: {rating:.4f}')
    


Resume 1's Rating = 0.36, details:

 - Educations: 0.6003
 - Gpa: 0.0000
 - Job_titles: 0.3275
 - Years_experiences: 0.0000
 - Experiences: 0.4563
 - Skills: 0.1600
 - Soft_skills: 0.1866
 - Languages: 1.0000
