In [76]:
import spacy
import random
from PyPDF2 import PdfReader
import pandas as pd

from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc

import pandas as pd
import numpy as np
import jsonlines

import re
import nltk
from typing import List

from sentence_transformers import SentenceTransformer, util

In [77]:
import constants

In [78]:
class FeaturesExtractor:
    """Turn raw resume / job-description text into a dictionary of extracted features.

    The extractor wraps the spaCy pipeline stored in ``./data/train_model/model_ner/``
    and combines its entities with the regular expressions found in
    ``constants.CATEGORIES_PATTERN``.
    """

    def __init__(self):
        """Load the NER pipeline and make sure the sentencizer and entity ruler are present."""

        self.nlp = spacy.load('./data/train_model/model_ner/')

        if "sentencizer" not in self.nlp.pipe_names:
            self.nlp.add_pipe("sentencizer")

        if 'entity_ruler' not in self.nlp.pipe_names:
            ruler = self.nlp.add_pipe('entity_ruler', after='ner')
            ruler.from_disk(constants.SKILLS_PATTERN_PATH)

            # Drop missing titles: a NaN is not a valid ruler pattern.
            job_titles = pd.read_csv(constants.JOB_TITLE_PATH)['Job Title'].dropna().unique()

            # Register every title in a single call instead of one call per title.
            ruler.add_patterns([{"label": "JOB TITLE", "pattern": title} for title in job_titles])

        self._CATEGORIES_PATTERN = constants.CATEGORIES_PATTERN

    @staticmethod
    def _unique(values):
        """Return ``values`` as a list without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(values))

    def fit_transform(self, input: List[str]):
        """Extract features from every text in ``input``.

        Args:
            input: Raw texts (resumes or job descriptions).

        Returns:
            A tuple ``(texts, features)``: the whitespace-normalised texts and,
            in the same order, one feature dictionary per text.
        """

        self._text_arr = []
        self._feature_arr = []
        self._input_len = len(input)

        for i in range(self._input_len):

            doc = self._remove_excess_spaces(input[i])
            self._text_arr.append(doc.text)

            self._extract_features(i, doc)

        return self._text_arr, self._feature_arr

    def _remove_excess_spaces(self, text):
        """Collapse whitespace runs in ``text`` to single spaces and return the parsed doc."""

        return self.nlp(re.sub(r'\s+', ' ', text).strip())

    def _extract_features(self, resume_idx, doc):
        """Build the feature dictionary for ``doc`` and append it to ``self._feature_arr``."""

        feature_dict = {

            'resume_idx': resume_idx,
            'name': self._extract_name(doc),
            'phone': self._extract_phone(doc),
            'educations': self._extract_educations(doc),
            'gpa': self._extract_gpa(doc),
            'job_titles' : self._extract_job_titles(doc),
            'years_experiences': self._extract_years_experiences(doc),
            'experiences': self._extract_experiences(doc),
            'skills': self._extract_skills(doc),
            'soft_skills': self._extract_soft_skills(doc),
            'languages': self._extract_languages(doc),

        }

        self._feature_arr.append(feature_dict)

    def _extract_name(self, doc):
        """Return the text of every ``PERSON`` entity, in document order (duplicates kept)."""

        return [ent.text for ent in doc.ents if ent.label_ == 'PERSON']

    def _extract_phone(self, doc):
        """Return every substring of the document text that matches the phone-number regex."""

        pattern = r'(?:\+?(?:\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?(?:\d{2,4}[-.\s]?){2,5}\d{2,4})'

        return re.findall(pattern, doc.text)

    def _extract_educations(self, doc):
        """Return unique education mentions from the regex and from ``DIPLOMA`` entities."""

        pattern = self._CATEGORIES_PATTERN['EDUCATIONS']

        educations = [match.strip() for match in re.findall(pattern, doc.text)]
        educations.extend(ent.text for ent in doc.ents if 'DIPLOMA' in ent.label_)

        return self._unique(educations)

    def _extract_gpa(self, doc):
        """Return unique, capitalised texts of entities whose label contains ``GPA``."""

        # Normalise first, then de-duplicate, so case variants collapse to one entry.
        return self._unique(ent.text.capitalize() for ent in doc.ents if 'GPA' in ent.label_)

    def _extract_job_titles(self, doc):
        """Return unique texts of entities whose label contains ``JOB TITLE``."""

        return self._unique(ent.text for ent in doc.ents if 'JOB TITLE' in ent.label_)

    def _extract_years_experiences(self, doc):
        """Find "N years" statements and attach the keywords found in the same sentence.

        Returns:
            A list of dictionaries with the keys ``text`` (the sentence),
            ``year`` (int), ``keywords_match`` (skills + languages) and
            ``keywords_context`` (job titles). Matches whose sentence contains
            none of those keywords are skipped.
        """

        years_experiences = []

        pattern = self._CATEGORIES_PATTERN['YEARS_EXPERIENCES']

        for sent in doc.sents:

            sentence = sent.text.strip()
            matches = re.findall(pattern, sentence, re.IGNORECASE)

            # The keywords depend only on the sentence, so the sentence is parsed
            # lazily and at most once, however many matches it contains.
            sentence_keywords = None

            for match in matches:

                # Best effort: a match that cannot be processed is reported and skipped.
                try:

                    year_text = re.sub(r'\+', '', match)  # Remove '+' if present

                    try:
                        year = int(year_text)
                    except ValueError as e:
                        print(f"Error converting {match} to number: {e}")
                        continue

                    if sentence_keywords is None:
                        match_doc = self.nlp(sentence)
                        sentence_keywords = (
                            self._extract_skills(match_doc) + self._extract_languages(match_doc),
                            self._extract_job_titles(match_doc),
                        )

                    keywords_match, keywords_context = sentence_keywords

                    if keywords_match or keywords_context:
                        years_experiences.append({
                            'text': sentence,
                            'year': year,
                            # Copies, so entries from the same sentence do not share lists.
                            'keywords_match': list(keywords_match),
                            'keywords_context': list(keywords_context),
                        })

                except Exception as e:
                    print(f"Error processing match {match}: {e}")
                    continue

        return years_experiences

    def _extract_experiences(self, doc):
        """Return unique experience sentences (regex) and ``EXPERIENCE`` entity texts."""

        pattern = self._CATEGORIES_PATTERN['EXPERIENCES']

        experiences = []

        for sent in doc.sents:
            sentence = sent.text.strip()
            if re.search(pattern, sentence, re.IGNORECASE):
                experiences.append(sentence)

        experiences.extend(ent.text for ent in doc.ents if 'EXPERIENCE' in ent.label_)

        return self._unique(experiences)

    def _extract_skills(self, doc):
        """Return unique, capitalised hard skills (``SKILL`` labels that are not ``SOFT SKILL``)."""

        return self._unique(
            ent.text.capitalize()
            for ent in doc.ents
            if 'SKILL' in ent.label_ and 'SOFT SKILL' not in ent.label_
        )

    def _extract_soft_skills(self, doc):
        """Return unique texts of entities whose label contains ``SOFT SKILL``."""

        return self._unique(ent.text for ent in doc.ents if 'SOFT SKILL' in ent.label_)

    def _extract_languages(self, doc):
        """Return unique, capitalised texts of entities whose label contains ``LANGUAGE``."""

        return self._unique(ent.text.capitalize() for ent in doc.ents if 'LANGUAGE' in ent.label_)

    



In [79]:
def cv_parse(text):
    """Run the ``en_core_web_trf`` pipeline on ``text`` and print what it finds.

    Prints the pipeline's component names first, then one line per entity:
    the upper-cased label padded to a 30-character column, followed by the
    entity text.
    """
    pipeline = spacy.load('en_core_web_trf')

    print(pipeline.pipe_names)

    for entity in pipeline(text).ents:
        label = entity.label_.upper()
        print(f'{label:{30}}- {entity.text}')

In [80]:
# Resume PDF to inspect.
PDF_PATH = './PDF/Anthonio Obert - Software Developer - CV (1).pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

# Concatenate the text of every page, in page order.
extracted_text = ''.join(
    reader.pages[page_idx].extract_text() for page_idx in range(n_pages)
)

# Collapse every whitespace run to a single space before displaying the text.
extracted_text = re.sub(r'\s+', ' ', extracted_text).strip()
extracted_text

"Anthonio Obert Software Developer+62 81273724892 laisobert2@gmail.com Jakarta, Indonesia SUMMARY A passionate college student with a keen interest in software development that is able to learn quickly and delve deeply into new subjects. Currently working as a Database Administrator for Bina Nusantara's Software Laboratory, where I manage and maintain student scores across multiple campuses with honesty and integrity. Capable of working under pressure and meeting deadlines. EXPERIENCE 02/2024 - Present Database Administrator Bina Nusantara University Manage and maintain student scores for laboratory subjects across six campuses: Kemanggisan, Alam Sutera, Bekasi, Bandung, Malang, and Semarang. Maintain web application to support internal and external activities. Create and maintain SQL query for internal and external requests. Provide student's scores data to identify and improve laboratory processes. Schedule important dates for laboratory activities. Post student's scores in Bina Nusa

In [81]:
# Resume PDF whose features are extracted in this cell.
PDF_PATH = './PDF/Smith Resume.pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

# Concatenate the raw text of every page, in page order.
extracted_text = ''

for i in range(n_pages):
    page = reader.pages[i]
    extracted_text += page.extract_text()

extractor = FeaturesExtractor()

# `features` holds the resume's features, `job_features` those of the dummy job description.
texts, features = extractor.fit_transform([extracted_text])
job_text, job_features = extractor.fit_transform([constants.DUMMY_JOB_DESCRIPTION])

# Everything printed below describes the JOB DESCRIPTION, not the resume.
print('\nEducations:')
for education in job_features[0]['educations']:
    print(education)

print('\nGPA:')
gpa = job_features[0]['gpa']
if gpa:
    for g in gpa:
        print(f'  - {g}')
else:
    print('  - No GPA information found')

print('\nJob Titles:')
for job_title in job_features[0]['job_titles']:
    print(job_title)

print('\nYears Experience:')
for years_experience in job_features[0]['years_experiences']:
    print(years_experience)

print('\nExperiences:')
for experience in job_features[0]['experiences']:
    print(experience)

print('\nSkills:')
for skill in job_features[0]['skills']:
    print(skill)

# This section lists soft skills; it was previously printed under a second "Skills:" heading.
print('\nSoft Skills:')
for soft_skill in job_features[0]['soft_skills']:
    print(soft_skill)

print('\nLanguages:')
for language in job_features[0]['languages']:
    print(language)


Educations:
Bachelor's degree in Computer Science or equivalent

GPA:
  - No GPA information found

Job Titles:
Backend Software Engineer
Software Engineer

Years Experience:
{'text': 'Having minimum 3 years of experience in software engineering (Java), application development or system development + experience in RDBMS and NoSQL databases.', 'year': 3, 'keywords_match': ['Databases', 'Nosql', 'Java', 'Software engineering'], 'keywords_context': []}

Experiences:
Traveloka services related to new products, business models, business growth, market expansion and process optimization.
Comfortable working up and down the technology stack.
Good business acumen, excellent problem skills and broad understanding of software and system design.
Schedule important dates for laboratory activities. ‎ Requirements Bachelor's degree in Computer Science or equivalent from a reputable university with good academic results is preferred.
As a Backend Software Engineer, you are expected to: Be responsibl

In [82]:

# Print every feature extracted from the resume, one section per category.
# Each section lists its values as bullets, or a fallback line when empty.

print('\nName:')
name = features[0]['name']
# Only the first detected name is shown (assumes a single name per resume).
print(f'  - {name[0]}' if name else '  - No name found')

print('\nPhone:')
phone = features[0]['phone']
for p in phone:
    print(f'  - {p}')
if not phone:
    print('  - No phone number found')

print('\nEducations:')
education = features[0]['educations']
for edu in education:
    print(f'  - {edu}')
if not education:
    print('  - No education information found')

print('\nGPA:')
gpa = features[0]['gpa']
for g in gpa:
    print(f'  - {g}')
if not gpa:
    print('  - No GPA information found')

print('\nJob Titles:')
job_titles = features[0]['job_titles']
for job in job_titles:
    print(f'  - {job}')
if not job_titles:
    print('  - No job titles found')

print('\nYears Experience:')
years_experiences = features[0]['years_experiences']
for years in years_experiences:
    print(f'  - {years}')
if not years_experiences:
    print('  - No years experience found')

print('\nExperiences:')
experience = features[0]['experiences']
for exp in experience:
    print(f'  - {exp}')
if not experience:
    print('  - No experience information found')

print('\nSkills:')
skills = features[0]['skills']
for skill in skills:
    print(f'  - {skill}')
if not skills:
    print('  - No skills found')

print('\nSoft Skills:')
soft_skills = features[0]['soft_skills']
for s_skill in soft_skills:
    print(f'  - {s_skill}')
if not soft_skills:
    print('  - No soft skills found')

print('\nLanguages:')
languages = features[0]['languages']
for language in languages:
    print(f'  - {language}')
if not languages:
    print('  - No languages found')
    
    


Name:
  - No name found

Phone:
  - 140749

Educations:
  - No education information found

GPA:
  - No GPA information found

Job Titles:
  - No job titles found

Years Experience:
  - {'text': 'Michael Smith BI / Big Data/ Azure Manchester , UK- Email me on Indeed: indeed.com/r/ falicent/140749dace5dc26f 10+ years of Experience in Designing, Development, Administration, Analysis, Management inthe Business Intelligence Da ta warehousing, Client Server Technologies, Web -based Applications, cloud solutions and Databases.', 'year': 10, 'keywords_match': ['Databases', 'Azure', 'Server', 'Business intelligence'], 'keywords_context': []}
  - {'text': 'Less than 1 year) ADDITIONAL INFORMATION Professiona l Skills Excellent analytical, problem solving, communication, knowledge transfer and interpersonalskills with ability to interact with individuals at all the levels Quick learner and maintains cordial relationship with project manager and team members and good performer both in team and i

In [83]:

class ResumeRater:
    """Score resumes against a single job description.

    Features are extracted with ``FeaturesExtractor``; each category is rated
    in isolation and the per-category ratings are combined with the weights
    in ``constants.RATING_WEIGHTS``.
    """

    def __init__(self):
        """Load the sentence-embedding model and the rating weights from ``constants``."""

        self._pretrained_model = SentenceTransformer(constants.PRETRAINED_SENTENCE_TRANSFORMERS_MODEL)

        self._RATING_WEIGHTS = constants.RATING_WEIGHTS
        self._YEARS_EXPERIENCES_WEIGHTS = constants.YEARS_EXPERIENCES_WEIGHTS

    def fit_transform(self, job_description_text: List[str], resume_text: List[str]):
        """Extract features from the job description and the resumes, then rate each resume.

        Args:
            job_description_text: Texts of job descriptions; only the first one is used.
            resume_text: Raw resume texts.

        Returns:
            A tuple ``(job_features, resume_features)``. ``job_features`` is the
            feature dictionary of the job description. ``resume_features`` holds
            one dictionary per resume, extended with ``rating_details``
            (per-category ratings) and ``rating`` (their weighted sum).
        """

        # Loading the NER pipeline is expensive, so the extractor is created once and reused.
        if getattr(self, '_extractor', None) is None:
            self._extractor = FeaturesExtractor()

        job_text, job_features = self._extractor.fit_transform(job_description_text)

        # Only 1 job posting description
        self._job_text = job_text[0]
        self._job_features = job_features[0]

        self._resume_text, self._resume_features = self._extractor.fit_transform(resume_text)

        for feature in self._resume_features:

            feature['rating_details'] = {

                'educations': self._rate_educations(feature['educations']),
                'gpa': self._rate_gpa(feature['gpa']),
                'job_titles' : self._rate_job_titles(feature['job_titles']),
                'years_experiences': self._rate_years_experiences(feature['years_experiences']),
                'experiences': self._rate_experiences(feature['experiences']),
                'skills': self._rate_skills(feature['skills']),
                'soft_skills': self._rate_soft_skills(feature['soft_skills']),
                'languages': self._rate_languages(feature['languages']),

            }

            final_rating = 0

            for category, rating in feature['rating_details'].items():

                # Best effort: a category without a usable weight is reported and left out of the total.
                try:
                    calculated_rating = rating * self._RATING_WEIGHTS[str(category).upper()]
                except Exception as e:
                    print(f'Error on parsing rating weights: {e}')
                    continue

                final_rating += calculated_rating

            feature['rating'] = final_rating

        return self._job_features, self._resume_features

    def top_resume_by_rating(self, k = 5):
        """Return the ``k`` best-rated resumes, highest rating first.

        Raises:
            ValueError: If ``k`` is zero or negative.
        """

        if k <= 0:
            raise ValueError(f"Parameter k must be non-negative and non-zero, got {k}.")

        ranked = sorted(self._resume_features, key=lambda x: x['rating'], reverse=True)

        return ranked[:k]

    def _check_feature_availability(self, job_feature: List[str], resume_feature: List[str]):
        """Short-circuit the rating when one side has no data.

        Returns:
            ``1`` when the job has no entries for the category, ``0`` when the
            job has entries but the resume has none, and ``-1`` when both
            sides have data and a real comparison is needed.
        """

        # Truthiness covers both empty lists and None.
        if not job_feature:
            return 1

        if not resume_feature:
            return 0

        return -1

    def _calculate_cosine_similarity_matrix_mean(self, job_feature_category: List[str], resume_feature_category: List[str], return_matrix = False, use_threshold= False, threshold= 0.42):
        """Rate a category by the embedding similarity of job and resume phrases.

        For every job phrase the best cosine similarity against any resume
        phrase is taken, and the score is the mean of those best values. With
        ``use_threshold`` only best values ``>= threshold`` count: the score
        becomes (share of job phrases matched) * (mean similarity of the matches).

        Returns:
            The score as a float, or ``(score, similarity_matrix)`` when
            ``return_matrix`` is true (rows: job phrases, columns: resume
            phrases). When either side is empty, the 1/0 value from
            ``_check_feature_availability`` is returned instead of a float,
            together with a constant matrix when ``return_matrix`` is true.
        """

        check_feature = self._check_feature_availability(job_feature_category, resume_feature_category)

        if check_feature != -1:

            m = len(job_feature_category or [])
            n = len(resume_feature_category or [])

            return (check_feature, np.full((m, n), check_feature)) if return_matrix else check_feature

        job_embeddings = self._pretrained_model.encode(job_feature_category)
        resume_embeddings = self._pretrained_model.encode(resume_feature_category)

        cosine_sim_matrix = util.pytorch_cos_sim(job_embeddings, resume_embeddings).numpy()

        top_similarity = cosine_sim_matrix.max(axis= 1) # Rows: Job, Columns: Resume -> Get max similarity per job

        score = top_similarity.mean()

        if use_threshold:

            above_threshold = top_similarity >= threshold
            matches = np.sum(above_threshold)

            # Calculate the score based on the number of matches and similarity >= threshold
            if matches > 0:
                score = (matches / len(job_feature_category)) * np.mean(top_similarity[above_threshold])
            else:
                score = 0.0

        if return_matrix:
            return float(score), cosine_sim_matrix

        return float(score)

    def _calculate_matching_words_score(self, job_word_list: List[str], resume_word_list: List[str]):
        """Return the share of distinct job keywords that also occur in the resume.

        The comparison is case-insensitive. Both lists are reduced to their
        distinct words first, so repeated resume keywords are not counted
        twice and the score stays within ``[0, 1]``.
        """

        check_feature = self._check_feature_availability(job_word_list, resume_word_list)

        if check_feature != -1:
            return check_feature

        job_words = {word.lower() for word in job_word_list}
        resume_words = {word.lower() for word in resume_word_list}

        return len(job_words & resume_words) / len(job_words)

    def _calculate_year_weight(self, job_year_list: List[int], resume_year_list: List[int]):
        """Weight every (job, resume) pair by how far the resume meets the required years.

        Returns:
            A matrix of shape ``(len(job_year_list), len(resume_year_list))``
            holding ``min(resume_year / job_year, 1)``; a requirement of zero
            (or fewer) years counts as fully met. When either list is empty,
            the 1/0 value from ``_check_feature_availability`` is returned.
        """

        check_feature = self._check_feature_availability(job_year_list, resume_year_list)

        if check_feature != -1:
            return check_feature

        job_years = np.asarray(job_year_list, dtype=float).reshape(-1, 1)
        resume_years = np.asarray(resume_year_list, dtype=float).reshape(1, -1)

        # Avoid dividing by zero; those rows are overwritten with 1 below.
        has_requirement = job_years > 0
        safe_job_years = np.where(has_requirement, job_years, 1.0)

        weight_mtx = np.minimum(resume_years / safe_job_years, 1.0)

        return np.where(has_requirement, weight_mtx, 1.0)

    def _rate_educations(self, resume_feature: List[str]):
        """Rate the resume's educations by embedding similarity to the job's."""

        return self._calculate_cosine_similarity_matrix_mean(self._job_features['educations'], resume_feature)

    def _rate_gpa(self, resume_feature: List[str]):
        """Placeholder: GPA is not rated yet, so this always returns 0."""

        return 0

    def _rate_job_titles(self, resume_feature: List[str]):
        """Rate the resume's job titles by embedding similarity to the job's."""

        return self._calculate_cosine_similarity_matrix_mean(self._job_features['job_titles'], resume_feature, use_threshold=False)

    def _rate_years_experiences(self, resume_feature: List[dict]):
        """Rate the "N years of experience" statements of a resume.

        For every (job statement, resume statement) pair, the context
        similarity and the keyword overlap are combined with
        ``_YEARS_EXPERIENCES_WEIGHTS`` and scaled by the year weight. The
        best resume statement is taken per job statement and the results are
        averaged.

        Args:
            resume_feature: The ``years_experiences`` dictionaries of one resume.
        """

        job_entries = self._job_features['years_experiences']

        job_keywords_context = [' '.join(entry['keywords_context']) for entry in job_entries]
        resume_keywords_context = [' '.join(entry['keywords_context']) for entry in resume_feature]

        if not job_keywords_context or job_keywords_context == ['']:
            # The job states no context, so every resume satisfies it.
            similarity_mtx = np.array([[1]])
        elif not resume_keywords_context or resume_keywords_context == ['']:
            # The job asks for a context that the resume does not provide.
            similarity_mtx = np.array([[0]])
        else:
            _, similarity_mtx = self._calculate_cosine_similarity_matrix_mean(job_keywords_context, resume_keywords_context, return_matrix= True)

        job_year_list = [entry['year'] for entry in job_entries]
        resume_year_list = [entry['year'] for entry in resume_feature]

        year_weight_mtx = self._calculate_year_weight(job_year_list, resume_year_list)

        job_keywords_match = [entry['keywords_match'] for entry in job_entries]
        resume_keywords_match = [entry['keywords_match'] for entry in resume_feature]

        m = len(job_keywords_match)
        n = len(resume_keywords_match)

        if m <= 0:
            matching_mtx = np.array([[1]])
        elif n <= 0:
            matching_mtx = np.array([[0]])
        else:
            matching_mtx = np.zeros((m, n))

            for job_idx, job_match in enumerate(job_keywords_match):

                for resume_idx, resume_match in enumerate(resume_keywords_match):

                    matching_mtx[job_idx, resume_idx] = self._calculate_matching_words_score(job_match, resume_match)

        weighted_similarity_mtx = similarity_mtx * self._YEARS_EXPERIENCES_WEIGHTS['KEYWORDS_CONTEXT']
        weighted_matching_mtx = matching_mtx * self._YEARS_EXPERIENCES_WEIGHTS['KEYWORDS_MATCH']

        # The 1x1 fallback matrices broadcast against the full (m, n) matrices.
        final_mtx = (weighted_similarity_mtx + weighted_matching_mtx) * year_weight_mtx
        top_score = final_mtx.max(axis= 1)

        return float(top_score.mean())

    def _rate_experiences(self, resume_feature: List[str]):
        """Rate the resume's experience sentences by embedding similarity to the job's."""

        return self._calculate_cosine_similarity_matrix_mean(self._job_features['experiences'], resume_feature)

    def _rate_skills(self, resume_feature: List[str]):
        """Rate the resume's skills by keyword overlap with the job's."""

        return self._calculate_matching_words_score(self._job_features['skills'], resume_feature)

    def _rate_soft_skills(self, resume_feature: List[str]):
        """Rate the resume's soft skills by embedding similarity to the job's."""

        return self._calculate_cosine_similarity_matrix_mean(self._job_features['soft_skills'], resume_feature)

    def _rate_languages(self, resume_feature: List[str]):
        """Rate the resume's languages by keyword overlap with the job's."""

        return self._calculate_matching_words_score(self._job_features['languages'], resume_feature)
        

In [84]:
# Rate the resume loaded above against the dummy job description.
rating_model = ResumeRater()

# NOTE(review): fit_transform returns (job_features, resume_features), so
# `resume_text` actually receives the job description's feature dictionary.
resume_text, resume_features = rating_model.fit_transform(
    job_description_text=[constants.DUMMY_JOB_DESCRIPTION],
    resume_text=[extracted_text],
)



[3] [10, 1]
['']
[[1]] [[0.1625 0.    ]]
[[0.5125     0.11666667]] [0.5125] 0.5125


In [85]:
# Module-level sentence-embedding model; the standalone
# `_calculate_cosine_similarity_matrix_mean` helper below calls `model.encode`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [86]:
# Hand-written "years of experience" entries standing in for a resume.
# Each entry has the required years, the keywords to match literally
# (`keywords_match`) and the phrases compared by embedding similarity
# (`keywords_context`).
test_resume_features = [
    {
        'year': 5,
        'keywords_match': ['Java', 'Nosql', 'Databases'],
        'keywords_context': ['Software engineering', 'Front-end web developer']
    },
    {
        'year': 3,
        'keywords_match': ['Typescript', 'Javascript'],
        'keywords_context': ['Web Developer']
    },
    {
        'year': 8,
        'keywords_match': ['Python', 'Machine', 'Learning', 'TensorFlow'],
        'keywords_context': ['Data Scientist']
    },
    {
        'year': 2,
        'keywords_match': ['SQL', 'Data', 'Analysis', 'Excel'],
        'keywords_context': ['Data Analyst']
    },
    {
        'year': 4,
        'keywords_match': ['HTML', 'CSS', 'React'],
        'keywords_context': ['Frontend Developer']
    },
]

# Hand-written "years of experience" entries standing in for a job description;
# same keys as the resume entries above.
test_job_features = [
    {
        'year': 4,
        'keywords_match': ['Java', 'Spring', 'SQL'],
        'keywords_context': ['Backend Developer']
    },
    {
        'year': 2,
        'keywords_match': ['JavaScript', 'React', 'CSS'],
        'keywords_context': ['Frontend Developer']
    },
    {
        'year': 5,
        'keywords_match': ['Python', 'Machine', 'Learning', 'NLP'],
        'keywords_context': ['Machine Learning Engineer']
    },
    {
        'year': 3,
        'keywords_match': ['AWS', 'Docker', 'Kubernetes'],
        'keywords_context': ['DevOps Engineer']
    }
]



def _calculate_cosine_similarity_matrix_mean(job_feature_category: List[str], resume_feature_category: List[str]):
    
    n_job_word = len(job_feature_category)
    
    if n_job_word <= 0:
        return 1
    
    print(f'Job: {job_feature_category}')
    print(f'Resume: {resume_feature_category}')

    
    job_embeddings = model.encode(job_feature_category)
    resume_embeddings  = model.encode(resume_feature_category)
    
    cosine_sim_matrix = util.pytorch_cos_sim(job_embeddings, resume_embeddings).numpy()
    
    top_similarity = cosine_sim_matrix.max(axis= 1) # Because job is in rows
    
    return cosine_sim_matrix, top_similarity , top_similarity.mean()


def _calculate_matching_words_score(job_word_list: List[str], resume_word_list: List[str]):
        
    n_job_word = len(job_word_list)
    
    if n_job_word <= 0:
        return 1 # No extracted word on the category
    
    job_word_list = [word.lower() for word in job_word_list]
    resume_word_list = [word.lower() for word in resume_word_list]
        
    score = 0
    
    for resume_word in resume_word_list:
        print(resume_word, job_word_list)
        if resume_word in job_word_list:
            score += 1

    score /= n_job_word
    
    return score
        

def _calculate_year_weight(job_year_list: List[int], resume_year_list: List[int]):
        
    m = len(job_year_list)
    n = len(resume_year_list)
    
    weight_mtx = np.zeros((m, n))
    
    for job_idx, job_year in enumerate(job_year_list):
        
        for resume_idx, resume_year in enumerate(resume_year_list):
            
            weight = min(resume_year / job_year, 1)
                        
            weight_mtx[job_idx, resume_idx] = weight
    
    
    return weight_mtx

# Scratch check of the years-of-experience rating on the hand-written fixtures.

# One context string per entry: its context phrases joined with spaces.
job_keywords_context = [' '.join(entry['keywords_context']) for entry in test_job_features]
resume_keywords_context = [' '.join(entry['keywords_context']) for entry in test_resume_features]

job_year_list = [entry['year'] for entry in test_job_features]
resume_year_list = [entry['year'] for entry in test_resume_features]

year_weight_mtx = _calculate_year_weight(job_year_list, resume_year_list)

print(_calculate_year_weight(job_year_list, resume_year_list))

similarity_mtx, top_similarity, context_score = _calculate_cosine_similarity_matrix_mean(
    job_keywords_context,
    resume_keywords_context,
)

job_keywords_match = [entry['keywords_match'] for entry in test_job_features]
resume_keywords_match = [entry['keywords_match'] for entry in test_resume_features]

m = len(job_keywords_match)
n = len(resume_keywords_match)

# Keyword overlap for every (job, resume) pair; rows are job entries.
matching_mtx = np.array(
    [
        [_calculate_matching_words_score(job_match, resume_match) for resume_match in resume_keywords_match]
        for job_match in job_keywords_match
    ],
    dtype=float,
).reshape(m, n)

context_weight = constants.YEARS_EXPERIENCES_WEIGHTS['KEYWORDS_CONTEXT']
match_weight = constants.YEARS_EXPERIENCES_WEIGHTS['KEYWORDS_MATCH']

weighted_similarity_mtx = similarity_mtx * context_weight
weighted_matching_mtx = matching_mtx * match_weight

years_experiences_mtx = (weighted_similarity_mtx + weighted_matching_mtx) * year_weight_mtx
print(f'Final matrix: \n{years_experiences_mtx}\n')

# Best resume entry per job entry, then the average over the job entries.
print(f'Max per row:{years_experiences_mtx.max(axis= 1)}')
print(f'Final RESULT:{years_experiences_mtx.max(axis= 1).mean()}')


[[1.         0.75       1.         0.5        1.        ]
 [1.         1.         1.         1.         1.        ]
 [1.         0.6        1.         0.4        0.8       ]
 [1.         1.         1.         0.66666667 1.        ]]
Job: ['Backend Developer', 'Frontend Developer', 'Machine Learning Engineer', 'DevOps Engineer']
Resume: ['Software engineering Front-end web developer', 'Web Developer', 'Data Scientist', 'Data Analyst', 'Frontend Developer']
java ['java', 'spring', 'sql']
nosql ['java', 'spring', 'sql']
databases ['java', 'spring', 'sql']
typescript ['java', 'spring', 'sql']
javascript ['java', 'spring', 'sql']
python ['java', 'spring', 'sql']
machine ['java', 'spring', 'sql']
learning ['java', 'spring', 'sql']
tensorflow ['java', 'spring', 'sql']
sql ['java', 'spring', 'sql']
data ['java', 'spring', 'sql']
analysis ['java', 'spring', 'sql']
excel ['java', 'spring', 'sql']
html ['java', 'spring', 'sql']
css ['java', 'spring', 'sql']
react ['java', 'spring', 'sql']
java ['

In [87]:
# Re-create the rater and rate the resume against the dummy job description.
rating_model = ResumeRater()

# NOTE(review): fit_transform returns (job_features, resume_features), so
# `resume_text` actually receives the job description's feature dictionary.
resume_text, resume_features = rating_model.fit_transform(
    job_description_text=[constants.DUMMY_JOB_DESCRIPTION],
    resume_text=[extracted_text],
)

[3] [10, 1]
['']
[[1]] [[0.1625 0.    ]]
[[0.5125     0.11666667]] [0.5125] 0.5125


In [88]:
# Best-rated resumes, highest rating first (the method's default k of 5 applies).
top_resume = rating_model.top_resume_by_rating()

top_resume

[{'resume_idx': 0,
  'name': [],
  'phone': ['140749'],
  'educations': [],
  'gpa': [],
  'job_titles': [],
  'years_experiences': [{'text': 'Michael Smith BI / Big Data/ Azure Manchester , UK- Email me on Indeed: indeed.com/r/ falicent/140749dace5dc26f 10+ years of Experience in Designing, Development, Administration, Analysis, Management inthe Business Intelligence Da ta warehousing, Client Server Technologies, Web -based Applications, cloud solutions and Databases.',
    'year': 10,
    'keywords_match': ['Databases',
     'Azure',
     'Server',
     'Business intelligence'],
    'keywords_context': []},
   {'text': 'Less than 1 year) ADDITIONAL INFORMATION Professiona l Skills Excellent analytical, problem solving, communication, knowledge transfer and interpersonalskills with ability to interact with individuals at all the levels Quick learner and maintains cordial relationship with project manager and team members and good performer both in team and independent job environments

In [89]:
# Report the overall rating of every resume, followed by its per-category ratings.
for resume_number, rated_resume in enumerate(resume_features, start=1):

    print(f"\nResume {resume_number}'s Rating = {rated_resume['rating']:.2f}, details:\n")

    for category_name, category_rating in rated_resume['rating_details'].items():

        print(f' - {category_name.capitalize()}: {category_rating:.4f}')
    


Resume 1's Rating = 0.37, details:

 - Educations: 0.0000
 - Gpa: 0.0000
 - Job_titles: 0.0000
 - Years_experiences: 0.5125
 - Experiences: 0.3825
 - Skills: 0.2400
 - Soft_skills: 0.5584
 - Languages: 1.0000
