# Resume Ranking

___ 
We try to rank resume profiles based on content similarity. The BM25 family of ranking models were considered

### Install dependencies using pip

In [1]:
# !pip install textract
# !pip install tika

### Import packages

In [3]:
import glob
import os

import pandas as pd
import textract
from tika import parser

import pickle

### Paths to inputs and output files

In [4]:
path = "./../Data/Resumes/"
save_to_path = "./../Data/Workin_Data/"

for filename in glob.glob(path+"~*"):
    os.remove(filename)

### Utility class that serves as helper class containing functions regulary used

In [4]:
from sklearn.utils import shuffle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
from bs4 import BeautifulSoup
import shutil


class Utils:
    """
    Class containing methods that serve as helper functions
    """
    
    def shuffle_data(self, data_pd):
        """
        Data shuffling
        """
        
        data_columns = data_pd.columns
        data_body = data_pd[data_columns]
        data_body = shuffle(data_body)

        return data_body
    
    def string_to_words(self, query):
        """
        from string of words to list of processed words
        """
        
        nltk.download("stopwords", quiet=True)
        try:
            # add_similar_words_to_search_query(query[-1])
            text = BeautifulSoup(query[-1], "html.parser").get_text()  # Remove HTML tags
            text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Remove non-alphanumeric and Convert to lower case
        except:
            text = ''
        word_list = text.split()  # Split string into words
        word_list = [w for w in word_list if w not in stopwords.words("english")]  # Remove stopwords
        word_list = [PorterStemmer().stem(w) for w in word_list]  # stem

        return [query[0], word_list]
    
    def clean_data(self, data, cache_dir, cache_file="cleaned_data.pkl"):
        """
        Convert each data row to words; read from cache if available.
        input: dataframe with columns key->col1, value->col2
        output: list of lists, e.g [[employee1_id,body1_word_list],[employee2_id,body2_word_list],...]
        """

        data_keys, data_body = data[data.columns[0]].values, data[data.columns[1]].values
        data_train = [[data_keys[i], data_body[i]] for i in range(len(data_body))]

        # If cache_file is not None, try to read from it first
        cache_data = None
        if cache_file is not None:
            try:
                with open(os.path.join(cache_dir, cache_file), "rb") as f:
                    cache_data = pickle.load(f)
                print("Read cleaned data from cache file:", cache_file)
            except:
                pass  # unable to read from cache, but that's okay

        # If cache is missing, then do the heavy lifting
        if cache_data is None:
            # Preprocess the data to obtain words for each employee data
            words_train = list(map(self.string_to_words, data_train))

            # Write to cache file for future runs
            if cache_file is not None:
                cache_data = dict(words_train=words_train)
                with open(os.path.join(cache_dir, cache_file), "wb") as f:
                    pickle.dump(cache_data, f)
                print("Wrote preprocessed data to cache file:", cache_file)
        else:
            # Unpack data loaded from cache file
            words_train = (cache_data['words_train'])

        return words_train
    
    def add_data_to_pickle(self, data_file, data=None, path='./'):

        data_path = path + data_file

        data_file_name = data_file.split('.')[0]
        pickle_file_name = path + 'data_dict.pkl'

        if os.path.isfile(pickle_file_name):
            pickle_file = open(pickle_file_name, 'rb')
        else:
            pickle_file = open(pickle_file_name, 'bw')
            pickle_file.close()

        if os.path.getsize(pickle_file_name) > 0:
            data_collections = pickle.load(pickle_file)
            pickle_file.close()
        else:
            data_collections = {}


        data_collections[data_file_name] = data
        with open(pickle_file_name, 'bw') as f:
            pickle.dump(data_collections, f)
            
    def swap_key_value(self, index_id):
        return {emm_id: index for index, emm_id in index_id.items()}
    
    def resume_index_id_data(self, data_processed_with_id):
        count, index_id, data = 0, {}, []
        for item in data_processed_with_id:
            index_id[count] = item[0]
            data.append(item[1])
            count += 1
        return index_id, data
            

### Class containing functions that helps to extract data from the document resumes

In [5]:
class RecruitmentPreprocess:
    """
    Class for preprocessing job offer and resume text queries.
    """

    def __init__(self, resume_path, save_to_path=None):
        
        self.resume_path = resume_path
        
        if save_to_path != None:
            self.save_to_path = save_to_path
            self.resume_id_index, self.resume_id_data, self.resume_data = self.__merge_resume_to_dataframe()
        

    def extract_text_from_resume(self, file_name):
        if file_name.split('.')[-1] == "pdf":
            text = parser.from_file(self.resume_path + file_name)['content']
        else:
            text = textract.process(self.resume_path + file_name).decode()
        return text


    def __merge_resume_to_dataframe(self):

        all_collection = []
        all_files = [file.split('\\')[-1] for file in glob.glob(path + "*") if not file.startswith('~')]
        index = 1
        resume_id_index = {}
        resume_id_data = {}
        
        for file in all_files:
            if file.split('.')[-1] in ['docx', 'pdf', 'doc']:
                resume_id_index[index] = '{}_{}'.format(''.join(file.split('.')[:-1]),index)
                resume_id_data[index] = self.extract_text_from_resume(file)
                collection = [index, self.extract_text_from_resume(file)]
                all_collection.append(collection)
                index += 1

        return resume_id_index, resume_id_data, pd.DataFrame(all_collection, columns=['employee_id', 'data'])
    
    def save_resume_data_to_csv(self):
        
        self.resume_data.to_csv(self.save_to_path + 'resume_data.csv', index=False)
        
    def add_resume_keys_to_pickle(self):
        
        utils = Utils()
        utils.add_data_to_pickle('resume_id_index', self.resume_id_index, self.save_to_path)
        
    def add_user_accessible_resume_to_pickle(self):
        
        utils = Utils()
        utils.add_data_to_pickle('user_accessible_resume', self.resume_id_data, self.save_to_path)
        
    def add_processed_resume_to_pickle(self):
        
        cache_directory = os.path.join("cache", "words_tokens")  # where to store cache files
        os.makedirs(cache_directory, exist_ok=True)  # ensure cache directory exists

        cache_file = 'cleaned_{}.pkl'.format('processed_resume')
        
        utils = Utils()
    
        data_shuffled = utils.shuffle_data(self.resume_data)
        data_processed_with_id = utils.clean_data(data_shuffled, cache_directory, cache_file=cache_file)
        
        shutil.rmtree('cache')
        
        utils.add_data_to_pickle('processed_resume', data_processed_with_id, self.save_to_path)
        

### Extracting the resume files and storing them in a pickle file for faster access

In [6]:
resume = RecruitmentPreprocess(path, save_to_path)
resume.save_resume_data_to_csv()
resume.add_resume_keys_to_pickle()
resume.add_user_accessible_resume_to_pickle()
resume.add_processed_resume_to_pickle()

Wrote preprocessed data to cache file: cleaned_processed_resume.pkl


### Confirming that the files were extracted successfully

In [12]:
with open(save_to_path+'data_dict.pkl', 'rb') as f:
    cv_id_index = pickle.load(f)

print(cv_id_index.keys())
print(cv_id_index['resume_data'])
# print(cv_id_index['user_accessible_resume'][144])

dict_keys(['resume_id_index', 'user_accessible_resume', 'processed_resume', 'resume_count', 'resume_data'])
     employee_id                                               data
0              1  Name: Abiral Pandey\n\nEmail: abiral.pandey88@...
1              2  Achyuth\n\n540-999-8048\n\nachyuth.java88@gmai...
2              3  Adelina Erimia, PMP, Six Sigma Green Belt, SMC...
3              4  Adhi Gopalam\n\nadhigopalam@gmail.com\n\n281-2...
4              5  Ajay Kumar (CSM)\t     \t\t     Email/Skype: a...
..           ...                                                ...
219          220  VISHNU J\n\nEmail to jammigumpulavishnu452@gma...
220          221  Cell: 972-514-3667\n\n\n\n\n\n\n\n\n\nVivek Jo...
221          222  VIVEK SAGAR                                   ...
222          223  YOHAN \n\nSr. Business Analyst\n\n\n\nVersatil...
223          224  Yugesh\n\n+1(515)-650-2459\n\nyugeshm4@gmail.c...

[224 rows x 2 columns]


### The family of BM25 Models

In [8]:
import math
import pickle

import numpy as np
from multiprocessing import Pool, cpu_count
import os

class BM25:
    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = len(corpus)
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        nd = {}  # word -> number of documents with word
        num_doc = 0
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.doc_freqs.append(frequencies)

            for word, freq in frequencies.items():
                try:
                    nd[word] += 1
                except KeyError:
                    nd[word] = 1

        self.avgdl = num_doc / self.corpus_size
        return nd

    def _tokenize_corpus(self, corpus):
        pool = Pool(cpu_count())
        tokenized_corpus = pool.map(self.tokenizer, corpus)
        return tokenized_corpus

    def _calc_idf(self, nd):
        raise NotImplementedError()

    def get_scores(self, query):
        raise NotImplementedError()

    def get_batch_scores(self, query, doc_ids):
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):

        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(query)
        top_n = np.argsort(scores)[::-1][:n]
        return [documents[i] for i in top_n]


class BM25Okapi(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
        # k1=1.5, b=0.75, epsilon=0.25 -> k1=0 to 3, b -> 0 to 1
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """
        Calculates frequencies of terms in documents and in corpus.
        This algorithm sets a floor on the idf values to eps * average_idf
        """
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = idf_sum / len(self.idf)

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

    def get_scores(self, query):
        """
        The ATIRE BM25 variant uses an idf function which uses a log(idf) score. To prevent negative idf scores,
        this algorithm also adds a floor to the idf value of epsilon.
        See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info
        :param query:
        :return:
        """
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            score += (self.idf.get(q) or 0) * (q_freq * (self.k1 + 1) /
                                               (q_freq + self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)))
        return score.tolist()


class BM25L(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log(self.corpus_size + 1) - math.log(freq + 0.5)
            self.idf[word] = idf

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
            score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \
                     (self.k1 + ctd + self.delta)
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
            score += (self.idf.get(q) or 0) * q_freq * (self.k1 + 1) * (ctd + self.delta) / \
                     (self.k1 + ctd + self.delta)
        return score.tolist()


class BM25Plus(BM25):
    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log((self.corpus_size + 1) / freq)
            self.idf[word] = idf

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        score = np.zeros(len(doc_ids))
        doc_len = np.array(self.doc_len)[doc_ids]
        for q in query:
            q_freq = np.array([(self.doc_freqs[di].get(q) or 0) for di in doc_ids])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score.tolist()


# BM25Adpt and BM25T are a bit more complicated than the previous algorithms here. Here a term-specific k1
# parameter is calculated before scoring is done

class BM25Adpt(BM25):
    def __init__(self, corpus, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log((self.corpus_size + 1) / freq)
            self.idf[word] = idf

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score



class BM25T(BM25):
    def __init__(self, corpus, k1=1.5, b=0.75, delta=1):
        # Algorithm specific parameters
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus)

    def _calc_idf(self, nd):
        for word, freq in nd.items():
            idf = math.log((self.corpus_size + 1) / freq)
            self.idf[word] = idf

    def get_scores(self, query):
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in self.doc_freqs])
            score += (self.idf.get(q) or 0) * (self.delta + (q_freq * (self.k1 + 1)) /
                                               (self.k1 * (1 - self.b + self.b * doc_len / self.avgdl) + q_freq))
        return score


### Class that helps to get the ranking scores from the models

In [9]:
from operator import itemgetter

class ModelScores:
    """
    Returns Model Scores for single query, single resume and for all resumes 
    """
    
    def __init__(self):
        
        with open('./../Data/Workin_Data/'+'data_dict.pkl', 'rb') as f:
            self.pickle_data = pickle.load(f)
            
        self.utils = Utils()
    
    def get_resume_id_ranking_scores(self, resume_id, model):
        
        user_accessible_resume = self.pickle_data['user_accessible_resume']
        processed_resume = self.pickle_data['processed_resume']

        data_index_resume_id, data_processed = self.utils.resume_index_id_data(processed_resume)

        # Remove the resume from the list of resumes to be compared to
        data_processed_copy = data_processed[:]
        index = self.utils.swap_key_value(data_index_resume_id)[resume_id]
        del data_processed_copy[index]

        raw_query = user_accessible_resume[resume_id]

        if model == 'BM25':
            chosen_model = BM25(data_processed_copy)
        elif model == 'BM25Okapi':
            chosen_model = BM25Okapi(data_processed_copy)
        elif model == 'BM25L':
            chosen_model = BM25L(data_processed_copy)
        elif model == 'BM25Adpt':
            chosen_model = BM25Adpt(data_processed_copy)
        elif model == 'BM25T':
            chosen_model = BM25T(data_processed_copy)
        else:
            chosen_model = BM25Plus(data_processed_copy)

        scores = list(chosen_model.get_scores(self.utils.string_to_words(['', raw_query])[-1]))

        scores.insert(index, float('inf'))
        indices, id_sorted = zip(*sorted(enumerate(scores), reverse=True, key=itemgetter(1)))

        return [[data_index_resume_id[indices[i]], id_sorted[i]] for i in range(0, len(indices))][1:]
    

    def single_query_scores(self, query, model):
        
        processed_resume = self.pickle_data['processed_resume']
        data_index_employee_id, data_processed = self.utils.resume_index_id_data(processed_resume)
        
        if model == 'BM25':
            chosen_model = BM25(data_processed)
        elif model == 'BM25Okapi':
            chosen_model = BM25Okapi(data_processed)
        elif model == 'BM25L':
            chosen_model = BM25L(data_processed)
        elif model == 'BM25Adpt':
            chosen_model = BM25Adpt(data_processed)
        elif model == 'BM25T':
            chosen_model = BM25T(data_processed)
        else:
            chosen_model = BM25Plus(data_processed)
            
        scores = list(chosen_model.get_scores(self.utils.string_to_words(['', query])[-1]))
        indices, id_sorted = zip(*sorted(enumerate(scores), reverse=True, key=itemgetter(1)))
        
        return [[data_index_employee_id[indices[i]], id_sorted[i]] for i in range(len(indices))]
    
    def single_resume_scores(self, path, file_name, model):
        
        recruitment_preprocess = RecruitmentPreprocess(path)
        query = recruitment_preprocess.extract_text_from_resume(file_name)
        
        return self.single_query_scores(file_name, model)
    

### Class that helps to extract the resume names from the ranked ids

In [10]:
import pickle

class RankResult:
    
    def __init__(self, no_of_output, model):
        
        with open(save_to_path+'data_dict.pkl', 'rb') as f:
            pickle_data = pickle.load(f)
        self.user_accessible_resume = pickle_data['resume_id_index']
        self.no_of_output = no_of_output
        self.model_scores = ModelScores()
        self.model = model
        
    def get_ranking_with_resume_id(self, resume_id):
        
        scores = self.model_scores.get_resume_id_ranking_scores(145, self.model)
        ranked_Resume_names = []
        
        for i in range(self.no_of_output):
            if i >= len(scores):
                break
            ranked_Resume_names.append(self.user_accessible_resume[scores[i][0]])
        
        return ranked_Resume_names
    
    def get_ranking_with_query(self, query):
        
        scores = self.model_scores.single_query_scores(query, self.model)
        ranked_Resume_names = []
        
        for i in range(self.no_of_output):
            if i >= len(scores):
                break
            ranked_Resume_names.append(self.user_accessible_resume[scores[i][0]])
        
        return ranked_Resume_names
    
    def get_ranking_with_resume_filename(self, path, file_name):
        
        scores = self.model_scores.single_resume_scores(path, file_name, self.model)
        ranked_Resume_names = []
        
        for i in range(self.no_of_output):
            if i >= len(scores):
                break
            ranked_Resume_names.append(self.user_accessible_resume[scores[i][0]])
        
        return ranked_Resume_names
        

### Example usage

1. Get the resume ranking given resume id
2. Get the resume ranking given raw query
3. Get the resume ranking given file path and file name

In [11]:
ranked_result = RankResult(5, 'BM25Okapi')

# Resume ranking given resume id
print(ranked_result.get_ranking_with_resume_id(145),'\n')

# Resume ranking given raw query
query = "9+ years of experience in the field of business and data analysis supporting software solutions and analyzing business operations on various domains such as Banking,  Finance and Insurance. Worked in various software development environments including waterfall and agile methodologies including Scrum, Kanban, XP and SAFe. Implemented multiple projects in SOA architecture and dealt with APIs, SOAP and RESTful Web Services. Worked in the capacity of a certified Scrum Master by facilitating all scrum ceremonies, resolving Impediments and dependency issues. Proficient in data analytics – SQL querying, Ad Hoc / Canned report generation using tools like IBM Cognos BI and Tableau."
print(ranked_result.get_ranking_with_query(query),'\n')

# Resume ranking given file_path and file_name
file_name = 'Shail_Tank-Business Analyst.docx'
path = "./../Data/Resumes/"
print(ranked_result.get_ranking_with_resume_filename(path, file_name))

['SUNITHA Project Manager (1)_191', 'Srivatsan_Project_Manager_187', 'Sahas BA Resume_154', 'AjayKumar_5', 'Syed_Zia_Ashraf_192'] 

['Shaker Resume_172', 'B Shaker-Sr BSA-Scrum Master _21', 'Krishna_BSA_72', 'Tarun RESUME-BSAT_194', 'Akhilprofile_6'] 

['Shail_Tank-Business Analyst_171', 'Bharatha BA Resume_31', 'BA - Abhishek_23', 'BA Kiran_25', 'Robinson_151']
