In [6]:
# !pip install textract
# !pip install tika

In [113]:
import glob
import os

import pandas as pd
import textract
from tika import parser

import pickle

In [114]:
path = "./../Data/Resumes/"
save_to_path = "./../Data/Workin_Data/"

for filename in glob.glob(path+"~*"):
    os.remove(filename)

In [162]:
from sklearn.utils import shuffle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
from bs4 import BeautifulSoup
import shutil


class Utils:
    """
    Class containing methods that serve as helper functions
    """
    
    def shuffle_data(self, data_pd):
        """
        Data shuffling
        """
        
        data_columns = data_pd.columns
        data_body = data_pd[data_columns]
        data_body = shuffle(data_body)

        return data_body
    
    def string_to_words(self, query):
        """
        from string of words to list of processed words
        """
        
        nltk.download("stopwords", quiet=True)
        try:
            # add_similar_words_to_search_query(query[-1])
            text = BeautifulSoup(query[-1], "html.parser").get_text()  # Remove HTML tags
            text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Remove non-alphanumeric and Convert to lower case
        except:
            text = ''
        word_list = text.split()  # Split string into words
        word_list = [w for w in word_list if w not in stopwords.words("english")]  # Remove stopwords
        word_list = [PorterStemmer().stem(w) for w in word_list]  # stem

        return [query[0], word_list]
    
    def clean_data(self, data, cache_dir, cache_file="cleaned_data.pkl"):
        """
        Convert each data row to words; read from cache if available.
        input: dataframe with columns key->col1, value->col2
        output: list of lists, e.g [[employee1_id,body1_word_list],[employee2_id,body2_word_list],...]
        """

        data_keys, data_body = data[data.columns[0]].values, data[data.columns[1]].values
        data_train = [[data_keys[i], data_body[i]] for i in range(len(data_body))]

        # If cache_file is not None, try to read from it first
        cache_data = None
        if cache_file is not None:
            try:
                with open(os.path.join(cache_dir, cache_file), "rb") as f:
                    cache_data = pickle.load(f)
                print("Read cleaned data from cache file:", cache_file)
            except:
                pass  # unable to read from cache, but that's okay

        # If cache is missing, then do the heavy lifting
        if cache_data is None:
            # Preprocess the data to obtain words for each employee data
            words_train = list(map(self.string_to_words, data_train))

            # Write to cache file for future runs
            if cache_file is not None:
                cache_data = dict(words_train=words_train)
                with open(os.path.join(cache_dir, cache_file), "wb") as f:
                    pickle.dump(cache_data, f)
                print("Wrote preprocessed data to cache file:", cache_file)
        else:
            # Unpack data loaded from cache file
            words_train = (cache_data['words_train'])

        return words_train
    
    def add_data_to_pickle(self, data_file, data=None, path='./'):

        data_path = path + data_file

        data_file_name = data_file.split('.')[0]
        pickle_file_name = path + 'data_dict.pkl'

        if os.path.isfile(pickle_file_name):
            pickle_file = open(pickle_file_name, 'rb')
        else:
            pickle_file = open(pickle_file_name, 'bw')
            pickle_file.close()

        if os.path.getsize(pickle_file_name) > 0:
            data_collections = pickle.load(pickle_file)
            pickle_file.close()
        else:
            data_collections = {}


        data_collections[data_file_name] = data
        with open(pickle_file_name, 'bw') as f:
            pickle.dump(data_collections, f)
            

In [163]:
class RecruitmentPreprocess:
    """
    Class for preprocessing job offer and resume text queries.
    """

    def __init__(self, resume_path, save_to_path):
        self.resume_path = resume_path
        self.save_to_path = save_to_path
        self.resume_id_index, self.resume_id_data, self.resume_data = self.__merge_resume_to_dataframe()

    def __extract_text_from_resume(self, file_name):
        if file_name.split('.')[-1] == "pdf":
            text = parser.from_file(self.resume_path + file_name)['content']
        else:
            text = textract.process(self.resume_path + file_name).decode()
        return text


    def __merge_resume_to_dataframe(self):

        all_collection = []
        all_files = [file.split('\\')[-1] for file in glob.glob(path + "*") if not file.startswith('~')]
        index = 1
        resume_id_index = {}
        resume_id_data = {}
        
        for file in all_files:
            if file.split('.')[-1] in ['docx', 'pdf', 'doc']:
                resume_id_index[index] = '{}_{}'.format(file.split('.')[:-1],index)
                resume_id_data[index] = self.__extract_text_from_resume(file)
                collection = [index, self.__extract_text_from_resume(file)]
                all_collection.append(collection)
                index += 1

        return resume_id_index, resume_id_data, pd.DataFrame(all_collection, columns=['employee_id', 'data'])

    
    def save_resume_data_to_csv(self):
        
        self.resume_data.to_csv(self.save_to_path + 'resume_data.csv', index=False)
        
    def add_resume_keys_to_pickle(self):
        
        utils = Utils()
        utils.add_data_to_pickle('resume_id_index', self.resume_id_index, self.save_to_path)
        
    def add_user_accessible_resume_to_pickle(self):
        
        utils = Utils()
        utils.add_data_to_pickle('user_accessible_resume', self.resume_id_data, self.save_to_path)
        
    def add_processed_resume_to_pickle(self):
        
        cache_directory = os.path.join("cache", "words_tokens")  # where to store cache files
        os.makedirs(cache_directory, exist_ok=True)  # ensure cache directory exists

        cache_file = 'cleaned_{}.pkl'.format('processed_resume')
        
        utils = Utils()
    
        data_shuffled = utils.shuffle_data(self.resume_data)
        data_processed_with_id = utils.clean_data(data_shuffled, cache_directory, cache_file=cache_file)
        
        shutil.rmtree('cache')
        
        utils.add_data_to_pickle('processed_resume', data_processed_with_id, self.save_to_path)
        

In [164]:
resume = RecruitmentPreprocess(path, save_to_path)
# resume.save_resume_data_to_csv()
# resume.add_resume_keys_to_pickle()
# resume.add_user_accessible_resume_to_pickle()
resume.add_processed_resume_to_pickle()

Read cleaned data from cache file: cleaned_processed_resume.pkl


In [166]:
cv_id_index.keys()

dict_keys(['resume_id_index', 'user_accessible_resume', 'processed_resume'])

In [165]:
with open(save_to_path+'data_dict.pkl', 'rb') as f:
    cv_id_index = pickle.load(f)

In [170]:
print(cv_id_index['resume_id_index'][174])
print(cv_id_index['user_accessible_resume'][174])

['Shashank']
2                                                                                                                                                                                         Shashank Tiwari



		

		SHASHANK TIWARI

Shashank.tiwari44@gmail.com                                                                                            (650) 600-1785 Jersey City, NJ (07306)



SUMMARY:

		Over seven years of experience as Business Analyst/Scrum Master with solid understanding of Business Requirement gathering, Business Process Modeling and database/data warehouse experience

		Extensive experience of communicating with Subject Matter Experts (SME’s), performing requirement gathering, business analysis, data analysis, and documentation

		Expert in facilitating Agile ceremonies, coached team in JIRA, Service Now including concentrated efforts with product owner on user story/epic/feature optimization

		Proficient in Agile engineering process such as Test Driven De

In [167]:
print(cv_id_index['processed_resume'][1])

[174, ['2', 'shashank', 'tiwari', 'shashank', 'tiwari', 'shashank', 'tiwari44', 'gmail', 'com', '650', '600', '1785', 'jersey', 'citi', 'nj', '07306', 'summari', 'seven', 'year', 'experi', 'busi', 'analyst', 'scrum', 'master', 'solid', 'understand', 'busi', 'requir', 'gather', 'busi', 'process', 'model', 'databas', 'data', 'warehous', 'experi', 'extens', 'experi', 'commun', 'subject', 'matter', 'expert', 'sme', 'perform', 'requir', 'gather', 'busi', 'analysi', 'data', 'analysi', 'document', 'expert', 'facilit', 'agil', 'ceremoni', 'coach', 'team', 'jira', 'servic', 'includ', 'concentr', 'effort', 'product', 'owner', 'user', 'stori', 'epic', 'featur', 'optim', 'profici', 'agil', 'engin', 'process', 'test', 'driven', 'develop', 'tdd', 'behavior', 'driven', 'develop', 'bdd', 'continu', 'integr', 'orchestr', 'variou', 'busi', 'analysi', 'activ', 'gap', 'roi', 'risk', 'swot', 'cost', 'impact', 'analysi', 'excel', 'busi', 'write', 'skill', 'write', 'busi', 'requir', 'document', 'use', 'case'

In [85]:
[4][-2]

IndexError: list index out of range