In [1]:
import os
import re
import glob
import textract
import pandas as pd

import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load('en_core_web_sm')

In [2]:
#Get all the file names of resume's to be processed
RESUME_PATH = 'sample_resume/'
raw_list = glob.glob(RESUME_PATH+"*.*")
resume_list = [resume_path for resume_path in raw_list if resume_path[-4:] in ('.doc','.pdf', 'docx')]

In [3]:
#extract text from all the files
db_entries = []

for sno, path in enumerate(resume_list):
    try:
        data = [textract.process(path).decode("utf-8"), path.replace(RESUME_PATH,'')]
        db_entries.append(data)
    except:
        pass

df = pd.DataFrame(db_entries, columns=['resume_text', 'resume_path'])
df.head()

Unnamed: 0,resume_text,resume_path
0,\n\nSample 1\n\n\nEmail : sample1@gmail.com...,sample1.doc
1,Sample 2 Machine Learning Engineer\n\n\t...,sample2.docx


In [4]:
#creating skills of interest for extracting from resume's if available
skills = ['python', 'sql', 'sql', 'mysql', '.bigquery', 'mssql']

In [5]:
#Define the spacy phrase matcher
matcher = PhraseMatcher(nlp.vocab)

pattern = [nlp.make_doc(text) for text in skills]

matcher.add("skills", None, *pattern)

In [6]:
#simple function for preprocessing and keyword extraction
def clean_text(raw_text):
    '''clean text to remove unwanted 
    characters and convert to lowercase'''
    clean_text = ' '.join(re.findall(r'[a-z0-9\']+', raw_text.lower()))
    return clean_text

def get_keywords(text):
    '''exctract keywords from text 
    based on spacy rule matcher
    Args: param: text as string
    Returns: list of kerwords'''
    
    doc = nlp(text)
    matches = matcher(doc)
    match_lst = []
    
    for match_id, start, end in matches:
        span = doc[start:end]
        match_lst.append(span.text)
    return ', '.join(list(set(match_lst)))

In [7]:
df['clean_text'] = df['resume_text'].apply(clean_text)
df['skills'] = df['clean_text'].apply(get_keywords)
df.head()

Unnamed: 0,resume_text,resume_path,clean_text,skills
0,\n\nSample 1\n\n\nEmail : sample1@gmail.com...,sample1.doc,sample 1 email sample1 gmail com 1 mobile 91 1...,"mysql, python"
1,Sample 2 Machine Learning Engineer\n\n\t...,sample2.docx,sample 2 machine learning engineer mobile 9198...,"sql, python"


In [8]:
df.to_csv('resume_keywords.csv')