In [None]:
# standard library
import os
import re

# third-party
import numpy as np
import pandas as pd
import spacy
import en_core_web_sm

# spaCy English pipeline object used by the cells below
nlp = en_core_web_sm.load()

# pre-compiled pattern that extracts runs of word characters from a line
re_c = re.compile(r'\w+')

In [None]:
# Debug switch: while True, the cells below print intermediate results
# (lemmatized keyword lists, helper-function demos, per-file section dumps).
# NOTE: the batch-processing cell sets this back to False before its loop.
flag_print = True

# Switch intended to clear existing data.
# NOTE(review): not referenced by any code in the cells shown here -- confirm it is still needed.
flag_clear = True

# Threshold for determining the section: a section's best keyword similarity
# must be strictly greater than this value for a line to switch to that section.
threshold = 0.5

In [None]:
# to get extract sections from the resume -- add or remove from  'similar_to' accordingly
similar_to = {
    'edu' : ['education', 'study', 'academics', 'institute', 'school', 'college'],
    'exp' : ['job', 'internship', 'training', 'research', 'carrer', 'profession', 'role'
             'project', 'responsibility', 'description', 'work experience', 'workshop', 'conference'],
    'skill' : ['skill', 'languages', 'technology', 'framework', 'tools', 'database'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 
               'personal', 'award', 'objective', 'miscellaneous', 'interest']
}

list_of_sections = similar_to.keys()

# to bring similar_words to their normal forms
for section in list_of_sections:
    new_list = []
    
    for word in similar_to[section]:
        docx = nlp(unicode(word))
        new_list.append(docx[0].lemma_)
        
    if flag_print:
        print section, new_list
        
    similar_to[section] = new_list

skill [u'skill', u'language', u'technology', u'framework', u'tool', u'database']
extra [u'introduction', u'intro', u'achievement', u'hobby', u'link', u'additional', u'personal', u'award', u'objective', u'miscellaneous', u'interest']
exp [u'job', u'internship', u'training', u'research', u'carrer', u'profession', u'roleproject', u'responsibility', u'description', u'work', u'workshop', u'conference']
edu [u'education', u'study', u'academic', u'institute', u'school', u'college']


In [None]:
# function to return a word in a uniform (symbol-free, lower-case, lemmatized) form
def modify(word):
    """Return a normalized form of `word`, or None if nothing useful remains.

    Every character listed in `symbols` is dropped and the remaining
    characters are lower-cased.  The cleaned text is passed through the spaCy
    pipeline `nlp` and the lemma of its first token is returned.

    Returns None when:
      * the cleaned word is empty,
      * the first token is flagged as a stop word, or
      * any error is raised while processing (best-effort handling of odd
        input such as characters like 'x02').
    """
    # raw string: the backslash is one of the characters to strip, not an escape
    symbols = r'''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''

    try:
        mod_word = ''.join(char.lower() for char in word if char not in symbols)

        # nothing left after stripping symbols -- no need to run the pipeline
        if len(mod_word) == 0:
            return None

        first_token = nlp(unicode(mod_word))[0]
        if first_token.is_stop:
            return None
        return first_token.lemma_
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate while bad input is skipped silently
        return None
    
if flag_print:
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    
    for word in test_words:
        print word, '--returned-->', modify(word)

Hello!! --returned--> hello
.,<> --returned--> None
India --returned--> india
of --returned--> None
..freedoM.. --returned--> freedom
e-mail --returned--> email


In [None]:
# utility function: flags a line for skipping when it holds no alphabetic character
def is_bad(line):
    """Return True if `line` contains no alphabetic character, otherwise False."""
    return not any(ch.isalpha() for ch in line)
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']
    
    for word in test_words:
        print word, '--returned-->', is_bad(word)  

. --returned--> True
<.> --returned--> True
Speak --returned--> False
out --returned--> False
"Eric" --returned--> False
freemail... --returned--> False


In [None]:
%%time
dict_of_data_series = {}
flag_print = False

for file_name in os.listdir(os.getcwd()+'/data/CVs'):
    if flag_print:
        print '\n'
        print '*'*25
        print file_name
        print '*'*25
        
    main_file_handler = open('data/CVs/'+file_name, 'r')    
    previous_section  = 'extra'
    
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
                   
    for line in main_file_handler:
        # skip line if empty
        if (len(line.strip()) == 0 or is_bad(line)):
            continue
                
        # processing next line
        list_of_words_in_line = re_c.findall(line)
        list_of_imp_words_in_line  = []
        
        for i in range(len(list_of_words_in_line)):
            modified_word = modify(list_of_words_in_line[i])
            
            if (modified_word):
                list_of_imp_words_in_line.append(modified_word)

        curr_line = ' '.join(list_of_imp_words_in_line)
        doc = nlp(unicode(curr_line))
        section_value = {}
            
        # initializing section values to zero
        for section in list_of_sections:
            section_value[section] = 0.0
        section_value[None] = 0.0
            
        # updating section values    
        for token in doc:
            for section in list_of_sections:
                for word in similar_to[section]:
                    word_token = doc.vocab[unicode(word)]
                    section_value[section] = max(section_value[section], float(word_token.similarity(token)))

        # determining the next section based on section values and threshold
        most_likely_section = None
        for section in list_of_sections:
            #print '>>', section, section_value[section]
            if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                most_likely_section = section
            
        # updating the section
        if (previous_section != most_likely_section and most_likely_section is not None):
            previous_section = most_likely_section
                

        # writing data to the pandas series
        try:
            docx = nlp(unicode(line))
        except:
            continue  # to handle the odd case of characters like 'x02', etc.
        mod_line = ''
        for token in docx:
            if (not token.is_stop):
                mod_line += token.lemma_ + ' '
        
        curr_data_series[previous_section] += mod_line
            
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print curr_data_series
    main_file_handler.close()
    
data_frame = pd.DataFrame(dict_of_data_series)
data_frame.to_csv('prc_data.csv', sep='\t')
#data_frame.head()

CPU times: user 56.9 s, sys: 220 ms, total: 57.1 s
Wall time: 1min


In [None]:
# preview the section table: one row per section, one column per CV file
data_frame.head()

Unnamed: 0,cv1.txt,cv10,cv100,cv101,cv102,cv103,cv104,cv105,cv106,cv107,...,cv90,cv91,cv92,cv93,cv94,cv95,cv96,cv97,cv98,cv99
skill,software developer - dynamix infotech \n kolka...,team member(software developer ) \n techsys so...,ios application developer \n software develope...,"application developer \n tiruppur , tamil nadu...",application developer - cognizant technology s...,"android application developer \n chennai , tam...",application developer - bny mellon \n coimbato...,"application developer \n bhopal , madhya prade...",pursue growth experience field information tec...,"application developer - at&t \n bangalore , ka...",...,"connectivity data systems , llc - chennai , ta...","skill \n microsoft excel , microsoft ppt , mic...",nielsen company \n april 2015 present \n - sam...,"tools - nlp , elasticsearch , python , alchemy...","cappius technologies - hyderabad , telangana \...",- extracting datum sql server 2014 \n - requi...,android application developer \n adler solutio...,android application developer \n interglobe te...,jr. ios application developer ( application de...,contribute organization new trend technology \...
extra,mahboob alam \n career objective \n personal q...,"deepali chaudhari \n mumbai , maharashtra \n","ramakrishnan k \n cuddalore , tamil nadu \n mc...",jayaprakash andamuthu \n adith business pvt . ...,rajeev gupta \n john hancock investment projec...,alaguraj ramachandran \n junior android applic...,aiswarya chandrasekaran \n new system introduc...,ashay jain \n kpn netherlands telecom company ...,"heena patel \n surat , gujarat \n contribute o...",deba hazra \n stand - order functionality nggn...,...,karthikeyan p \n additional information \n,"bishvajit bakshi \n bangalore , karnataka \n a...",darpana nandy \n project - linked - person \n ...,"harish venkataraman \n bangalore , karnataka \...",sharad kakran \n link \n https://github.com/sh...,"ketan bhatheja \n bangalore , karnataka \n - u...","rahul vhayaskar \n dhule , maharashtra \n goog...",babli bisht \n additional information \n appli...,reshma patil \n link \n https://itunes.apple.c...,rajith r \n web application developer \n chenn...
exp,want work progressive organization utilize kno...,work experience \n,"hard worker , enthusiastic , responsible , fas...","include design , activity working \n work expe...",* currently work cognizant technology solution...,work experience \n b.e computer science engg \...,work experience \n project details \n 1 ) bny ...,work experience \n project location groningen ...,good company . \n work experience \n,work experience \n role : application develope...,...,senior research analyst relationship science (...,"work experience \n data analyst \n college , r...","executive , data science - nielsen company \n ...",sufficiently expend skillset organization lear...,work experience \n data science intern \n c+ (...,"work experience \n data science & analytics , ...",work experience \n responsibility \n understan...,android application developer have 3.5 year ex...,work experience \n,"dedicated work , innovative idea dynamic chall..."
edu,education \n sidho kanhu murmu university dumk...,"education \n nagpur university nagpur , mahara...",education \n \n dhanalakshmi srinivasan colleg...,education \n kongu arts science college erode ...,senior solution integrator \n april 2016 augus...,education \n sir srnm polytechnic college satt...,"education \n anna university chennai , tamil n...",,,education \n wbut university \n,...,data analysis : \n education \n sathyabama uni...,education \n m.sc . statistics \n university a...,"lead "" evidence theory base uncertainty analys...",education \n dayananda sagar college engineeri...,analyze customer transaction behavior shopping...,"- data manipulation , exploratory analysis , t...",education \n north maharashtra university jalg...,education \n mtech electronics ( instrumentati...,education \n b.e \n indira college engineering...,"kodaikanal christian college - kodaikanal , ta..."
