In [1]:
import spacy, os
import pandas as pd
import numpy as np
nlp = spacy.load('en')

In [2]:
# switch for debug
flag_print = True

# switch to clear existing data
flag_clear = True

#threshold value for determining section
threshold = 0.5

In [3]:
# to get extract sections from the resume -- add or remove from  'similar_to' accordingly
similar_to = {
    'edu' : ['education', 'study', 'academics'],
    'exp' : ['job', 'internship', 'training', 'research', 'carrer', 'profession', 
             'project', 'responsibility', 'description'],
    'skill' : ['skill', 'languages', 'technology', 'frameworks', 'tools'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 'personal']
}

list_of_sections = similar_to.keys()

# to bring similar_words to their normal forms
for section in list_of_sections:
    new_list = []
    
    for word in similar_to[section]:
        docx = nlp(unicode(word))
        new_list.append(docx[0].lemma_)
        
    if flag_print:
        print section, new_list
        
    similar_to[section] = new_list

skill [u'skill', u'language', u'technology', u'framework', u'tool']
extra [u'introduction', u'intro', u'achievement', u'hobby', u'link', u'additional', u'personal']
exp [u'job', u'internship', u'training', u'research', u'carrer', u'profession', u'project', u'responsibility', u'description']
edu [u'education', u'study', u'academic']


In [4]:
# function to return the words in a uniform 
def modify(word):
    try:
        symbols = '''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''
        mod_word = ''
        
        for char in word:
            if (char not in symbols):
                mod_word += char.lower()

        docx = nlp(unicode(mod_word))

        if (len(mod_word) == 0 or docx[0].is_stop):
            return None
        else:
            return docx[0].lemma_
    except:
        return None # to handle the odd case of characters like 'x02', etc.
    
if flag_print:
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    
    for word in test_words:
        print word, '--returned-->', modify(word)

Hello!! --returned--> hello
.,<> --returned--> None
India --returned--> india
of --returned--> None
..freedoM.. --returned--> freedom
e-mail --returned--> email


In [5]:
# utility function to skip line when no alphabet present
def is_bad(line):
    for c in line:
        if (c.isalpha()):
            return False
    return True
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']
    
    for word in test_words:
        print word, '--returned-->', is_bad(word)  

. --returned--> True
<.> --returned--> True
Speak --returned--> False
out --returned--> False
"Eric" --returned--> False
freemail... --returned--> False


In [23]:
dict_of_data_series = {}
flag_print = False

for file_name in os.listdir(os.getcwd()+'/raw_data'):
    if flag_print:
        print '\n'
        print '*'*25
        print file_name
        print '*'*25
        
    main_file_handler = open('raw_data/'+file_name, 'r')    
    previous_section  = 'extra'
    
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
                   
    for line in main_file_handler:
        # skip line if empty
        if (len(line.strip()) == 0 or is_bad(line)):
            continue
                
        # processing next line
        list_of_words_in_line = line.split()
        list_of_imp_words_in_line  = []
        
        for i in range(len(list_of_words_in_line)):
            modified_word = modify(list_of_words_in_line[i])
            
            if (modified_word):
                list_of_imp_words_in_line.append(modified_word)

        curr_line = ' '.join(list_of_imp_words_in_line)
        doc = nlp(unicode(curr_line))
        section_value = {}
            
        # initializing section values to zero
        for section in list_of_sections:
            section_value[section] = 0.0
        section_value[None] = 0.0
            
        # updating section values    
        for token in doc:
            for section in list_of_sections:
                for word in similar_to[section]:
                    word_token = doc.vocab[unicode(word)]
                    section_value[section] = max(section_value[section], float(word_token.similarity(token)))

        # determining the next section based on section values and threshold
        most_likely_section = None
        for section in list_of_sections:
            #print '>>', section, section_value[section]
            if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                most_likely_section = section
            
        # updating the section
        if (previous_section != most_likely_section and most_likely_section is not None):
            previous_section = most_likely_section
                

        # writing data to the pandas series
        try:
            docx = nlp(unicode(line))
        except:
            continue  # to handle the odd case of characters like 'x02', etc.
        mod_line = ''
        for token in docx:
            if (not token.is_stop):
                mod_line += token.lemma_ + ' '
        
        curr_data_series[previous_section] += mod_line
            
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print curr_data_series
    main_file_handler.close()
    
data_frame = pd.DataFrame(dict_of_data_series)
data_frame.to_csv('prc_data.csv', sep='\t')
data_frame.head()

Unnamed: 0,cv1.txt,cv2.txt,cv3.txt,cv52.txt,cv53.txt,cv54.txt,cv55.txt
skill,software developer - dynamix infotech \n kolka...,work professional industry professional work d...,team member - appin software group \n shakarpu...,month exp goyal & parul company chartr...,indian institute technology - kharagpur \n kha...,indian institute technology delhi . \n homepag...,"indian institute technology , kanpur , \n e - ..."
extra,mahboob alam \n personal qualities \n addition...,"puneet singh \n new delhi , delhi \n personal ...",ranjeet kumar \n ❖ impose validations requirem...,post- sothauli gopalpujr \n dis - azamgarh...,ashish kumar yadav ( 07cs3028 ) \n mobile : + ...,area interest : \n fourth international confer...,"abhishek rajput \n room . a-215 hall x , \n ..."
exp,career objective \n work experience \n work so...,professional qualification \n ➢ mca [ … ] bist...,❖ experience development phase web application...,resume \n rahul yadav \n vill- agehata \n obje...,dual degree computer science & engineering ; \...,resume \n soham das \n currently pursue m.tech...,computer science & engineering \n m.tech . c...
edu,education \n sidho kanhu murmu university dumk...,➢ bca [ … ] integral university lucknow aggreg...,➢ active member sports meet college level . \n...,education \n pass b.a purvanchal univerciti ...,education \n lucknow public school \n lucknow ...,academic detail : \n year \n degree / exam \n ...,education \n year \n degree \n institute \n pe...
