In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
import nltk
import os
import pandas as pd
from ast import literal_eval
import random
import tqdm
import re
import numpy as np

In [2]:
# Entities to extract 
# 1. Name of the candidate ( top line )
# 2. City (use external data to tag)
# 3. Phone no (use re)
# 4. E-mail addresses (use re)
# 5. Companies worked (use external data or spacy to extract)
# 6. Colleges/schools ( some re or pos based approach)
# 7. Experience ( worked _ years.. re)
# 8. Skills ( Bag of Skills )

In [3]:
files = os.listdir('./Resumes')

In [4]:
def clean_text(text):
    text = text.strip()
    text = re.sub(r'\t+',' ',text)
    text = re.sub(r'\n+','\n',text)
    return text.lower()

In [5]:
import tika
tika.initVM()
from tika import parser
def read_file(file):
    parsed = parser.from_file('./Resumes/'+file)
    #print(parsed["metadata"])
    #print(parsed["content"])
    return clean_text(parsed['content'])

In [6]:
resume_entities = pd.read_csv('resume_entities1.csv')
resume_entities.head()

Unnamed: 0,file,skills,phone,Organisations,Institutes,email,experience,person name,city,skills_len,orgs_len,skill_matches,org_matches,entities
0,Venkat_BA.docx,"[' microsoft office (excel word', ' powerpoint...",,,,,9+ years,Venkat N,,50,0,"{'agile': [(3890, 3895), (8719, 8724)], 'busin...","{'sprint': [(8909, 8915)], 'capital one': [(96...","[(933, 938, 'skill'), (3890, 3895, 'skill'), (..."
1,Shashank.docx,[],,,,,seven years,2 ...,,0,0,"{'testing': [(3645, 3652), (4663, 4670), (4684...","{'sprint': [(6704, 6710)]}","[(3645, 3652, 'skill'), (4663, 4670, 'skill'),..."
2,Anudeep N_Sr Java Developer.docx,"['\xa0html', ' css', ' xml', ' soap', ' hibern...",,,,,8+ years,Anudeep,,93,0,"{'java': [(5179, 5183)], 'spring': [(13276, 13...","{'IBM': [(2912, 2915), (5319, 5322), (5950, 59...","[(60, 64, 'skill'), (330, 334, 'skill'), (783,..."
3,ram nandyala.docx,"[' aws', ' swagger', ' node-js', ' angular2', ...",,,,,8+ years,RAMA KARTHIK NANDYALA,,157,0,"{'java': [(21403, 21407), (22667, 22671), (233...","{'Fifth Third Bank': [(13514, 13530)], 'IBM': ...","[(615, 619, 'skill'), (675, 679, 'skill'), (50..."
4,Neha Mugghala.docx,"[' java', ' spring', ' drools', ' oracle', ' ...",,,,,8+ years,Sourya,,146,0,"{'java': [(89, 93)], 'spring': [(9683, 9689), ...","{'IBM': [(1164, 1167), (3751, 3754), (4390, 43...","[(65, 69, 'skill'), (89, 93, 'skill'), (497, 5..."


In [7]:
# words followed by 'client:'
def extract_organisation(text):
    lines = text.split('\n')
    orgs = list()
    for line in lines:
        if line.find('client:')>-1:
            if len(line.split(':'))>1:
                ent = line.split(':')[1].split(',')[0].strip()
                if ent != '':
                    org = nltk.tokenize.sent_tokenize(ent)[0]
                    #print(org)
                    orgs.append(org)
    #print(orgs)
    return orgs

In [10]:
organisations = list()
for ix in tqdm.tqdm(resume_entities.index):
    doc = resume_entities.loc[ix]
    text = read_file(doc['file'])
    organisations.append(extract_organisation(text))
resume_entities['Organisations'] = organisations

100%|██████████| 228/228 [00:53<00:00,  4.25it/s]


In [11]:
#words followed by environment:
def extract_skills(text):
    skills = list()
    for line in text.split('\n'):
        line = line.lower()
        if line.find('environment:')>-1:
            skills.extend(line.replace('environment:','').split(','))
    return skills

In [12]:
skills = list()
for ix in tqdm.tqdm(resume_entities.index):
    doc = resume_entities.loc[ix]
    text = read_file(doc['file'])
    skills.append(extract_skills(text))
resume_entities['skills'] = skills

100%|██████████| 228/228 [00:49<00:00,  4.63it/s]


In [14]:
all_skills = list()
#x = [all_skills.extend(literal_eval(_)) for _ in resume_entities['skills']]
x = [all_skills.extend(_) for _ in resume_entities['skills']]

In [15]:
all_skills

[' microsoft office (excel word',
 ' powerpoint',
 ' project)',
 ' sharepoint',
 ' ms-visio 2010',
 ' oracle hcm',
 ' kronos',
 ' hp alm.',
 ' microsoft office (excel',
 ' word',
 ' powerpoint)',
 '  peoplesoft 9.2',
 ' qc alm. ',
 ' microsoft office (excel',
 ' word',
 ' powerpoint)',
 ' scrum wise',
 ' jira',
 ' sharepoint',
 ' jenkins ',
 'kronos',
 '  oracle hcm.',
 ' microsoft office (excel word',
 ' powerpoint',
 ' project)',
 ' innotas',
 ' rally',
 ' sharepoint',
 ' ms-visio 2010.',
 ' microsoft office (excel word',
 ' powerpoint)',
 ' ms visio 2010',
 ' sharepoint 2010.',
 ' sap ecc 6.0 ',
 ' sd ',
 ' eh&s',
 ' fico ',
 ' alm',
 ' scrum wise',
 ' microsoft office (excel word',
 ' powerpoint',
 ' project)',
 ' rational requisite pro',
 ' ms visio.',
 ' sap 4.7 and ecc 6.0',
 ' isu',
 ' sd ',
 ' fico ',
 ' alm',
 ' scrum wise',
 '\xa0html',
 ' css',
 ' xml',
 ' soap',
 ' hibernate',
 ' java',
 'j2ee',
 'java script',
 'mysql db',
 ' spring boot',
 ' pl/sql',
 ' log4j',
 ' jquery

In [16]:
len(all_skills)

16643

In [17]:
all_skills = set(all_skills)

In [18]:
len(all_skills)

3559

In [19]:
def get_matches():
    all_file_matches = list()
    files = list(resume_entities['file'])
    for file in tqdm.tqdm(files):
        text = read_file(file)
        all_matches = dict()
        for skill in all_skills:
            skill = skill.strip()
            if skill == '':
                continue
            matches = [_.span() for _ in re.finditer(r'\b(%s)\b' % re.escape(skill), text)]
            if len(matches)>0:
                all_matches[skill] = matches
        all_file_matches.append(all_matches)
    return all_file_matches

In [20]:
all_skill_matches = get_matches()

100%|██████████| 228/228 [05:08<00:00,  1.35s/it]


In [21]:
resume_entities['skill_matches'] = all_skill_matches

In [22]:
def get_entity_indexes(doc):
    text = read_file(doc['file']).lower()
    matches = list()
    print(doc['file'])
    for col in ['skill_matches','org_matches','experience','person name','city']:
        entities = doc[col]
        if col == 'skill_matches':
            col = 'skill'
            #entities = literal_eval(entities)
        elif col=='org_matches':
            col = 'org'
            #entities = literal_eval(entities)
        elif col == 'person name':
            col = 'person'
        if entities == 'None':
            continue
        if type(entities) in [dict,list]:
            for ent in entities:
                matches.extend([_.span()+(col,) for _ in re.finditer(r'\b(%s)\b' % re.escape(ent.lower().strip()), text)])
        elif type(entities) is str:
            temp = [_.span()+(col,) for _ in re.finditer(r'\b(%s)\b' % re.escape(entities.lower().strip()), text)]
            if len(temp)>0:
                matches.append(temp[0])
        else:
            continue
    return matches

In [23]:
all_entities = list()
for ix in resume_entities.index:
    doc = resume_entities.loc[ix]
    all_entities.append(get_entity_indexes(doc))

Venkat_BA.docx
Shashank.docx
Anudeep N_Sr Java Developer.docx
ram nandyala.docx
Neha Mugghala.docx
chenna kesava.docx
Utthan Silawal12.docx
Yohan BSA.docx
Madhu_BA_AW.DOCX
KIRAN KUMAR.docx
avinash G.docx
Harika_java.docx
Mounika_P.docx
HARI_Sr.Java Developer.docx
RaviRaju_Resume.docx
B Shaker-Sr BSA-Scrum Master .docx
Vamshi Teja_Business Analyst.docx
VIJETHA G.docx
SaravanaKumar.docx
Abiral_Pandey_Fullstack_Java.docx
Manohar Reddy.docx
Syed_Zia_Ashraf.docx
srinivas b.docx
Gururaja Murthy PMCPCSM.docx
Madhuri Pawar.docx
Sr. Business Analyst.docx
Naveen Kumar Bandela.docx
mohid_rj.docx
Nikki_Nimmagadd.docx
Chandler_BA.docx
Ramteja Motupalli.docx
Sri Gati.docx
Rajesh_k.docx
Ashok Jayakumar - PM.docx
Sahithi K.docx
Deepika Chintalapati.docx
RaviBurra_Certified PM_DevOps.docx
Business Analyst_GHyma.docx
Vivek.BSA.docx
Gokul Selvam S PM.docx
Naveen Arora.docx
jagadeesh k.docx
BA Kiran.docx
Gopi.docx
Raja Santhosam_PM Scrum Master.DOCX
Resume - PM Agile-Scrum.docx
Satish Uduta.docx
Othman - 

In [24]:
resume_entities['entities'] = all_entities

In [25]:
resume_entities.columns

Index(['file', 'skills', 'phone', 'Organisations', 'Institutes', 'email',
       'experience', 'person name', 'city', 'skills_len', 'orgs_len',
       'skill_matches', 'org_matches', 'entities'],
      dtype='object')

In [26]:
resume_entities.to_csv('resume_entities1.csv',index=False)

In [6]:
resume_entities['entities'] = [literal_eval(_) for _ in resume_entities['entities']]

In [27]:
resume_entities[(resume_entities['city']!='None')].shape

(32, 14)

In [28]:
train_data = resume_entities[(resume_entities['city']!='None')]

In [29]:
train_data.shape

(32, 14)

In [30]:
def write_train_data(file,labels,fp):
    newline =' '
    text = read_file(file)
    #data_tagged = 32
    count=-1
    count+=1
    #if count == data_tagged:
    #    break
    ann_last = 0
    print(labels)
    dtype = [('start', int), ('end', int), ('label', 'U15')]
    values = list()
    for label in labels:
        values.append((label[0],label[1],label[2]))
    temp_list = np.array(values,dtype=dtype)
    temp_list = np.sort(temp_list,order=['start'])
    for ann in temp_list:
        print(ann_last)
        outside_words = text[ann_last:ann[0]].split(' ')
        for word in outside_words:
            if word != '':
                fp.write(word + ' ' + 'O'+'\n')
        inside_words = text[ann[0]:ann[1]].split(' ')
        label = ann[2]
        ix = 0
        for word in inside_words:
            if word != ' ':
                if ix == 0:
                    fp.write(word+ ' B-'+label+'\n')
                else:
                    fp.write(word + ' I-'+label+'\n')
                ix+=1
        ann_last = ann[1]
    outside_words = text[ann_last:len(text)].split(' ')
    for word in outside_words:
        if word != '':
            fp.write(word + ' ' + 'O'+'\n')
    fp.write("\n")

In [31]:
with open('train_resume.bie','w') as fp:
    for ix in train_data.index:
        doc = train_data.loc[ix]
        write_train_data(doc['file'],doc['entities'],fp)

[(3204, 3211, 'skill'), (7478, 7485, 'skill'), (3523, 3546, 'skill'), (13737, 13760, 'skill'), (33, 34, 'skill'), (12280, 12281, 'skill'), (12997, 12998, 'skill'), (13878, 13879, 'skill'), (13887, 13888, 'skill'), (4701, 4712, 'skill'), (3195, 3202, 'skill'), (7469, 7476, 'skill'), (1155, 1158, 'skill'), (643, 655, 'skill'), (7180, 7192, 'skill'), (10231, 10243, 'skill'), (15092, 15104, 'skill'), (14664, 14680, 'skill'), (11540, 11554, 'skill'), (7125, 7135, 'skill'), (15037, 15047, 'skill'), (2285, 2293, 'skill'), (2254, 2259, 'skill'), (10602, 10607, 'skill'), (2233, 2246, 'skill'), (13067, 13083, 'skill'), (17298, 17301, 'skill'), (314, 317, 'skill'), (1264, 1267, 'skill'), (2113, 2116, 'skill'), (6794, 6797, 'skill'), (10466, 10469, 'skill'), (13574, 13577, 'skill'), (17508, 17511, 'skill'), (12280, 12284, 'skill'), (12997, 13001, 'skill'), (13878, 13882, 'skill'), (13887, 13891, 'skill'), (794, 798, 'skill'), (4544, 4548, 'skill'), (4696, 4700, 'skill'), (17061, 17065, 'skill'), (

[(1117, 1124, 'skill'), (1290, 1297, 'skill'), (2952, 2959, 'skill'), (3413, 3420, 'skill'), (5165, 5172, 'skill'), (5314, 5321, 'skill'), (1959, 1960, 'skill'), (2673, 2674, 'skill'), (4317, 4318, 'skill'), (6750, 6751, 'skill'), (9057, 9058, 'skill'), (16117, 16118, 'skill'), (1331, 1337, 'skill'), (2637, 2643, 'skill'), (1214, 1218, 'skill'), (1995, 2002, 'skill'), (1225, 1228, 'skill'), (1208, 1212, 'skill'), (1241, 1245, 'skill'), (1254, 1257, 'skill'), (1907, 1909, 'skill'), (1958, 1959, 'skill'), (2672, 2673, 'skill'), (3063, 3064, 'skill'), (3065, 3066, 'skill'), (3070, 3071, 'skill'), (5084, 5085, 'skill'), (5086, 5087, 'skill'), (6749, 6750, 'skill'), (9056, 9057, 'skill'), (9155, 9156, 'skill'), (9157, 9158, 'skill'), (10163, 10164, 'skill'), (15736, 15737, 'skill'), (15738, 15739, 'skill'), (16116, 16117, 'skill'), (17327, 17328, 'skill'), (17335, 17336, 'skill'), (1278, 1285, 'skill'), (1230, 1234, 'skill'), (9451, 9454, 'skill'), (1860, 1882, 'skill'), (2883, 2901, 'skill

[(2576, 2579, 'skill'), (2627, 2630, 'skill'), (2674, 2677, 'skill'), (9934, 9937, 'skill'), (9946, 9949, 'skill'), (10035, 10038, 'skill'), (10061, 10064, 'skill'), (10637, 10640, 'skill'), (12570, 12573, 'skill'), (12614, 12617, 'skill'), (12641, 12644, 'skill'), (12674, 12677, 'skill'), (13355, 13358, 'skill'), (13448, 13451, 'skill'), (13690, 13693, 'skill'), (15371, 15374, 'skill'), (15482, 15485, 'skill'), (16652, 16655, 'skill'), (10439, 10440, 'skill'), (10448, 10449, 'skill'), (5752, 5758, 'skill'), (5838, 5844, 'skill'), (13706, 13712, 'skill'), (20877, 20883, 'skill'), (22421, 22427, 'skill'), (22763, 22769, 'skill'), (23088, 23094, 'skill'), (24023, 24029, 'skill'), (24365, 24371, 'skill'), (24560, 24566, 'skill'), (22088, 22092, 'skill'), (22987, 22991, 'skill'), (23690, 23694, 'skill'), (24459, 24463, 'skill'), (23166, 23169, 'skill'), (24638, 24641, 'skill'), (5280, 5284, 'skill'), (13092, 13096, 'skill'), (15764, 15768, 'skill'), (5728, 5731, 'skill'), (13248, 13251, 's

[(3301, 3314, 'skill'), (6626, 6639, 'skill'), (12161, 12174, 'skill'), (5737, 5744, 'skill'), (5203, 5206, 'skill'), (9172, 9175, 'skill'), (10739, 10742, 'skill'), (12668, 12671, 'skill'), (13985, 13988, 'skill'), (14898, 14901, 'skill'), (5747, 5748, 'skill'), (3224, 3230, 'skill'), (6506, 6512, 'skill'), (11129, 11135, 'skill'), (17561, 17567, 'skill'), (14925, 14929, 'skill'), (11020, 11027, 'skill'), (11105, 11112, 'skill'), (14256, 14263, 'skill'), (14941, 14948, 'skill'), (6108, 6112, 'skill'), (3454, 3457, 'skill'), (16457, 16460, 'skill'), (16540, 16543, 'skill'), (3160, 3170, 'skill'), (6699, 6709, 'skill'), (7746, 7756, 'skill'), (8044, 8054, 'skill'), (8568, 8578, 'skill'), (8681, 8691, 'skill'), (9043, 9053, 'skill'), (9498, 9508, 'skill'), (10120, 10130, 'skill'), (10274, 10284, 'skill'), (11257, 11267, 'skill'), (10728, 10734, 'skill'), (11526, 11532, 'skill'), (13974, 13980, 'skill'), (14917, 14923, 'skill'), (23079, 23094, 'skill'), (9335, 9339, 'skill'), (3548, 3551,

24160
24228
24296
24360
24369
24427
24460
24498
24516
24530
24545
24553
24561
24580
24596
24616
24703
24738
24746
24746
24765
24805
24836
24907
24920
25175
25181
25187
25192
25202
25214
25224
25230
25237
25242
25248
25254
25262
25269
25274
25278
25286
25292
[(927, 930, 'skill'), (939, 942, 'skill'), (953, 956, 'skill'), (57, 58, 'skill'), (75, 76, 'skill'), (106, 107, 'skill'), (115, 116, 'skill'), (1029, 1030, 'skill'), (1977, 1978, 'skill'), (2079, 2080, 'skill'), (2170, 2171, 'skill'), (2175, 2176, 'skill'), (2274, 2275, 'skill'), (2282, 2283, 'skill'), (2286, 2287, 'skill'), (2289, 2290, 'skill'), (2304, 2305, 'skill'), (2319, 2320, 'skill'), (2342, 2343, 'skill'), (2368, 2369, 'skill'), (2405, 2406, 'skill'), (2429, 2430, 'skill'), (2454, 2455, 'skill'), (2476, 2477, 'skill'), (3014, 3015, 'skill'), (3016, 3017, 'skill'), (3087, 3088, 'skill'), (826, 833, 'skill'), (650, 660, 'skill'), (867, 876, 'skill'), (1458, 1467, 'skill'), (640, 645, 'skill'), (812, 821, 'skill'), (1029, 103

[(25123, 25136, 'skill'), (11689, 11696, 'skill'), (14500, 14517, 'skill'), (2742, 2745, 'skill'), (11018, 11021, 'skill'), (16979, 16982, 'skill'), (21500, 21503, 'skill'), (24035, 24038, 'skill'), (8082, 8083, 'skill'), (14306, 14307, 'skill'), (25768, 25775, 'skill'), (3189, 3195, 'skill'), (6256, 6262, 'skill'), (7901, 7907, 'skill'), (10764, 10770, 'skill'), (12797, 12803, 'skill'), (14203, 14209, 'skill'), (14752, 14758, 'skill'), (17274, 17280, 'skill'), (18877, 18883, 'skill'), (18895, 18901, 'skill'), (18918, 18924, 'skill'), (18972, 18978, 'skill'), (19003, 19009, 'skill'), (19039, 19045, 'skill'), (19271, 19277, 'skill'), (20056, 20062, 'skill'), (22097, 22103, 'skill'), (23986, 23992, 'skill'), (25242, 25248, 'skill'), (444, 448, 'skill'), (655, 659, 'skill'), (1456, 1460, 'skill'), (7372, 7376, 'skill'), (7837, 7841, 'skill'), (9422, 9426, 'skill'), (13936, 13940, 'skill'), (14617, 14621, 'skill'), (14806, 14810, 'skill'), (18438, 18442, 'skill'), (21957, 21961, 'skill'), 

31310
31318
31521
31546
31629
31647
31680
31692
31700
31720
31725
31729
31728
31732
31734
31741
31746
31746
31756
31761
31768
31776
31783
31783
31791
31797
31804
31809
[(10976, 10993, 'skill'), (1866, 1869, 'skill'), (2303, 2306, 'skill'), (3523, 3526, 'skill'), (4100, 4103, 'skill'), (4892, 4895, 'skill'), (5038, 5041, 'skill'), (7638, 7641, 'skill'), (16041, 16044, 'skill'), (20728, 20731, 'skill'), (2031, 2041, 'skill'), (3337, 3347, 'skill'), (5099, 5100, 'skill'), (5229, 5230, 'skill'), (5233, 5234, 'skill'), (5237, 5238, 'skill'), (5281, 5282, 'skill'), (5315, 5316, 'skill'), (17884, 17885, 'skill'), (20558, 20559, 'skill'), (2035, 2041, 'skill'), (3341, 3347, 'skill'), (4218, 4224, 'skill'), (5403, 5409, 'skill'), (6761, 6767, 'skill'), (13586, 13592, 'skill'), (16303, 16309, 'skill'), (16422, 16428, 'skill'), (17875, 17881, 'skill'), (18290, 18296, 'skill'), (20184, 20190, 'skill'), (20848, 20854, 'skill'), (333, 337, 'skill'), (2723, 2727, 'skill'), (4265, 4269, 'skill'), (452

[(1472, 1485, 'skill'), (2914, 2927, 'skill'), (3847, 3860, 'skill'), (16609, 16622, 'skill'), (10137, 10140, 'skill'), (15036, 15039, 'skill'), (3724, 3725, 'skill'), (3728, 3729, 'skill'), (3787, 3788, 'skill'), (3791, 3792, 'skill'), (3867, 3868, 'skill'), (4090, 4091, 'skill'), (4105, 4106, 'skill'), (7383, 7384, 'skill'), (9942, 9943, 'skill'), (10338, 10339, 'skill'), (12925, 12926, 'skill'), (15657, 15658, 'skill'), (16440, 16441, 'skill'), (1487, 1493, 'skill'), (3952, 3958, 'skill'), (11903, 11909, 'skill'), (12884, 12890, 'skill'), (15572, 15578, 'skill'), (16426, 16432, 'skill'), (18773, 18779, 'skill'), (20301, 20307, 'skill'), (227, 231, 'skill'), (1427, 1431, 'skill'), (2020, 2024, 'skill'), (2102, 2106, 'skill'), (3222, 3226, 'skill'), (3297, 3301, 'skill'), (3348, 3352, 'skill'), (6380, 6384, 'skill'), (8817, 8821, 'skill'), (10684, 10688, 'skill'), (11367, 11371, 'skill'), (12694, 12698, 'skill'), (13160, 13164, 'skill'), (18705, 18709, 'skill'), (1391, 1398, 'skill'),

[(5673, 5680, 'skill'), (1155, 1156, 'skill'), (1200, 1201, 'skill'), (4356, 4357, 'skill'), (5792, 5793, 'skill'), (1136, 1140, 'skill'), (5956, 5960, 'skill'), (9490, 9494, 'skill'), (9671, 9675, 'skill'), (1129, 1133, 'skill'), (5951, 5955, 'skill'), (9485, 9489, 'skill'), (9666, 9670, 'skill'), (9903, 9907, 'skill'), (88, 89, 'skill'), (1154, 1155, 'skill'), (1199, 1200, 'skill'), (4357, 4358, 'skill'), (4844, 4845, 'skill'), (5791, 5792, 'skill'), (1158, 1165, 'skill'), (9919, 9929, 'skill'), (1148, 1152, 'skill'), (9934, 9938, 'skill'), (1170, 1179, 'skill'), (1166, 1168, 'skill'), (1203, 1215, 'skill'), (1250, 1266, 'skill'), (9694, 9702, 'skill'), (1268, 1274, 'skill'), (4097, 4103, 'skill'), (88, 92, 'skill'), (8750, 8752, 'skill'), (1108, 1115, 'skill'), (9509, 9517, 'skill'), (9909, 9917, 'skill'), (101, 108, 'skill'), (427, 434, 'skill'), (669, 676, 'skill'), (1685, 1692, 'skill'), (1761, 1768, 'skill'), (2247, 2254, 'skill'), (3457, 3464, 'skill'), (3564, 3571, 'skill'), (

4600
4606
4612
4618
4624
4630
4635
4640
4640
4645
4654
4660
4668
4676
4682
4688
4695
4701
4712
4729
4729
4735
4747
4752
4758
4768
4789
4805
4805
4815
4815
4830
4856
4864
4873
4881
4907
4910
4910
4918
4923
4930
4938
4945
4945
4976
4976
4981
4996
5001
5019
5026
5053
5064
5081
5083
5180
5349
5396
5538
5722
5728
5734
5745
5754
5829
5832
5832
5855
5871
5884
5888
5888
5903
5942
5959
5964
5970
5975
5984
5987
5985
5987
6096
6107
6229
6250
6264
6267
6340
6343
6343
6397
6506
6522
6528
6537
6540
6540
6546
6549
6549
6555
6562
6567
6573
6581
6586
6592
6610
6679
6683
6782
6790
6803
6813
6828
6835
6842
6842
6863
6902
6913
6964
6978
6978
7005
7012
7024
7074
7078
7078
7181
7186
7186
7210
7214
7214
7246
7250
7280
7298
7408
7412
7412
7437
7446
7478
7490
7490
7521
7531
7582
7587
7587
7654
7734
7747
7738
7747
7787
7860
7907
7916
7925
7989
8048
8057
8085
8094
8107
8111
8120
8111
8120
8155
8163
8174
8177
8177
8190
8198
8220
8269
8296
8357
8386
8417
8441
8448
8473
8497
8532
8546
8577
8581
8590
8581
8590
8600


[(3721, 3722, 'skill'), (3516, 3522, 'skill'), (6420, 6426, 'skill'), (12667, 12673, 'skill'), (3431, 3435, 'skill'), (2291, 2294, 'skill'), (2845, 2848, 'skill'), (10159, 10162, 'skill'), (12703, 12706, 'skill'), (14731, 14734, 'skill'), (3473, 3479, 'skill'), (6462, 6468, 'skill'), (11254, 11265, 'skill'), (11254, 11274, 'skill'), (3413, 3417, 'skill'), (9153, 9157, 'skill'), (11282, 11286, 'skill'), (12681, 12685, 'skill'), (3403, 3406, 'skill'), (13985, 13988, 'skill'), (3546, 3553, 'skill'), (20, 21, 'skill'), (32, 33, 'skill'), (3720, 3721, 'skill'), (6429, 6430, 'skill'), (13958, 13959, 'skill'), (14175, 14176, 'skill'), (3443, 3447, 'skill'), (12675, 12679, 'skill'), (13990, 13994, 'skill'), (13952, 13955, 'skill'), (3385, 3390, 'skill'), (3366, 3373, 'skill'), (3807, 3814, 'skill'), (11291, 11298, 'skill'), (6487, 6499, 'skill'), (9491, 9503, 'skill'), (11262, 11274, 'skill'), (14061, 14073, 'skill'), (1468, 1480, 'skill'), (7298, 7310, 'skill'), (9147, 9151, 'skill'), (11276,

[(6250, 6253, 'skill'), (6732, 6735, 'skill'), (7030, 7033, 'skill'), (7067, 7070, 'skill'), (7074, 7077, 'skill'), (7778, 7781, 'skill'), (9755, 9758, 'skill'), (14357, 14360, 'skill'), (20523, 20526, 'skill'), (22840, 22843, 'skill'), (33427, 33430, 'skill'), (36169, 36172, 'skill'), (22143, 22153, 'skill'), (3128, 3129, 'skill'), (4151, 4152, 'skill'), (5007, 5008, 'skill'), (34344, 34345, 'skill'), (4490, 4496, 'skill'), (9488, 9494, 'skill'), (12206, 12212, 'skill'), (13373, 13379, 'skill'), (16105, 16111, 'skill'), (21543, 21549, 'skill'), (21751, 21757, 'skill'), (22147, 22153, 'skill'), (23394, 23400, 'skill'), (25347, 25353, 'skill'), (29617, 29623, 'skill'), (33117, 33123, 'skill'), (33851, 33857, 'skill'), (36239, 36245, 'skill'), (421, 425, 'skill'), (766, 770, 'skill'), (3810, 3814, 'skill'), (5445, 5449, 'skill'), (9966, 9970, 'skill'), (14583, 14587, 'skill'), (15707, 15711, 'skill'), (19368, 19372, 'skill'), (20479, 20483, 'skill'), (21567, 21571, 'skill'), (22490, 2249

[(1649, 1652, 'skill'), (2958, 2961, 'skill'), (1884, 1890, 'skill'), (2347, 2354, 'skill'), (1616, 1628, 'skill'), (2181, 2187, 'skill'), (1809, 1813, 'skill'), (1822, 1825, 'skill'), (1699, 1701, 'skill'), (3588, 3590, 'skill'), (41, 42, 'skill'), (45, 46, 'skill'), (87, 88, 'skill'), (1612, 1613, 'skill'), (1842, 1843, 'skill'), (2872, 2873, 'skill'), (6076, 6077, 'skill'), (6554, 6555, 'skill'), (11472, 11473, 'skill'), (2031, 2038, 'skill'), (5500, 5503, 'skill'), (2008, 2018, 'skill'), (8833, 8844, 'skill'), (2329, 2332, 'skill'), (2383, 2386, 'skill'), (2334, 2340, 'skill'), (3100, 3103, 'skill'), (2334, 2345, 'skill'), (1635, 1647, 'skill'), (4202, 4214, 'skill'), (2388, 2398, 'skill'), (2530, 2535, 'skill'), (1616, 1623, 'skill'), (2962, 2978, 'skill'), (4941, 4957, 'skill'), (2207, 2210, 'skill'), (2084, 2089, 'skill'), (2040, 2049, 'skill'), (1842, 1846, 'skill'), (7249, 7254, 'skill'), (1630, 1633, 'skill'), (4218, 4221, 'skill'), (2455, 2461, 'skill'), (1834, 1838, 'skill'

[(13028, 13034, 'skill'), (14526, 14532, 'skill'), (14715, 14721, 'skill'), (14726, 14732, 'skill'), (14738, 14744, 'skill'), (1788, 1794, 'skill'), (2386, 2389, 'skill'), (5682, 5692, 'skill'), (5928, 5938, 'skill'), (9248, 9258, 'skill'), (9288, 9294, 'skill'), (14709, 14713, 'skill'), (5705, 5711, 'skill'), (35, 36, 'skill'), (2341, 2342, 'skill'), (5692, 5693, 'skill'), (8672, 8673, 'skill'), (8674, 8675, 'skill'), (8690, 8691, 'skill'), (9258, 9259, 'skill'), (12673, 12700, 'skill'), (2861, 2872, 'skill'), (9362, 9365, 'skill'), (12832, 12835, 'skill'), (14669, 14681, 'skill'), (2272, 2279, 'skill'), (12852, 12859, 'skill'), (14697, 14704, 'skill'), (1995, 1998, 'skill'), (2581, 2584, 'skill'), (5718, 5721, 'skill'), (5766, 5769, 'skill'), (9271, 9274, 'skill'), (9312, 9315, 'skill'), (12823, 12826, 'skill'), (12804, 12821, 'skill'), (5766, 5772, 'skill'), (5781, 5793, 'skill'), (9321, 9333, 'skill'), (9271, 9278, 'skill'), (5718, 5734, 'skill'), (12852, 12862, 'skill'), (14697, 1

In [32]:
vocab = set()
tags = set()
def build_vocab():
    with open('train_resume.bie') as fp:
        for line in fp.readlines():
            words = line.split()
            if len(words)==2:
                word = words[0]
                tag = words[1]
                vocab.add(word)
                tags.add(tag)

In [33]:
build_vocab()

In [34]:
len(vocab)

9271

In [35]:
len(tags)

13

In [36]:
with open('vocab.txt','w') as fp:
    for word in vocab:
        fp.writelines(word+'\n')

In [37]:
with open('tags.txt','w') as fp:
    for tag in tags:
        fp.writelines(tag+'\n')