In [1]:
import re
import json
import spacy
import pandas as pd
from spacy import displacy
from spacy.tokens import DocBin
from spacy.util import filter_spans

# Blank English pipeline; further down it is used only through nlp.make_doc()
# to turn raw text into Doc objects.
nlp = spacy.blank("en")
# Container that collects the annotated Doc objects before they are written to disk.
doc_bin = DocBin()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from annotated entity spans.

    Args:
        data: Iterable of records, each a mapping with a ``'text'`` string and
            a ``'label'`` list of ``[start, end, label]`` character-offset
            triples.

    Returns:
        A list with one ``[text, {'label': spans}]`` pair per input record.
        Every returned span is narrowed so that it neither starts nor ends on
        a whitespace character. Spans that are empty or consist only of
        whitespace are dropped, because narrowing them leaves nothing to
        annotate.
    """
    whitespace = re.compile(r'\s')
    cleaned_data = []
    for record in data:
        text = record['text']
        trimmed_spans = []
        for start, end, label in record['label']:
            valid_start = start
            # Clamp to the text length so an over-long annotation cannot index
            # past the end of the string.
            valid_end = min(end, len(text))
            # Both scans stop at the opposite boundary, so a whitespace-only
            # span collapses to an empty one instead of ending up inverted.
            while valid_start < valid_end and whitespace.match(text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and whitespace.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                trimmed_spans.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'label': trimmed_spans}])

    return cleaned_data

In [6]:
path = '../training/data'
# Each non-blank line is parsed as one JSON record. The context manager closes
# the file handle deterministically, and the explicit encoding keeps the result
# independent of the platform default.
with open(path, 'r', encoding='utf-8') as data_file:
    data = [json.loads(line) for line in data_file if line.strip()]
clean_data = trim_entity_spans(data)
# Preview a single record instead of echoing the whole corpus into the output.
clean_data[:1]

[['\nLast Updated : August 16 , 2021\nSource : Bdjobs.com Online CV Bank\nEMAM HASAN\nAddress : House # 35 , Road # 11 , Sector # 13 , Uttara East , Dhaka\nPrimary Mobile :01714238096\nPrimary Email : hasibhasan03@gmail.com\nCareer Objective :\nenjoy challenged engaging projects require work outside comfort\nknowledge set , continuing learn advance new technologies development techniques\nimportant .\nCareer Summary :\nIve seven years working experience core java developer , J2EE , Spring Boot , JPA Android\napplications . Im working Apex Holdings  Apex Group  senior web java developer last\nfive years .\nSpecial Qualification :\nIve working last 9 years core JAVA especially J2EE , Spring Boot , JPA Android .\ncertified Microsoft SQL Server database also work different databases like PostgreSQL ,\nMySQL , SQLite Oracle .\nEmployment History :\nTotal Year Experience : 11.6 yrs\n1 . Core Java Developer  9.3 yrs\nJanuary 11 , 2013 - Continuing\nApex Holdings  Apex Group\nArea Expertise :\

In [7]:
# Flatten each [text, {'label': spans}] pair from clean_data into a
# {'text': ..., 'label': ...} record. Building the records in a comprehension
# avoids binding a variable named `dict`, which would shadow the builtin for
# the rest of the kernel session.
training_data = [
    {'text': text, 'label': labels['label']}
    for text, labels in clean_data
]

# Preview one record only; the full list is large and contains personal data.
training_data[:1]

[{'text': '\nLast Updated : August 16 , 2021\nSource : Bdjobs.com Online CV Bank\nEMAM HASAN\nAddress : House # 35 , Road # 11 , Sector # 13 , Uttara East , Dhaka\nPrimary Mobile :01714238096\nPrimary Email : hasibhasan03@gmail.com\nCareer Objective :\nenjoy challenged engaging projects require work outside comfort\nknowledge set , continuing learn advance new technologies development techniques\nimportant .\nCareer Summary :\nIve seven years working experience core java developer , J2EE , Spring Boot , JPA Android\napplications . Im working Apex Holdings  Apex Group  senior web java developer last\nfive years .\nSpecial Qualification :\nIve working last 9 years core JAVA especially J2EE , Spring Boot , JPA Android .\ncertified Microsoft SQL Server database also work different databases like PostgreSQL ,\nMySQL , SQLite Oracle .\nEmployment History :\nTotal Year Experience : 11.6 yrs\n1 . Core Java Developer  9.3 yrs\nJanuary 11 , 2013 - Continuing\nApex Holdings  Apex Group\nArea Expe

In [8]:
# Start from an empty DocBin so that re-running this cell does not append the
# same documents a second time and silently duplicate the training set.
doc_bin = DocBin()

for record_index, record in enumerate(training_data):
    doc = nlp.make_doc(record['text'])

    entities = []
    for start, end, label in record['label']:
        # Character offsets are mapped onto token spans; char_span returns None
        # when no span can be built, in which case the annotation is reported
        # and skipped.
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        if span is None:
            print(f'Skipped_{record_index}')
        else:
            entities.append(span)

    # filter_spans resolves duplicate/overlapping spans before they are
    # assigned to doc.ents.
    doc.ents = filter_spans(entities)
    doc_bin.add(doc)

doc_bin.to_disk('train.spacy')

In [9]:
# Expand base_config.cfg into a fully populated training config saved as config.cfg.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [10]:
# Train with config.cfg on GPU 0, writing output into the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores in the log are measured on the training data.
# Confirm whether a held-out dev set should be used instead.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        6048.31   1966.52    0.04    0.02    0.25    0.00
  2     200      655200.62  72553.08   18.04   61.79   10.56    0.18
  4     400       25840.14  19285.07   67.70   65.24   70.35    0.68
  6     600        7326.91  15792.81   73.59   66.33   82.63    0.74
  8     800        8951.02  13811.67   71.80   64.06   81.66    0.72
 10    1000        8

In [301]:
# Load the trained pipeline from the 'model-best' directory, resolved relative
# to the working directory (presumably written by the `spacy train --output ./`
# run above; confirm).
nlp_ner = spacy.load('model-best')

In [317]:
# NOTE(review): absolute, machine-specific path. Replace it with a path relative
# to a configurable data directory before sharing this notebook.
path = "/home/mahmudul/122.k_n/project/cv_parsing/data/Cv_Clean-Text/Md._Abu_Taleb_Bdjobs.txt"

# Decode explicitly so the result does not depend on the platform's default
# locale encoding (assumes the cleaned CV text is UTF-8; confirm).
with open(path, 'r', encoding='utf-8') as file:
    resume_text = file.read()

In [318]:
# Largest allowed distance (Span.start minus the previous kept Span.end) for an
# additional email/contact entity to be kept after the first one.
MAX_ENTITY_GAP = 20

name, email, contact, technology, experience, url, technology_experience = (
    [], [], [], [], [], [], []
)

previous_email_end_index = 0
previous_contact_end_index = 0

doc = nlp_ner(resume_text)

for ent in doc.ents:
    if ent.label_ == 'name':
        name.append(ent.text)

    elif ent.label_ == 'email':
        # The first email is always kept; later ones only when they start close
        # behind the previously kept email.
        if not email or (ent.start - previous_email_end_index) < MAX_ENTITY_GAP:
            email.append(ent.text)
            previous_email_end_index = ent.end

    elif ent.label_ == 'contact':
        # Same proximity rule as for emails.
        if not contact or (ent.start - previous_contact_end_index) < MAX_ENTITY_GAP:
            contact.append(ent.text)
            previous_contact_end_index = ent.end

    elif ent.label_ == 'Technology':
        technology.append(ent.text)

    elif ent.label_ == 'experience':
        experience.append(ent.text)

    elif ent.label_ == 'url':
        url.append(ent.text)

# Build the one-row summary frame in a single step. 'Name' falls back to None
# when the model found no name entity, instead of raising IndexError on name[0].
# technology_experience is still empty at this point.
df = pd.DataFrame(
    [{
        'Name': name[0] if name else None,
        'Email': email,
        'Contact': contact,
        'Technology': technology,
        'Experience': experience,
        'Url': url,
        'Technological Experience': technology_experience,
    }],
    columns=['Name', 'Email', 'Contact', 'Technology', 'Experience', 'Url', 'Technological Experience'],
)


In [319]:
# Show the one-row summary frame assembled in the previous cell.
df

Unnamed: 0,Name,Email,Contact,Technology,Experience,Url,Technological Experience
0,MD . ABU TALEB,[abutaleb6@gmail.com],"[01711480713, 01521209406, 01312342275]","[J2EE, SQL, Java, Spring Framework, Servlets, ...","[7+ years experience TL Java, 7+ Yrs Exp . Jav...",[],[]


In [320]:
# Split every tagged 'experience' entity on commas so that each fragment can be
# matched against the technologies on its own.
latest = []
for ent in doc.ents:
    if ent.label_ == 'experience':
        latest.extend(fragment.strip() for fragment in ent.text.split(','))

# Technological experience = experience fragments that mention at least one
# extracted technology (case-sensitive substring match).
tech = set(df.at[0, 'Technology'])

data_list = [
    fragment
    for fragment in latest
    if any(item in fragment for item in tech)
]

# Write the list into the existing cell with a single indexer. The chained form
# df[col][0] = ... assigns through an intermediate Series and is not guaranteed
# to update df (it is a no-op under pandas Copy-on-Write).
df.at[0, 'Technological Experience'] = data_list


In [321]:
# Inspect the matched experience fragments stored in row 0.
df.at[0, 'Technological Experience']

['7+ years experience TL Java',
 '7+ Yrs Exp . Java J2EE\nOCJP- SE 6 Programmer  Java\nOOP',
 'Senior Software Engineer  Java  Spring   0.8 yr\nSeptember 1',
 'Senior Executive-IT  Java Backend  Android Developer   2.8 yrs\nNovember 10',
 'Spring Framework  0.9 yr',
 'Spring Mvc Framework  0.2 yr']

In [322]:
# Show the summary frame again, now with 'Technological Experience' filled in.
df

Unnamed: 0,Name,Email,Contact,Technology,Experience,Url,Technological Experience
0,MD . ABU TALEB,[abutaleb6@gmail.com],"[01711480713, 01521209406, 01312342275]","[J2EE, SQL, Java, Spring Framework, Servlets, ...","[7+ years experience TL Java, 7+ Yrs Exp . Jav...",[],"[7+ years experience TL Java, 7+ Yrs Exp . Jav..."


In [323]:
text = '''
Job Title: Senior Software Engineer / Software Engineer (Java)

Source: Bdjobs.com Online CV Bank
Last Updated: May 10, 2022

MD. ABU TALEB
Address: Village: Ranjitpur, Kalatia, Keraniganj, Dhaka 1313
Primary Mobile No :01711480713
Secondary Mobile No :01521209406
Emergency Contact No :01312342275
Primary Email :abutaleb6@gmail.com
Secondary Email: abu@gmail.com

Career Objective:

A skilled & reliable Sr. Software Engineer/Team Leader/Project Manager/Asst. Manager-IT seeking a
position in an organization where 7+ years of experience in TL with Java,J2EE,Spring
Framework,OOP,JSP,SQL,Android software development will be reflects

Career Summary:

*7+ Yrs Exp. in Java and J2EE
*OCJP- SE 6 Programmer(Java)
*OOP
*BSC in CSE
*Member of BCS (M02537)
*Developed (BPDB Unified Prepayment Metering System).
*GPLMOTION@BIZMOTION (Order Management)
*Extensively trained in ESAD-J2EE by IDB-BISEW
*Hands on experience in Core Java, Spring Framework, Servlets, JSP, JDBC, JPA, Hibernate, JSF,
Struts2, Android and Arduino.
*One Man Army
*Result oriented, self driven, highly motivated, smart and hungry to learn new technologies

Special Qualification:

I am an Oracle Certified Professional, Java SE 6 Programmer. I have more than 7+ years experience as a
Java & Android Developer for nazadaqTechnologies, IBCS-Primax, ASL, GPL-BD & Brac IT. I love my
team & solving problem using Java.

Employment History:

Total Year of Experience: 7.8 yrs

1. Senior Software Engineer (Java & Spring) (0.8 yr)

( September 1, 2021 - Continuing)

Brac IT Services Ltd.
Area of Expertise:
Business Development (0.1 yr), Software Development (0.1 yr), Team Leader (Software) (0.1 yr)

Duties/Responsibilities:
BRAC Bank Limited (BBL) Project. Team Lead, Requirement Collection & Analysis, Software
Development and Business Development. (Spring Boot, Spring Security, JPA, Thymeleaf, Javascript,
jQuery, Bootstrap, Ajax, Hibernate, Oracle SQL 19g, Linux, File Upload), Angular also. Project Name:
BBL Data Mart(Responsibilities: Alpha to Omega), Another Project: Obichol Web (only frontend using
Angular) with another team member Md. Khaled Mosharof )

2. Senior Executive-IT(Java Backend & Android Developer) (2.8 yrs)

( November 10, 2018 - September 7, 2021)

General Pharmaceuticals Ltd.
Area of Expertise:
Software Development (0.9 yr), Spring Framework (0.9 yr), Team Leader (Software) (0.9 yr)

Duties/Responsibilities:
Software Development (Backend Service and Android Apps Using Java, Spring Framework, Spring
Security, BIRT, jasper, PostgreSQL, Hibernate, Android Studio, InteliJ Idea, STS, Eclipse),
Requirement Collection, Team Lead, GPLMOTION@BIZMOTION, Order Management System, git

3. Senior Software Developer (1.3 yrs)
( August 24, 2017 - November 9, 2018)

nazdaqTechnologies Inc. (Lexicon, Synergy Inc.)
Area of Expertise:
Business Development (0.4 yr), Software Development (0.4 yr), Team Lead (0.4 yr)

Duties/Responsibilities:
New Software Development(Leave Management System, Android apps for LMS, Automatic Database
Backup System From Linux and windows Server to Local Drive and Dropbox, Travel Requisition
Management System etc.), Requirement Collection, Report Design, Database Management, DCIMCH
Overtime Management, DCIMCH Leave Management System, Synergy Travel Request Management
System, Synergy Employee Join Management System, PAF Management System Etc. Requirement
Collection, Make SRS,Team Lead

4. Senior Software Engineer (0.4 yr)
( March 1, 2017 - August 23, 2017)

Automation Services Ltd. (ASL)
Area of Expertise:
Requirement Collection (0.1 yr), Software Development (0.1 yr), Team Leader (Software) (0.1 yr)

Duties/Responsibilities:
Development & SUPPORT (BPDB UNIFIED PREPAID METERING SYSTEM PROJECT) with Java,
Spring Framework, Sturts2 Framework, Oracle and ibatis & Develop ERP with Grails and Oracle,
Requirement Collection, Make SRS,Team Lead, git & svn

5. PROGRAMMER (1.4 yrs)

( September 1, 2015 - February 28, 2017)

IBCS-PRIMAX SOFTWARE(BD) LTD.
Area of Expertise:
Requirement Collection (0.4 yr), Software Development (0.4 yr), Team Leader (Software) (0.4 yr)

Duties/Responsibilities:
SOFTWARE DEVELOPMENT(DESCO INVENTORY MANAGEMENT SYSTEM using Java, Spring MVC
framework, Spring Security, Hibernate, Oracle 12c Database, JSP, JSTL, jQuery, JavaScript, Ajax,
Bootstrap, CSS) , Development & SUPPORT (BPDB UNIFIED PREPAID METERING SYSTEM
PROJECT using Java, Spring Framework, Struts-2 Framework, ibatis, Extjs) , Requirement Collection,
Make SRS,Team Lead, svn

6. Junior Software Developper (0.7 yr)
( December 21, 2014 - August 31, 2015)

nazdaq Technologies Ltd (naztech Inc Ltd.)
Area of Expertise:
Banking Software (0.2 yr), Software Development (0.2 yr), Spring Mvc Framework (0.2 yr)

Duties/Responsibilities:
For nSMARTLite, nOFAC, nSMS-Dashboard-server, nSMS-Dashboard-client, nSMS, Develop software
according to business requirement using (Java, Spring MVC, SQL Server, Hibernate, JDBC, Jasper,
ExtJs, jQuery, JavaScript, Ajax), Day to day status reporting, Trouble shooting for existing software,
git

Bachelor of Social Science
(BSS)

Diploma in Engineering

Academic Qualification:

Exam Title

Concentration/Major

Institute

Result

Pas.Year

Duration

Achievement

Master of Social Science
(MSS)

SOCIAL WORK

National University

Bachelor in Engineering
(BEngg)

Computer Science &
Engineering

IBAIS University

First Class, Marks
:60.2%

CGPA:3.23
out of 4

Social Work

National University

Second Class

2010

2016

2009

1 YEAR

FIRST CLASS

4 Years

4 Years

BSC in CSE (Software
Engineering)

-

Java Enterprise
Edition - J2EE

IDB-BISEW IT
SCHOLARSHIP,
[Foreign Institute]

First Division,
Marks :93%

2014

1.5 Years

Oracle Certified
Professional, Java SE 6
Programmer - Oracle
University

HSC(BM)

Computer Operation

BTEB

SSC

Science

Dhaka Board

CGPA:4.12
out of 5

CGPA:2.88
out of 5

2005

2003

2 Years

2 Years

-

-

Training Summary:

Training Title

Topic

Institute

Country

Location

Computer
Operating

MS. Office,
Internet, Grafics

UCD

Bangladesh

Gazipur

Year

2010

Duration

6 mounth

Professional Qualification:

Certification

Institute

Location

From

To

Member (M02537)

Dakha, Bangladesh

June 3, 2021

September 30,
2021

Oracle Certified Professional,
Java SE 6 Programmer -(OCJP)

Dhaka

March 15,
2015

March 16,
2015

Career and Application Information:

Looking For
Available For
Present Salary
Expected Salary

Preferred Job Category

Preferred District

Preferred Country

: Mid Level Job
: Full Time
: Tk. 100000
: Tk. 150,000

:

Bank/Non-Bank Fin.
Institution,IT/Telecommunication,Medical/Pharma,Other Special
Skilled Jobs

: Chattogram, Dhaka, Gazipur, Jashore, Mymensingh, Rajshahi

:

Brazil,Germany,Hungary,Japan,Philippines,Qatar,Saudi
Arabia,Singapore,South Africa,United Arab Emirates

Preferred Organization Types

: Banks,Telecommunication,Manufacturing (FMCG),Manufacturing

(Light Engineering & Heavy Industry),Software Company,IT Enabled
Service,Multinational Companies,Engineering
Firms,Pharmaceutical/Medicine Companies,Overseas
Companies,Group of Companies,E-commerce

Fields of Specialization

Description

Specialization:

• Android application development
• JSP
• HTML5 & CSS3
• Spring Framework
• Relational database systems: MySQL
PostgreSQL Oracle
• Hibernate
• JavaScript
• PostgreSQL
• Sturts-2
• Java EE/ J2EE

I Know SPRING Framework, Spring MVC & Boot, Spring Boot
Rest Web Service, HIBERNATE, STRUTS2, JSF primefaces
Framework, Grails, Jqeary, Ajax, JSTL, JSP, UML, MongoDB,
Android Well

Extra Curricular Activities:

POEM, Voluntary Work & Social Work, Editor at Kobi o Kabbo.

Language Proficiency:

Language

Bangla

English

Reading

High

High

Writing

High

Medium

Speaking

High

Medium

Personal Details :

Father"s Name
Mother"s Name
Date of Birth
Gender
Marital Status
Nationality
Religion

: Md. Sahab Uddin
: Kad Banu
: August 21, 1987
: Male
: Married
: Bangladeshi
: Islam

Permanent Address

:

Vill:- Bashakair, P.O: Fulbaria, P.S: Kaliakair, Dist: -1703, B.O.F, Gazipur Sadar,
Gazipur 1703

Current Location
Blood Group

: Dhaka
: O+

Reference (s):

Name
Organization

Designation

Address

Reference: 01
'''

# Run the trained NER pipeline over the inline resume text and render the
# predicted entities in the notebook output.
doc = nlp_ner(text)
displacy.render(doc, style="ent", jupyter=True)


In [324]:
# Scratch check of the email proximity rule: print the start/end of every
# 'email' entity and keep the first one, plus any later one that starts fewer
# than 20 positions after the previously kept email ended.
ex = []
previous_entity_end = 0

for ent in doc.ents:
    if ent.label_ != 'email':
        continue
    print(ent.start, ent.end)

    gap = ent.start - previous_entity_end
    if not ex or gap < 20:
        ex.append(ent.text)
        previous_entity_end = ent.end
ex

68 69
73 74


['abutaleb6@gmail.com', 'abu@gmail.com']