In [1]:
import os
import spacy
import json

from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

In [2]:
# Load the 'en_core_web_trf' spaCy pipeline into the module-level `nlp`.
# NOTE(review): no later cell visible here reads this `nlp`
# (convert_dataset_to_doc_bin builds its own blank pipeline) — confirm it is
# still needed before paying the load cost.
nlp = spacy.load('en_core_web_trf')

In [2]:
def format_dataset(dataset: list):
    """Flatten raw annotation records into simple text/entity examples.

    Each record is read as
    ``{'classes': [...], 'annotations': [[text, {'entities': [[start, end, label], ...]}]]}``;
    only the first item of ``annotations`` is used.

    Args:
        dataset: List of annotation records (as parsed from JSON).

    Returns:
        A ``(formatted_dataset, classes)`` tuple. ``formatted_dataset`` is a
        list of ``{'text': text, 'entities': [(start, end, label), ...]}``
        dicts and ``classes`` is the ``'classes'`` value of the first record.
        For an empty ``dataset`` both are empty lists.

    Raises:
        KeyError, IndexError: If a record does not follow the layout above.
    """
    # With no records there is no first record to read the classes from;
    # return empty results instead of failing with an IndexError.
    if not dataset:
        return [], []

    classes = dataset[0]['classes']

    formatted_dataset = []
    for record in dataset:
        annotation = record['annotations'][0]
        formatted_dataset.append({
            'text': annotation[0],
            # Normalise every entity to a (start, end, label) tuple.
            'entities': [
                (entity[0], entity[1], entity[2])
                for entity in annotation[1]['entities']
            ],
        })

    return formatted_dataset, classes
        

def load_dataset(directory: str):
    """Load every annotation file found under ``directory`` and format it.

    The directory tree is walked recursively and each file is parsed as
    UTF-8 encoded JSON. The parsed records are handed to ``format_dataset``.

    Args:
        directory: Root directory containing the annotation files.

    Returns:
        Whatever ``format_dataset`` returns for the loaded records, i.e. a
        ``(formatted_dataset, classes)`` tuple.
    """
    dataset = []

    for path, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary order; sort so the record order
        # (and therefore which file supplies the class list) is reproducible.
        dirs.sort()

        for file_name in sorted(files):
            # Join with the directory currently being walked (`path`), not the
            # root, otherwise files inside sub-directories cannot be opened.
            with open(os.path.join(path, file_name), 'r', encoding='utf8') as f:
                dataset.append(json.load(f))

    return format_dataset(dataset)
        

In [3]:
# Directories holding the JSON annotation files for each split.
TRAIN_PATH = '../annotations/train/'
VAL_PATH = '../annotations/val/'

# The class list is taken from the training split; the one returned for the
# validation split is discarded.
TRAIN_DATASET, classes = load_dataset(TRAIN_PATH)
VAL_DATASET, _ = load_dataset(VAL_PATH)

NameError: name 'os' is not defined

In [5]:
# Display the class labels returned for the training split.
classes

['EXPERIENCE', 'GPA', 'LANGUAGE', 'SOFT SKILL', 'DIPLOMA', 'JOB']

In [6]:
# Preview only the first example; displaying the whole list floods the
# notebook output with full CV texts.
TRAIN_DATASET[:1]

[{'text': "INTERN Highlights Microsoft Office and Stata programs Experience Intern June 2014 to August 2014 Company Name - City Assisted lawyers in their cases paperwork and legal documents. Transcribed court notes and minutes,. managed schedules. Attended to local courts with lawyers to evaluate legal proceedings. Hoet, Pelaez, Castillo y Linares Law Firm. Intern May 2013 to August 2013 City Organized documentation for legal filings and procedures for lawyers and their cases. Collaborated with. several of the associates by going to local courts and following their assigned cases. May 2015 to August 2015 Company Name - City , State Performed deli servings, event organization, inventory control, and new staff training. Accomplishments Active member of the Latin American Association at Boston University (LatAm) 2015-Present Basic Training of the Israeli Army Sept 2013-Dec 2013 Leadership Camp in Israel Aug 2012 Models of the United Nations 2013 Community Service Experiences 2012-2013. Ed

In [7]:
def convert_dataset_to_doc_bin(dataset: list, path: str):
    """Serialize formatted examples into a spaCy ``DocBin`` file.

    Args:
        dataset: Iterable of ``{'text': ..., 'entities': [(start, end, label), ...]}``
            dicts, where ``start``/``end`` are character offsets into the text.
        path: Destination file for the serialized ``DocBin``. Missing parent
            directories are created.

    Entities for which no token-aligned span can be built are reported with
    ``'Empty entity'`` and skipped.
    """
    # Blank English pipeline: only used to tokenize the text via make_doc.
    blank_nlp = spacy.blank('en')
    doc_bin = DocBin()

    for data in tqdm(dataset):  # tqdm -> progress bar
        doc = blank_nlp.make_doc(data['text'])
        spans = []

        for start, end, label in data['entities']:
            # 'contract' shrinks offsets that fall inside a token to the
            # tokens fully covered; None means nothing is left to annotate.
            span = doc.char_span(start, end, label=label, alignment_mode='contract')

            if span is None:
                print('Empty entity')
                continue

            spans.append(span)

        # doc.ents cannot hold overlapping spans: filter_spans drops duplicates
        # and overlaps, preferring the longest span.
        doc.ents = filter_spans(spans)

        doc_bin.add(doc)

    # DocBin.to_disk does not create directories, so make sure the target
    # directory exists (skip when `path` has no directory component).
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    doc_bin.to_disk(path)
        

In [8]:
# Output files for the serialized DocBins; these are the same paths passed as
# --paths.train / --paths.dev in the (commented-out) `spacy train` command.
TRAIN_DOCBIN_PATH = './docbin/train.spacy'
VAL_DOCBIN_PATH = './docbin/val.spacy'

# Convert and write both splits to disk.
convert_dataset_to_doc_bin(TRAIN_DATASET, TRAIN_DOCBIN_PATH)
convert_dataset_to_doc_bin(VAL_DATASET, VAL_DOCBIN_PATH)

100%|██████████| 51/51 [00:00<00:00, 131.60it/s]
100%|██████████| 16/16 [00:00<00:00, 85.90it/s]


In [9]:
# Install spacy-transformers into the kernel's environment.
# NOTE(review): presumably required by the transformer pipeline / training
# config used in this notebook — confirm. The version is not pinned, and this
# cell runs after the pipeline is first loaded; consider moving it to the top.
%pip install spacy-transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
# !python -m spacy init fill-config ./config/base_config.cfg ./config/config.cfg

In [11]:
# !python -m spacy train ./config/config.cfg --output ./model --paths.train ./docbin/train.spacy --paths.dev ./docbin/val.spacy

In [12]:
def cv_parse(text, model_path='./model/model-best'):
    """Run a trained spaCy pipeline on ``text`` and print the entities found.

    Args:
        text: Raw CV text to analyse.
        model_path: Directory of the trained pipeline to load. Defaults to
            ``'./model/model-best'``.

    Returns:
        None. One line is printed per entity: the upper-cased label padded to
        30 characters, followed by ``- `` and the entity text.
    """
    # NOTE: the pipeline is loaded from disk on every call.
    nlp_trained = spacy.load(model_path)

    for ent in nlp_trained(text).ents:
        print(f'{ent.label_.upper():30}- {ent.text}')

In [14]:
# Testing
# NOTE(review): the rendered output of this cell contains personal data
# (name, phone number, e-mail address) — clear it before sharing the notebook.

from PyPDF2 import PdfReader

# Read PDF

PDF_PATH = '../../PDF/Anthonio Obert - Software Developer - CV (1).pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

# Join the pages with a newline: concatenating them directly fuses the last
# word of one page with the first word of the next. `or ''` guards against a
# page for which no text is returned.
extracted_text = '\n'.join(
    (page.extract_text() or '').strip() for page in reader.pages
)

cv_parse(extracted_text)



SOFT SKILL                    - Anthonio
SOFT SKILL                    - Obert
SOFT SKILL                    - Software
SOFT SKILL                    - Developer+62
SOFT SKILL                    - 81273724892
SOFT SKILL                    - laisobert2@gmail.com
SOFT SKILL                    - Jakarta
SOFT SKILL                    - ,
SOFT SKILL                    - Indonesia
SOFT SKILL                    - SUMMARY
SOFT SKILL                    - A
SOFT SKILL                    - passionate
SOFT SKILL                    - college
SOFT SKILL                    - student
SOFT SKILL                    - with
SOFT SKILL                    - a
SOFT SKILL                    - keen
SOFT SKILL                    - interest
SOFT SKILL                    - in
SOFT SKILL                    - software
SOFT SKILL                    - development
SOFT SKILL                    - that
SOFT SKILL                    - is
SOFT SKILL                    - able
SOFT SKILL                    - to
SOFT SKILL  