In [1]:
import os
import spacy
import json
import random

from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

In [3]:
import torch
import spacy

# nlp = spacy.load('en_core_web_trf')

# Check if the model is using GPU
if torch.cuda.is_available():
    print("CUDA is available")
    print(f"Using device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA is not available")


CUDA is available
Using device: NVIDIA GeForce RTX 3050 Laptop GPU


In [4]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.4.0+cu118
True


In [4]:
torch.zeros(1).cuda()

tensor([0.], device='cuda:0')

In [94]:
def format_dataset_2(dataset:[]):
    
    formatted_dataset = []
    classes = dataset[0]['classes']
        
    for data in dataset:
        
        formatted_data = {}
        
        formatted_data['text'] = data['annotations'][0][0]
        
        entities = data['annotations'][0][1]['entities']
        
        temp_entities = []
        
        for entity in entities:
                        
            start = entity[0]
            end = entity[1]
            label = entity[2]
        
            temp_entities.append((start, end, label))
            
        formatted_data['entities'] = temp_entities
        
        formatted_dataset.append(formatted_data)
        

    return formatted_dataset, classes
        

        
def format_dataset(dataset:list):
    
    formatted_dataset = []
        
    idx = 0
    for data in dataset:
        
        idx += 1
        
        formatted_data = {}
        
        formatted_data['text'] = data['content']
        
        entities = data['annotation']
                
        temp_entities = []
        
        # print(formatted_data['text'])
        # print(f"{idx}. {entities[0]['label'][0]}")
        
        for entity in entities:
                        
            start = entity['points'][0]['start']
            end = entity['points'][0]['end']
            
            if len(entity['label']) > 0:
                label = entity['label'][0]
            else:
                label = ''
        
            temp_entities.append((start, end, label))
            
        formatted_data['entities'] = temp_entities
        
        formatted_dataset.append(formatted_data)
        

    return formatted_dataset
        

def load_dataset(directory:str):
    
    dataset = []
    
    for path, _, files in os.walk(directory):
    
        for file_name in files:
            
            with open(os.path.join(directory, file_name), 'r', encoding='utf8') as f:
                dataset.append(json.load(f))
                
    return format_dataset_2(dataset)
        

In [116]:
# TRAIN_PATH = '../annotations/train_2/Entity Recognition in Resumes.json'
TRAIN_PATH = '../annotations/train_full/'
VAL_PATH = '../annotations/val/'

# TRAIN_DATASET = load_dataset(TRAIN_PATH)
#VAL_DATASET, _ = load_dataset(VAL_PATH)

In [7]:
# TRAIN_DATASET[167]['annotation'][0]

In [6]:
def train_test_split(data, test_size=0.2, shuffle=True, random_seed=None):
    """
    Split data into training and testing sets.

    Parameters:
    - data: List of data points to be split.
    - test_size: Proportion of the dataset to include in the test split (default is 0.2).
    - shuffle: Whether to shuffle the data before splitting (default is True).
    - random_seed: Seed for the random number generator for reproducibility (default is None).

    Returns:
    - train_data: List of training data points.
    - test_data: List of testing data points.
    """
    
    if random_seed is not None:
        random.seed(random_seed)

    if shuffle:
        random.shuffle(data)

    split_index = int(len(data) * (1 - test_size))
    
    train_data = data[:split_index]
    test_data = data[split_index:]

    return train_data, test_data    
    

In [7]:
import json
import re

# JSON formatting functions
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines=[]
    with open(dataturks_JSON_FilePath, 'r', encoding='utf8') as f:
        lines = f.readlines()

    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotation in data_annotations:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']

                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    entities.append((point_start, point_end + 1 , label))
        training_data.append((text, {"entities" : entities}))
    return training_data

def convert_dataset_to_doc_bin(dataset: list, path:str):
    
    nlp = spacy.blank('en')
    doc_bin = DocBin()
    
    for data in tqdm(dataset): # tqdm -> progress bar
        text = data['text']
        temp_entities = data['entities']
        
        doc = nlp.make_doc(text)
        entities = []
        
        for start, end, label in temp_entities:
            
            span = doc.char_span(start, end, label, alignment_mode='contract')
            
            if span is None:
                print('Empty entity')
                continue
            else:
                entities.append(span)
            
        filtered_entities = filter_spans(entities) # Remove duplicate entity
        doc.ents = filtered_entities
        
        doc_bin.add(doc)
        
    doc_bin.to_disk(path)

def trim_entity_spans(dataset: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        dataset (list): The data to be cleaned in SpaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    whitespace_pattern = re.compile(r'\s')
    
    cleaned_data = []
    for text, annotations in dataset:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and whitespace_pattern.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and whitespace_pattern.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data


In [40]:
import spacy
from spacy.training import offsets_to_biluo_tags

def debug_entity_alignment(text, entities):
    nlp = spacy.blank('en')  # Create a blank spaCy model
    doc = nlp.make_doc(text)
    biluo_tags = offsets_to_biluo_tags(doc, entities)
    print("Text:", text)
    print("Entities:", entities)
    print("BILUO Tags:", biluo_tags)

# Example usage
text = "Nidhi Pandit Test Engineer - Infosys Limited"
entities = [(13, 29, 'Skills'), (30, 50, 'Skills')]  # Example entities

debug_entity_alignment(text, entities)


Text: Nidhi Pandit Test Engineer - Infosys Limited
Entities: [(13, 29, 'Skills'), (30, 50, 'Skills')]
BILUO Tags: ['O', 'O', '-', '-', '-', '-', '-']




In [117]:

TRAIN_DOCBIN_PATH = './docbin/train_2.spacy'
VAL_DOCBIN_PATH = './docbin/val_2.spacy'

# CLEANED_TRAIN_DATASET = trim_entity_spans(TRAIN_DATASET)
# CLEANED_VAL_DATASET = trim_entity_spans(VAL_DATASET)

# convert_dataset_to_doc_bin(CLEANED_TRAIN_DATASET, TRAIN_DOCBIN_PATH)
# convert_dataset_to_doc_bin(CLEANED_VAL_DATASET, VAL_DOCBIN_PATH)

# TRAIN_DATASET = trim_entity_spans(convert_dataturks_to_spacy(TRAIN_PATH))
TRAIN_DATASET, _ = load_dataset(TRAIN_PATH)
VAL_DATASET, _ = load_dataset(VAL_PATH)




# TRAIN_DATASET, VAL_DATASET = train_test_split(TRAIN_DATASET, test_size=0.2)


In [84]:
import spacy
import random
from spacy.training import Example
from spacy.util import filter_spans

import spacy
import random
from spacy.training import Example
from spacy.util import filter_spans

import spacy
from spacy.training import Example

def train_spacy(train_data, iterations, dropout_rate):
    # Initialize a blank model
    nlp = spacy.blank("en")
    
    # Create the NER pipeline component and add it to the pipeline
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)

    # Add labels to the NER pipeline
    for data in train_data:
    
        
        # print(data['entities'])
        for entity in data['entities']:
            ner.add_label(entity[2])

    # Disable other pipes during training
    pipe_exceptions = ["ner"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    
    # Begin training the NER model
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()

        for iteration in range(iterations):
            print(f"Iteration {iteration}")
            losses = {}
            # Shuffle the training data before each iteration
            for data in train_data:
                text = data['text']
                annotations = {'entities': data['entities']}  # Format as dict
                
                # Create Example object from text and annotations
                example = Example.from_dict(nlp.make_doc(text), annotations)
                # Update the model with the Example
                nlp.update([example], drop=dropout_rate, sgd=optimizer, losses=losses)
            print(f"Losses at iteration {iteration}: {losses}")

    return nlp


In [132]:
nlp = train_spacy(TRAIN_DATASET, iterations=100, dropout_rate= 0.2)


Iteration 0
Losses at iteration 0: {'ner': 14141.217596370803}
Iteration 1
Losses at iteration 1: {'ner': 6069.867130672563}
Iteration 2
Losses at iteration 2: {'ner': 4261.406314664102}
Iteration 3
Losses at iteration 3: {'ner': 3590.9947881091857}
Iteration 4
Losses at iteration 4: {'ner': 3200.2056669127346}
Iteration 5
Losses at iteration 5: {'ner': 3089.7735169190146}
Iteration 6
Losses at iteration 6: {'ner': 3038.2600473654566}
Iteration 7
Losses at iteration 7: {'ner': 2657.2054157114917}
Iteration 8
Losses at iteration 8: {'ner': 2602.787337627715}
Iteration 9
Losses at iteration 9: {'ner': 2546.864725593591}
Iteration 10
Losses at iteration 10: {'ner': 2415.8585150285026}
Iteration 11
Losses at iteration 11: {'ner': 2230.2860653436273}
Iteration 12
Losses at iteration 12: {'ner': 2181.4572818592433}
Iteration 13
Losses at iteration 13: {'ner': 2004.5517171970605}
Iteration 14
Losses at iteration 14: {'ner': 2011.979171023835}
Iteration 15
Losses at iteration 15: {'ner': 1974.

In [None]:
def validate_data(dataset: list):
    """Prints and checks entity spans for leading and trailing whitespace."""
    whitespace_pattern = re.compile(r'\s')

    for data in dataset:
        text = data['text']
        entities = data['entities']
        for start, end, label in entities:
            if (whitespace_pattern.match(text[start]) or whitespace_pattern.match(text[end - 1])):
                print(f"Invalid entity span detected in text: '{text}'")
                print(f"Start: {start}, End: {end}, Label: {label}")
                print(f"Entity Span: '{text[start:end]}'")

# Validate your data
validate_data(trim_entity_spans(TRAIN_DATASET))


Invalid entity span detected in text: 'Sai Dhir
- Email me on Indeed: indeed.com/r/Sai-Dhir/e6ed06ed081f04cf

WORK EXPERIENCE

Sasken Technologies Pvt. Ltd -  Pune, Maharashtra -

January 2017 to Present

ORACLE -

January 2011 to Present

STP is basically a router that realys ss7 messages through various signally points. In project all
STPs were replaced by ORACLE STP due to its advanced features, high end support, flexibility.
The STP is connected to adjacent SEPs and STPs via signaling links. Based on the address fields
of the SS7 messages, the STP routes the messages to the appropriate outgoing signaling link.

Client: ORACLE
Team Size: 4
Role: fetching data, analyzing, monitoring, troubleshooting
Technologies: filezilla, putty
Major Development
➢ Currently working on External Browser Integration for the Payment Gateway
➢ Multiple Shipping methods Inside Order Invoice

ORACLE -  Gurgaon, Haryana -

October 2016 to January 2017

Karizma Order Manager & Karizma Order System

ORACLE -

In [None]:
!python -m spacy init fill-config ./config/base_config.cfg ./config/config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config\config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
import spacy
spacy.require_gpu()

True

In [None]:
%%capture

!python -m spacy train ./config/config.cfg --output ./model_2 --paths.train ./docbin/train.spacy --paths.dev ./docbin/val.spacy --gpu-id 0 --verbose

In [3]:
def cv_parse(text):
    
    MODEL_PATH = './model_ner/'

    nlp_trained = spacy.load(MODEL_PATH)

    doc = nlp_trained(text)
    for ent in doc.ents:
        print(f'{ent.label_.upper():{30}}- {ent.text}')

In [106]:
job_description = """
As Software Engineer - Backend, you will work in a cross-functional project team to ensure a multitude of systems across products, business, and operations integrate efficiently. You’ll be responsible for designing, building, improving, or maintaining Traveloka services related to new products, business models, business growth, market expansion and process optimization. In addition, you will be Identifying pain points in internal business processes and developing effective, reliable and scalable solutions to work within the expected complexity of the processes. You’ll be expected to deliver the best in class architecture, solution and code. The successful candidate will encounter challenges related to information systems, business process and technology.

As a Backend Software Engineer, you are expected to:

Be responsible for designing, building, improving, or maintaining our backend applications, third-party data integration, data API, backend systems, or working with monitoring tools and infrastructure
Work in cross-functional teams and meet great people regularly from top tier technology, consulting, product, or academic background
Be encouraged to speak your mind, propose ideas, influence others, and continuously grow yourself
Participate and contribute to engineering hygiene such as code review, unit testing, and integration testing
Participate and contribute to the solution and architectural design review.
Participate in the service support as on-call
Participate and contribute to innovation and problem-solving

‎

Requirements

Bachelor's degree in Computer Science or equivalent from a reputable university with good academic results is preferred.
Having minimum 3 years of experience in software engineering (Java), application development or system development + experience in RDBMS and NoSQL databases.
Experience in version control (Git/SVN/Mercurial) and familiarity with development collaboration tools (GitHub/Phabricator/BitBucket).
Experience in CI/CD like Jenkins/Travis CI/TeamCity and related technologies is a plus.
Experience in AWS/GCP/Azure and other technologies like Ansible, Containers, Kubernetes etc is a plus.
Strong object-oriented analysis and design skills.
Passion in software engineering, application development, or systems development.
Good business acumen, excellent problem skills and broad understanding of software and system design.
Comfortable working up and down the technology stack.
Curiosity to explore creative solutions and try new things to solve challenging problems to pull it all together into a user accepted solution.
Participation in multiple end-to-end implementations of system integration, data migration, internal business applications, or configuring vendor-provided solutions.
Excellent interpersonal, communication, and influence skills and personal maturity.

"""


In [8]:
# Testing

from PyPDF2 import PdfReader

# Read PDF

PDF_PATH = '../../PDF/Anthonio Obert - Software Developer - CV (1).pdf'

reader = PdfReader(PDF_PATH)
n_pages = len(reader.pages)

extracted_text = ''

for i in range(n_pages):
    page = reader.pages[i]
    extracted_text += page.extract_text().strip()

cv_parse(extracted_text)



SOFT SKILL                    - Capable of working under pressure
EXPERIENCE                    - Manage and maintain student scores for laboratory subjects across six campuses:
EXPERIENCE                    - Create and maintain SQL query
EXPERIENCE                    - Provide student's scores data to identify and improve laboratory processes.
EXPERIENCE                    - Schedule important dates for laboratory activities.
EXPERIENCE                    - Built a Desktop Application (Next.js, Electron, Typescript, Firebase, UML), Facebook Clone (React + Vite, Typescript, Go, 
GraphQL, PostgreSQL), Android Mobile Application (Kotlin, Firebase) for assistant's self-development project. 
EDUCATION
09/2022 - 09/2026 Computer Science 
Bina Nusantara University - Bachelor's Degree 
GPA:


In [137]:
nlp.to_disk('./model_ner/')