In [1]:
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

DATASET_DIR = Path.cwd() / "datasets" / "dataturks"
DATASET_PATH = DATASET_DIR / "Entity Recognition in Resumes.json"

if DATASET_DIR.exists():
    print("Dataset already loaded")
else:
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('dataturks/resume-entities-for-ner', path=DATASET_DIR, quiet=False, unzip=True)

Dataset already loaded


In [2]:
import spacy
import json
import re

  
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    nlp = spacy.blank("en")
    training_data = []
    
    with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n", " ").replace("\t", " ").strip()
        doc = nlp.make_doc(text)
        
        entities = []
        data_annotations = data['annotation']
        
        if data_annotations is not None:
            for annotation in data_annotations:
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]
                
                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text'].strip()
                    
                    if not point_text:
                        continue
                    
                    matches = list(re.finditer(re.escape(point_text), text))
                    if matches:
                        match_start, match_end = matches[0].span()
                        
                        token_starts = [token.idx for token in doc]
                        token_ends = [token.idx + len(token.text) for token in doc]
                        
                        if match_start in token_starts and match_end in token_ends:
                            entities.append((match_start, match_end, label))
                        else:
                            containing_tokens = []
                            for token in doc:
                                if token.idx <= match_start < token.idx + len(token.text):
                                    containing_tokens.append(token)
                                elif token.idx < match_end <= token.idx + len(token.text):
                                    containing_tokens.append(token)
                            
                            if containing_tokens:
                                start_token = containing_tokens[0]
                                end_token = containing_tokens[-1]
                                entities.append((
                                    start_token.idx,
                                    end_token.idx + len(end_token.text),
                                    label
                                ))
        
        training_data.append((text, {"entities": entities}))
    
    return training_data

def trim_entity_spans(data: list) -> list:
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [3]:
data = trim_entity_spans(convert_dataturks_to_spacy(DATASET_PATH))
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [4]:
import random
import math

def train_test_split(data, test_size, random_state):

    random.Random(random_state).shuffle(data)
    test_idx = len(data) - math.floor(test_size * len(data))
    train_set = data[0: test_idx]
    test_set = data[test_idx: ]

    return train_set, test_set

In [5]:
train_data, test_data = train_test_split(data, test_size = 0.1, random_state = 42)

In [6]:
def filter_overlapping_entities(data):
    """Remove overlapping entities from training data"""
    cleaned_data = []
    
    for text, annotations in data:
        entities = annotations.get("entities", [])
        
        if not entities:
            cleaned_data.append((text, annotations))
            continue
        
        entities.sort(key=lambda x: x[0])
        
        non_overlapping = []
        prev_end = -1
        
        for start, end, label in entities:
            if start < prev_end:
                print(f"Warning: Skipping overlapping entity '{text[start:end]}' ({label})")
                continue
            
            non_overlapping.append((start, end, label))
            prev_end = end
        
        cleaned_data.append((text, {"entities": non_overlapping}))
    
    return cleaned_data

In [7]:
train_data = filter_overlapping_entities(train_data)
test_data = filter_overlapping_entities(test_data)



In [8]:
import spacy
from spacy.training import Example

def train_spacy():
    nlp = spacy.blank('en')
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    
    for _, annotations in train_data:
         for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.initialize()
        
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}
            
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                
                nlp.update(
                    [example],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses
                )
            print(losses)
    return nlp

In [9]:
nlp = train_spacy()

Starting iteration 0
{'ner': np.float32(12204.455)}
Starting iteration 1
{'ner': np.float32(3686.8716)}
Starting iteration 2
{'ner': np.float32(2579.921)}
Starting iteration 3
{'ner': np.float32(2505.3833)}
Starting iteration 4
{'ner': np.float32(1832.9576)}
Starting iteration 5
{'ner': np.float32(1684.4795)}
Starting iteration 6
{'ner': np.float32(1543.4381)}
Starting iteration 7
{'ner': np.float32(1472.4783)}
Starting iteration 8
{'ner': np.float32(1449.6616)}
Starting iteration 9
{'ner': np.float32(1326.3341)}


In [10]:
import spacy
from itertools import chain
from sklearn.metrics import accuracy_score

def calculate_ner_accuracy(nlp, test_data):
    all_true_labels = []
    all_pred_labels = []
    
    for text, annotations in test_data:
        doc = nlp.make_doc(text)
        true_entities = annotations.get("entities", [])
        
        true_labels = ['O'] * len(doc)
        for start, end, label in true_entities:
            entity_text = text[start:end]
            
            entity_tokens = []
            for token in doc:
                if token.idx >= start and token.idx + len(token.text) <= end:
                    entity_tokens.append(token)
            
            if entity_tokens:
                for i, token in enumerate(entity_tokens):
                    if len(entity_tokens) == 1:
                        true_labels[token.i] = f'U-{label}' 
                    elif i == 0:
                        true_labels[token.i] = f'B-{label}'
                    elif i == len(entity_tokens) - 1:
                        true_labels[token.i] = f'L-{label}'
                    else:
                        true_labels[token.i] = f'I-{label}' 
        
        pred_doc = nlp(text)
        pred_labels = [token.ent_iob_ + ('-' + token.ent_type_ if token.ent_type_ else '') 
                      for token in pred_doc]
        
        pred_labels = [label.replace('B-', 'B-').replace('I-', 'I-').replace('O', 'O') 
                      for label in pred_labels]
        
        min_len = min(len(true_labels), len(pred_labels))
        all_true_labels.extend(true_labels[:min_len])
        all_pred_labels.extend(pred_labels[:min_len])
    
    accuracy = accuracy_score(all_true_labels, all_pred_labels)
    return accuracy



In [11]:
accuracy = calculate_ner_accuracy(nlp, test_data)
print(f"SPACY Accuracy: {accuracy:.4f}")

SPACY Accuracy: 0.9637
