In [2]:
import numpy as np
import pandas as pd
import os
import json
import re

In [6]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    training_data = []
    lines = []
    with open(dataturks_JSON_FilePath, 'r') as f:
        lines = f.readlines()
    
    for line in lines:
        data = json.loads(line)
        text = data['content'].replace("\n",' ')
        entities = []
        data_annotations = data['annotation']
        if data_annotations is not None:
            for annotations in data_annotations:
                point = annotations['points'][0]
                labels = annotations['label']
                if not isinstance(labels, list):
                    labels = [labels]
                
                for label in labels:
                    point_start = point['start']
                    point_end = point['end']
                    point_text = point['text']
                    
                    lstrip_diff = len(point_text) - len(point_text.lstrip())
                    rstrip_diff = len(point_text) - len(point_text.rstrip())
                    if lstrip_diff != 0:
                        point_start = point_start + lstrip_diff
                    if rstrip_diff != 0:
                        point_end = point_end - rstrip_diff
                    
                    entities.append((point_start, point_end + 1, label))
                    
        training_data.append((text, {"entities": entities}))
    return training_data
                    
        

In [7]:
training_data = convert_dataturks_to_spacy("./Entity Recognition in Resumes.json")

In [16]:
training_data[0]

("Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

In [13]:
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r"\s")
    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data

In [39]:
data = trim_entity_spans(training_data)

In [15]:
data[0]

["Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to: Bangalore, Karnataka  WORK EXPERIENCE  Application Development Associate  Accenture -  November 2017 to Present  Role: Currently working on Chat-bot. Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input. Also, Training the bot for different possible utterances (Both positive and negative), which will be given as input by the user.  EDUCATION  B.E in Information science and engineering  B.v.b college of engineering and technology -  Hubli, Karnataka  August 2013 to June 2017  12th in Mathematics  Woodbine modern school  April 2011 to March 2013  10th  Kendriya Vidyalaya  April 2001 to March 2011  SKILLS  C (Le

### Overlapping Entities

In [25]:
def clean_entities(training_data):
    
    clean_data = []
    for text, annotation in training_data:
        entities = annotation.get('entities')
        entities_copy = entities.copy()
        
        # append entity only if it is longer than its overlapping entity
        i = 0
        for entity in entities_copy:
            j = 0
            for overlapping_entity in entities_copy:
                # Skip self
                if i != j:
                    e_start, e_end, oe_start, oe_end = entity[0], entity[1], overlapping_entity[0], overlapping_entity[1]
                    # Delete any entity that overlaps, keep if longer
                    if ((e_start >= oe_start and e_start <= oe_end) or (e_end <= oe_end and e_end >= oe_start)) \
                    and ((e_end - e_start) <= (oe_end - oe_start)):
                        entities.remove(entity)
                j += 1
            i += 1
        clean_data.append((text, {'entities': entities}))
                
    return clean_data

In [40]:
data = clean_entities(data)

### Entity Mapping

In [17]:
import spacy

Init Plugin
Init Graph Optimizer
Init Kernel


In [21]:
from spacy.lang.en import English
from spacy.training import offsets_to_biluo_tags

In [22]:
def bilou_tags(data):
    docs = []
    annots = []
    nlp = English()
    for text, annotations in data:
        offsets = annotations['entities']
        doc = nlp(text)
        tags = offsets_to_biluo_tags(doc, offsets)
        docs.append([token.text for token in doc])
        annots.append(tags)
    df_data = pd.DataFrame({'docs':docs, 'annots':annots})
    return df_data

In [28]:
import warnings
warnings.filterwarnings("ignore")

In [31]:
df_data = bilou_tags(data)
df_data.head(5)

Unnamed: 0,docs,annots
0,"[Abhishek, Jha, Application, Development, Asso...","[B-Name, L-Name, B-Designation, I-Designation,..."
1,"[Afreen, Jamadar, Active, member, of, IIIT, Co...","[B-Name, L-Name, O, O, O, O, O, O, O, O, O, U-..."
2,"[Akhil, Yadav, Polemaina, Hyderabad, ,, Telang...","[B-Name, I-Name, L-Name, U-Location, O, O, O, ..."
3,"[Alok, Khandai, Operational, Analyst, (, SQL, ...","[B-Name, L-Name, B-Designation, I-Designation,..."
4,"[Ananya, Chavan, lecturer, -, oracle, tutorial...","[B-Name, L-Name, U-Designation, O, B-Companies..."


In [33]:
df_data.shape

(220, 2)

### Removing Mislabeled Examples

In [34]:
for i in range(len(df_data)):
    if "-" in df_data.loc[i, "annots"]:
        df_data.drop(i, axis='index', inplace=True)
df_data.reset_index(inplace=True)
df_data.shape

(110, 3)

In [41]:
spacy_data = []

for i in range(len(df_data)):
    resume = df_data.loc[i, "docs"]
    annots = df_data.loc[i, "annots"]
    example = []
    for word, annot in zip(resume, annots):
        if not(word.isalnum() or len(word) > 1):
            annot = "O"
        example.append((word, annot))
    spacy_data.append(example)
spacy_data[0]

[('Abhishek', 'B-Name'),
 ('Jha', 'L-Name'),
 ('Application', 'B-Designation'),
 ('Development', 'I-Designation'),
 ('Associate', 'L-Designation'),
 ('-', 'O'),
 ('Accenture', 'U-Companies worked at'),
 (' ', 'O'),
 ('Bengaluru', 'U-Location'),
 (',', 'O'),
 ('Karnataka', 'O'),
 ('-', 'O'),
 ('Email', 'O'),
 ('me', 'O'),
 ('on', 'O'),
 ('Indeed', 'B-Email Address'),
 (':', 'O'),
 ('indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a', 'L-Email Address'),
 (' ', 'O'),
 ('•', 'O'),
 ('To', 'O'),
 ('work', 'O'),
 ('for', 'O'),
 ('an', 'O'),
 ('organization', 'O'),
 ('which', 'O'),
 ('provides', 'O'),
 ('me', 'O'),
 ('the', 'O'),
 ('opportunity', 'O'),
 ('to', 'O'),
 ('improve', 'O'),
 ('my', 'O'),
 ('skills', 'O'),
 ('and', 'O'),
 ('knowledge', 'O'),
 ('for', 'O'),
 ('my', 'O'),
 ('individual', 'O'),
 ('and', 'O'),
 ('company', 'O'),
 ("'s", 'O'),
 ('growth', 'O'),
 ('in', 'O'),
 ('best', 'O'),
 ('possible', 'O'),
 ('ways', 'O'),
 ('.', 'O'),
 (' ', 'O'),
 ('Willing', 'O'),
 ('to', 'O'),
 ('reloca

### From BILOU to SpaCy

In [36]:
data = [[(doc, annot) for doc, annot in zip(df_data["docs"][i], df_data["annots"][i]) if doc.isalnum() or len(doc) > 1] 
         for i in range(len(df_data))]
data

[[('Abhishek', 'B-Name'),
  ('Jha', 'L-Name'),
  ('Application', 'B-Designation'),
  ('Development', 'I-Designation'),
  ('Associate', 'L-Designation'),
  ('Accenture', 'U-Companies worked at'),
  ('Bengaluru', 'U-Location'),
  ('Karnataka', 'O'),
  ('Email', 'O'),
  ('me', 'O'),
  ('on', 'O'),
  ('Indeed', 'B-Email Address'),
  ('indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a', 'L-Email Address'),
  ('To', 'O'),
  ('work', 'O'),
  ('for', 'O'),
  ('an', 'O'),
  ('organization', 'O'),
  ('which', 'O'),
  ('provides', 'O'),
  ('me', 'O'),
  ('the', 'O'),
  ('opportunity', 'O'),
  ('to', 'O'),
  ('improve', 'O'),
  ('my', 'O'),
  ('skills', 'O'),
  ('and', 'O'),
  ('knowledge', 'O'),
  ('for', 'O'),
  ('my', 'O'),
  ('individual', 'O'),
  ('and', 'O'),
  ('company', 'O'),
  ("'s", 'O'),
  ('growth', 'O'),
  ('in', 'O'),
  ('best', 'O'),
  ('possible', 'O'),
  ('ways', 'O'),
  ('Willing', 'O'),
  ('to', 'O'),
  ('relocate', 'O'),
  ('to', 'O'),
  ('Bangalore', 'O'),
  ('Karnataka', 'O'),
  ('

In [42]:
spacy_data1 = []
for example in spacy_data:
    text = ""
    entities = []
    start = 0
    for w, t in example:
        text += w + " "
        if t != "O":
            entities.append((start, start + len(w), t[2: ]))
        start += len(w) + 1
    spacy_data1.append((text, {"entities": entities}))

spacy_data1[0]

("Abhishek Jha Application Development Associate - Accenture   Bengaluru , Karnataka - Email me on Indeed : indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a   • To work for an organization which provides me the opportunity to improve my skills and knowledge for my individual and company 's growth in best possible ways .   Willing to relocate to : Bangalore , Karnataka   WORK EXPERIENCE   Application Development Associate   Accenture -   November 2017 to Present   Role : Currently working on Chat - bot . Developing Backend Oracle PeopleSoft Queries for the Bot which will be triggered based on given input . Also , Training the bot for different possible utterances ( Both positive and negative ) , which will be given as input by the user .   EDUCATION   B.E in Information science and engineering   B.v.b college of engineering and technology -   Hubli , Karnataka   August 2013 to June 2017   12th in Mathematics   Woodbine modern school   April 2011 to March 2013   10th   Kendriya Vidyalaya   Apr

In [38]:
import random
import math

def train_test_split(data, test_size, random_state):

    random.Random(random_state).shuffle(data)
    test_idx = len(data) - math.floor(test_size * len(data))
    train_set = data[0: test_idx]
    test_set = data[test_idx: ]

    return train_set, test_set

In [43]:
train_data, test_data = train_test_split(data, test_size = 0.1, random_state = 42)

In [52]:
from spacy.training.example import Example

def train_spacy():
    nlp = spacy.blank('en') # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
        
    #add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
            
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe!='ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for i in range(10):
            print("Startting iteration: " + str(i))
            random.shuffle(train_data)
            losses = {}
            for text, annotations in train_data:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update(
                    [example],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses
                )
            print(losses)
    return nlp

In [53]:
nlp = train_spacy()

Startting iteration: 0
{'ner': 12714.107547996162}
Startting iteration: 1
{'ner': 5185.351196570317}
Startting iteration: 2
{'ner': 3936.6018085440624}
Startting iteration: 3
{'ner': 3388.3289496835555}
Startting iteration: 4
{'ner': 3472.2929171825235}
Startting iteration: 5
{'ner': 2989.025504717005}
Startting iteration: 6
{'ner': 2873.7848576008696}
Startting iteration: 7
{'ner': 2721.5860096061974}
Startting iteration: 8
{'ner': 2481.70971136244}
Startting iteration: 9
{'ner': 2457.8511689123843}


In [77]:
from itertools import groupby

def doc_to_bilou(nlp, text):
    
    doc = nlp(text)
    tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    entities = []
    for entity, group in groupby(tokens, key=lambda t: t[-1]):
        if not entity:
            continue
        group = list(group)
        _, start_word_idx, _ = group[0]
        last_word, last_word_idx, _ = group[-1]
        end_idx = last_word_idx + len(last_word)
        entities.append((
                start_word_idx,
                end_idx,
                entity
            ))
    #print(entities)
    parse = Example.from_dict(nlp.make_doc(text), {'entities':entities})
    pred_ents = parse.get_aligned_ner()
    
    return pred_ents

In [79]:
y_test = []
y_pred = []

for text, annots in test_data:
    
    gold = Example.from_dict(nlp.make_doc(text), annots)
    ents = gold.get_aligned_ner()
    pred_ents = doc_to_bilou(nlp, text)
    
    y_test.append(ents)
    y_pred.append(pred_ents)

In [84]:
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example

def evaluate(nlp, test_data):
    scorer = Scorer()
    example = []
    for text, annots in test_data:
        pred = nlp(text)
        temp = Example.from_dict(pred, annots)
        example.append(temp)
    scores = scorer.score(example)
    return scores

In [85]:
scores = evaluate(nlp,test_data)

In [86]:
scores

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'tag_acc': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'ents_p': 0.6616915422885572,
 'ents_r': 0.4290322580645161,
 'ents_f': 0.5205479452054794,
 'ents_per_type': {'Name': {'p': 0.9545454545454546,
   'r': 0.9130434782608695,
   'f': 0.9333333333333332},
  'Location': {'p': 0.75, 'r': 0.42857142857142855, 'f': 0.5454545454545454},
  'Email Address': {'p': 0.6842105263157895,
   'r': 0.7647058823529411,
   'f': 0.7222222222222222},
  'Companies worked at': {'p': 0.36666666666666664,
   'r': 0.30985915492957744,
   'f': 0.3358778625954198},
  'Skills': {'p': 0.46153846153846156,
   'r': 0.20689655172413793,
   'f': 0.28571428571428575},
  'Degree': {'p': 0.9545454545454546,
   'r': 0.7777777777777778,
   