In [None]:
import json
import random
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
from spacy.training import Example
import pandas as pd
from sklearn.metrics import classification_report
from spacy.training import offsets_to_biluo_tags
from spacy.tokens import Span
from spacy.training import Example, offsets_to_biluo_tags
from spacy.util import minibatch
from sklearn.metrics import precision_recall_fscore_support
import re
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from pathlib import Path


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import random
import spacy
import re
from spacy.training import Example, offsets_to_biluo_tags
from spacy.util import minibatch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support


class SpacyNERTrainer:
    """Train and evaluate a blank-English spaCy NER model from a JSON-lines file.

    Constructing an instance runs the whole pipeline: load the records, split
    them 80/10/10 into train/validation/test, convert every split to spaCy's
    ``(text, {"entities": [(start, end, label), ...]})`` tuples, create a blank
    pipeline with an ``ner`` component, register the labels seen in the
    training split and train the model.

    Each input record is read as a dict with a ``content`` string and an
    ``annotation`` list whose items carry ``label`` (list of names) and
    ``points`` (dicts with ``start``, ``end`` and ``text``).
    """

    def __init__(self, data_path):
        """Load ``data_path``, build the splits and train the model.

        Args:
            data_path: Path of the JSON-lines annotation file.
        """
        self.data_path = data_path
        self.raw_data = self.load_jsonl_data()
        self.train_data, temp_data = train_test_split(self.raw_data, test_size=0.2, random_state=42)
        self.val_data, self.test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

        self.train_spacy = self.convert_to_spacy_format(self.train_data)
        self.val_spacy = self.convert_to_spacy_format(self.val_data)
        self.test_spacy = self.convert_to_spacy_format(self.test_data)

        self.nlp, self.ner = self.initialize_model()
        self.labels = self.add_labels()
        self.train_model()

    def load_jsonl_data(self):
        """Read ``self.data_path`` as JSON lines and return the parsed records.

        Every line is decoded on its own, so a malformed line is reported and
        skipped without discarding the records around it. Blank lines are
        ignored silently.

        Returns:
            list: One parsed JSON value per valid line, in file order.
        """
        records = []
        with open(self.data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping invalid JSON line: {e}")
        return records

    def filter_valid_entities(self, text, entities):
        """Drop duplicate, overlapping and token-misaligned entity spans.

        Spans are visited by start offset, longest first, so when two spans
        start at the same position the longer one wins. Every rejected span is
        reported on stdout.

        Args:
            text: The document text the offsets refer to.
            entities: Iterable of ``(start, end, label)`` character spans. The
                caller's sequence is not modified.

        Returns:
            list: The accepted ``(start, end, label)`` tuples in start order.
        """
        nlp = spacy.blank("en")
        doc = nlp.make_doc(text)
        valid_entities = []
        seen = set()

        # sorted() instead of list.sort() so the caller's list keeps its order.
        ordered = sorted(entities, key=lambda x: (x[0], -(x[1] - x[0])))
        last_end = -1

        for start, end, label in ordered:
            if (start, end, label) in seen:
                print(f"Skipping duplicate entity: ({start}, {end}, {label})")
                continue
            if start < last_end:
                print(f"Skipping overlapping entity: ({start}, {end}, {label})")
                continue
            try:
                # A '-' tag marks a token the span boundaries do not line up with.
                biluo = offsets_to_biluo_tags(doc, [(start, end, label)])
                if '-' not in biluo:
                    valid_entities.append((start, end, label))
                    last_end = end
                    seen.add((start, end, label))
                else:
                    print(f"Skipping misaligned entity: ({start}, {end}, {label})")
            except ValueError as e:
                print(f"Skipping entity due to alignment error: ({start}, {end}, {label}) - {e}")
        return valid_entities

    def convert_to_spacy_format(self, data):
        """Convert raw annotation records into spaCy training tuples.

        Newlines in ``content`` are replaced by single spaces, which keeps all
        character offsets valid. Annotations without a usable label list get a
        label guessed from the text of their first point. Records that end up
        with no valid entity are left out.

        Args:
            data: Iterable of record dicts (see the class docstring).

        Returns:
            list: ``(text, {"entities": [(start, end, label), ...]})`` tuples.
        """
        spacy_data = []

        for item in data:
            text = item['content'].replace("\n", " ")
            entities = []

            # `or []` also covers records whose annotation/points value is null.
            for ann in item.get('annotation') or []:
                points = ann.get('points') or []
                labels = ann.get('label', [])
                if not labels or not isinstance(labels, list):
                    point_text = (points[0].get('text') or '').lower() if points else ''
                    if 'b.b.m' in point_text or 'b.e' in point_text or 'mba' in point_text:
                        labels = ['Degree']
                    elif 'oracle' in point_text or 'microsoft' in point_text or 'accenture' in point_text:
                        labels = ['Companies worked at']
                    else:
                        labels = ['UNKNOWN']
                    print(f"Assigned label {labels} to annotation: {ann}")

                for point in points:
                    try:
                        point_start = point['start']
                        point_end = point['end']
                        point_text = point.get('text') or ''

                        # Shrink the span so it excludes whitespace at either
                        # end of the annotated text.
                        point_start += len(point_text) - len(point_text.lstrip())
                        point_end -= len(point_text) - len(point_text.rstrip())

                        # The +1 turns the stored end offset into the exclusive
                        # end that is handed to spaCy.
                        for label in labels:
                            entities.append((point_start, point_end + 1, label))
                    except (KeyError, TypeError) as e:
                        print(f"Skipping invalid point in annotation {ann}: {e}")
                        continue

            filtered_entities = self.filter_valid_entities(text, entities)
            if filtered_entities:
                spacy_data.append((text, {"entities": filtered_entities}))

        return spacy_data

    def initialize_model(self):
        """Create a blank English pipeline that contains an ``ner`` component.

        Returns:
            tuple: ``(nlp, ner)`` — the pipeline and its NER component.
        """
        nlp = spacy.blank("en")
        if "ner" not in nlp.pipe_names:
            ner = nlp.add_pipe("ner")
        else:
            ner = nlp.get_pipe("ner")
        return nlp, ner

    def add_labels(self):
        """Register every entity label of the training split with the NER pipe.

        Labels are added in sorted order so the registration order does not
        depend on set iteration order.

        Returns:
            set: The distinct labels found in ``self.train_spacy``.
        """
        labels = {
            label
            for _, annotations in self.train_spacy
            for _, _, label in annotations.get('entities', [])
        }
        for label in sorted(labels):
            self.ner.add_label(label)
        return labels

    def train_model(self, n_iter=50):
        """Train the NER component on ``self.train_spacy``.

        After every pass over the training data the loss reported by
        ``evaluate_model`` on the validation split is printed.

        Args:
            n_iter: Number of passes over the training data.
        """
        # initialize()/select_pipes() are the spaCy v3 names of the deprecated
        # begin_training()/disable_pipes() calls.
        optimizer = self.nlp.initialize()
        other_pipes = [pipe for pipe in self.nlp.pipe_names if pipe != "ner"]

        with self.nlp.select_pipes(disable=other_pipes):
            for itn in range(n_iter):
                random.shuffle(self.train_spacy)
                losses = {}
                batches = minibatch(self.train_spacy, size=8)

                for batch in batches:
                    # Each example is still passed to nlp.update on its own, so
                    # one bad example can be skipped without losing its batch.
                    for text, annotations in batch:
                        doc = self.nlp.make_doc(text)
                        try:
                            example = Example.from_dict(doc, annotations)
                            self.nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
                        except ValueError as e:
                            print(f"Skipping example due to error: {e}")
                            continue

                val_loss = self.evaluate_model(self.val_spacy)
                print(f"Iteration {itn + 1}, Losses: {losses}, Validation Loss: {val_loss}")

    def evaluate_model(self, data):
        """Return the accumulated ``ner`` loss over ``data``.

        NOTE(review): the loss is obtained through ``nlp.update``, which is a
        training call. No ``sgd`` is passed here; in spaCy v3 that appears to
        fall back to the pipeline's default optimizer, which would mean the
        model is also fitted on ``data`` — confirm, and consider scoring with
        ``nlp.evaluate`` instead.

        Args:
            data: ``(text, annotations)`` tuples as produced by
                ``convert_to_spacy_format``.

        Returns:
            float: The summed ``ner`` loss, or ``0.0`` when none was recorded.
        """
        losses = {}
        for text, annotations in data:
            doc = self.nlp.make_doc(text)
            try:
                example = Example.from_dict(doc, annotations)
                self.nlp.update([example], losses=losses)
            except ValueError as e:
                print(f"Skipping evaluation example due to error: {e}")
                continue
        return losses.get("ner", 0.0)

    def print_evaluation_metrics(self):
        """Print entity-level and token-level score tables for the test split."""
        metrics = self.calculate_metrics()

        self._print_metrics_section(
            "NER LEVEL METRICS", metrics['ner_level'], 'f1',
            "No NER level metrics available",
        )
        self._print_metrics_section(
            "TOKEN LEVEL METRICS", metrics['token_level'], 'f1_score',
            "No token level metrics available",
        )

    @staticmethod
    def _print_metrics_section(title, section, f1_key, empty_message):
        """Print one banner plus a per-label table, or ``empty_message`` if empty."""
        print("\n" + "="*50)
        print(title.center(50))
        print("="*50)
        if not section:
            print(empty_message)
            return

        # Imported here so every section has the name in scope on its own and
        # tabulate is only required when there is a table to draw.
        from tabulate import tabulate

        rows = [
            [
                label,
                f"{scores['precision']:.3f}",
                f"{scores['recall']:.3f}",
                f"{scores[f1_key]:.3f}",
                scores['support'],
            ]
            for label, scores in section.items()
            if label != "overall"
        ]

        # Add overall row
        overall = section.get('overall', {})
        if overall:
            rows.append([
                "OVERALL",
                f"{overall['precision']:.3f}",
                f"{overall['recall']:.3f}",
                f"{overall[f1_key]:.3f}",
                overall['support'],
            ])

        print(tabulate(
            rows,
            headers=["Label", "Precision", "Recall", "F1-Score", "Support"],
            tablefmt="grid",
            floatfmt=".3f"
        ))

    def calculate_metrics(self):
        """Score the trained model on ``self.test_spacy``.

        Returns:
            dict: ``{"ner_level": {...}, "token_level": {...}}``. Both map each
            label, plus ``"overall"`` (weighted average), to precision, recall,
            F1 and support. The F1 key is ``"f1"`` at entity level and
            ``"f1_score"`` at token level.
        """
        ner_true, ner_pred = [], []
        token_true, token_pred = [], []

        for text, annotations in self.test_spacy:
            true_entities = annotations.get("entities", [])
            doc = self.nlp(text)
            pred_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

            ner_true.append(true_entities)
            ner_pred.append(pred_entities)

            # Gold labels are projected onto a tokenizer-only doc of the same text.
            token_true.extend(self._token_labels(self.nlp.make_doc(text), true_entities))
            token_pred.extend(self._token_labels(doc, pred_entities))

        flat_true_labels, flat_pred_labels = self._align_entity_labels(ner_true, ner_pred)

        # "O" only marks a missing counterpart, so it gets no per-label row.
        # NOTE(review): the weighted "overall" row is computed without a label
        # filter, so the "O" rows still take part in that average.
        ner_labels = sorted(set(flat_true_labels + flat_pred_labels) - {"O"})
        ner_metrics = self._summarize(flat_true_labels, flat_pred_labels, ner_labels, "f1")

        token_labels = sorted(set(token_true + token_pred))
        token_metrics = self._summarize(token_true, token_pred, token_labels, "f1_score")

        return {
            "ner_level": ner_metrics,
            "token_level": token_metrics
        }

    @staticmethod
    def _token_labels(doc, entities):
        """Return one label per token: the entity label covering it, else ``"O"``."""
        labels = ["O"] * len(doc)
        for start_char, end_char, label in entities:
            for token in doc:
                if token.idx >= start_char and token.idx + len(token) <= end_char:
                    labels[token.i] = label
        return labels

    @staticmethod
    def _align_entity_labels(ner_true, ner_pred):
        """Pair gold and predicted entities by exact character span.

        Every gold entity yields one row: its label against the label predicted
        for the same span, or ``"O"`` when no prediction has that span. Every
        prediction whose span matches no gold entity yields one ``"O"`` row. A
        prediction with a correct span but a wrong label is therefore counted
        once, not a second time as a spurious prediction.

        Returns:
            tuple: ``(true_labels, pred_labels)`` as two parallel lists.
        """
        true_labels, pred_labels = [], []
        for true_ents, pred_ents in zip(ner_true, ner_pred):
            pred_by_span = {}
            for pred_ent in pred_ents:
                # setdefault keeps the first prediction seen for a span.
                pred_by_span.setdefault(tuple(pred_ent[:2]), pred_ent[2])
            true_spans = {tuple(true_ent[:2]) for true_ent in true_ents}

            for true_ent in true_ents:
                true_labels.append(true_ent[2])
                pred_labels.append(pred_by_span.get(tuple(true_ent[:2]), "O"))

            for pred_ent in pred_ents:
                if tuple(pred_ent[:2]) not in true_spans:
                    true_labels.append("O")
                    pred_labels.append(pred_ent[2])
        return true_labels, pred_labels

    @staticmethod
    def _summarize(y_true, y_pred, labels, f1_key):
        """Build the per-label and weighted ``"overall"`` score dict for one level."""
        summary = {}
        if not (y_true and y_pred):
            return summary

        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, labels=labels, zero_division=0
        )
        for label, p, r, f, s in zip(labels, precision, recall, f1, support):
            summary[label] = {
                "precision": p, "recall": r, f1_key: f, "support": s
            }

        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted", zero_division=0
        )
        # Store the weighted f1 itself, not the last per-label value of the loop.
        summary["overall"] = {
            "precision": precision, "recall": recall, f1_key: f1,
            "support": len(y_true)
        }
        return summary


# Usage
# Absolute path of the annotation file on the mounted Google Drive.
data_path = "/content/drive/MyDrive/Entity.json"
# Constructing the trainer loads the data, builds the splits and trains the model.
trainer = SpacyNERTrainer(data_path)
# Compute the scores on the test split and print them as tables.
trainer.print_evaluation_metrics()




Skipping misaligned entity: (1801, 1842, Email Address)




Skipping misaligned entity: (1012, 1021, Companies worked at)
Skipping misaligned entity: (1050, 1071, Degree)
Skipping misaligned entity: (1073, 1095, College Name)
Skipping misaligned entity: (1131, 1135, Graduation Year)
Skipping misaligned entity: (1263, 1378, Skills)
Skipping misaligned entity: (1502, 1600, Skills)
Skipping misaligned entity: (1514, 1523, Companies worked at)
Skipping misaligned entity: (1573, 1577, Graduation Year)
Skipping misaligned entity: (1860, 1872, Name)




Skipping misaligned entity: (3108, 3150, Email Address)
Skipping misaligned entity: (3232, 3241, Location)
Skipping misaligned entity: (3261, 3265, Graduation Year)
Skipping misaligned entity: (6599, 6666, College Name)
Skipping misaligned entity: (6703, 6707, Graduation Year)
Skipping misaligned entity: (6733, 6848, Skills)




Skipping misaligned entity: (1894, 2173, Skills)
Skipping overlapping entity: (46, 55, Location)
Skipping overlapping entity: (3803, 3807, Graduation Year)
Skipping overlapping entity: (3981, 3990, Location)




Skipping misaligned entity: (1667, 1705, Email Address)
Skipping overlapping entity: (2528, 2532, College Name)
Skipping duplicate entity: (13, 40, Designation)
Skipping misaligned entity: (1173, 1200, Designation)
Skipping misaligned entity: (1180, 1200, Designation)
Skipping misaligned entity: (1202, 1217, Companies worked at)
Skipping misaligned entity: (1531, 1551, Designation)
Skipping misaligned entity: (1553, 1583, Companies worked at)
Skipping misaligned entity: (3982, 4412, Skills)




Skipping duplicate entity: (13, 34, Designation)
Skipping duplicate entity: (370, 391, Designation)
Skipping misaligned entity: (1192, 1234, Email Address)
Skipping misaligned entity: (1767, 1808, Email Address)




Skipping overlapping entity: (1161, 1164, Companies worked at)
Skipping overlapping entity: (2886, 2889, Companies worked at)
Skipping misaligned entity: (4215, 4219, Graduation Year)
Skipping misaligned entity: (5312, 5367, Degree)
Skipping misaligned entity: (5369, 5423, College Name)
Skipping misaligned entity: (5457, 5461, Graduation Year)
Skipping misaligned entity: (5471, 5838, Skills)




Skipping misaligned entity: (6642, 6696, Degree)
Skipping misaligned entity: (6698, 6729, College Name)
Skipping misaligned entity: (6778, 6843, Degree)
Skipping misaligned entity: (6845, 6875, College Name)
Skipping misaligned entity: (6930, 7494, Skills)
Skipping misaligned entity: (2177, 2180, Degree)
Skipping misaligned entity: (2182, 2210, College Name)




Skipping misaligned entity: (2109, 2151, Email Address)
Skipping misaligned entity: (1811, 1848, Email Address)
Skipping misaligned entity: (2331, 2339, Degree)




Skipping overlapping entity: (4186, 4191, Companies worked at)
Skipping duplicate entity: (34, 49, Companies worked at)
Skipping overlapping entity: (941, 946, Companies worked at)
Skipping overlapping entity: (1077, 1120, Email Address)
Skipping overlapping entity: (1198, 1203, Companies worked at)
Skipping overlapping entity: (1319, 1324, Companies worked at)
Skipping overlapping entity: (1794, 1799, Companies worked at)




Skipping misaligned entity: (283, 327, Email Address)
Skipping misaligned entity: (1754, 1792, Email Address)




Skipping misaligned entity: (2474, 2514, Email Address)
Skipping misaligned entity: (2714, 2726, Degree)
Skipping misaligned entity: (2728, 2742, College Name)
Skipping misaligned entity: (2877, 3030, Skills)
Skipping overlapping entity: (21, 35, Designation)
Skipping overlapping entity: (823, 837, Designation)
Skipping misaligned entity: (2211, 2254, Email Address)
Skipping overlapping entity: (6682, 6687, Companies worked at)
Skipping overlapping entity: (6693, 6698, Companies worked at)
Skipping overlapping entity: (6708, 6713, Companies worked at)
Skipping overlapping entity: (7028, 7033, Companies worked at)
Skipping overlapping entity: (7068, 7073, Companies worked at)
Skipping overlapping entity: (7127, 7132, Companies worked at)
Skipping overlapping entity: (7153, 7158, Companies worked at)




Skipping misaligned entity: (389, 393, Graduation Year)
Skipping misaligned entity: (978, 1026, College Name)
Skipping misaligned entity: (1027, 1031, Graduation Year)
Skipping misaligned entity: (1089, 1093, Graduation Year)
Skipping misaligned entity: (1358, 1400, Email Address)
Skipping overlapping entity: (3535, 3541, Companies worked at)
Skipping overlapping entity: (3714, 3720, Companies worked at)
Skipping overlapping entity: (458, 467, Skills)




Skipping misaligned entity: (2529, 2573, Email Address)
Skipping misaligned entity: (3110, 3846, Skills)
Skipping misaligned entity: (3878, 3937, Degree)
Skipping overlapping entity: (381, 390, Companies worked at)
Skipping overlapping entity: (411, 420, Companies worked at)
Skipping overlapping entity: (16, 25, Companies worked at)
Skipping overlapping entity: (26, 58, Designation)
Skipping overlapping entity: (539, 548, Companies worked at)
Skipping overlapping entity: (549, 581, Designation)




Skipping misaligned entity: (1794, 1826, Degree)
Skipping misaligned entity: (1338, 1345, Location)
Skipping misaligned entity: (3711, 3723, Degree)
Skipping misaligned entity: (3725, 3742, College Name)
Skipping misaligned entity: (3758, 4638, Skills)




Skipping misaligned entity: (3173, 3200, Degree)
Skipping misaligned entity: (3202, 3216, College Name)
Skipping misaligned entity: (3234, 3294, Degree)
Skipping misaligned entity: (3296, 3311, College Name)
Skipping misaligned entity: (3321, 3376, Skills)
Skipping misaligned entity: (1454, 1499, Email Address)
Skipping overlapping entity: (3939, 3948, Companies worked at)
Skipping misaligned entity: (4167, 4176, Companies worked at)




Skipping misaligned entity: (1394, 1437, Email Address)
Skipping duplicate entity: (15, 44, Designation)
Skipping duplicate entity: (47, 57, Companies worked at)
Skipping misaligned entity: (1678, 1688, Companies worked at)
Skipping misaligned entity: (1435, 1480, Email Address)




Skipping misaligned entity: (1412, 1421, Companies worked at)
Skipping misaligned entity: (1909, 1951, Email Address)
Skipping misaligned entity: (2547, 2550, Degree)
Skipping misaligned entity: (2552, 2569, College Name)
Skipping misaligned entity: (2684, 2688, Location)
Skipping misaligned entity: (2734, 2845, Skills)
Skipping misaligned entity: (2889, 3089, Skills)
Skipping overlapping entity: (894, 897, Skills)
Skipping overlapping entity: (1153, 1183, College Name)
Skipping misaligned entity: (1210, 1247, Email Address)
Skipping overlapping entity: (1742, 1745, Skills)
Skipping overlapping entity: (1765, 1771, Skills)
Skipping overlapping entity: (1811, 1815, Skills)
Skipping overlapping entity: (1844, 1860, Skills)




Skipping misaligned entity: (2163, 2205, Email Address)
Skipping misaligned entity: (792, 807, Designation)
Skipping misaligned entity: (809, 834, Companies worked at)
Skipping misaligned entity: (1094, 1134, Email Address)
Skipping misaligned entity: (1445, 1470, Companies worked at)
Skipping misaligned entity: (2025, 2040, Designation)
Skipping misaligned entity: (2042, 2067, Companies worked at)
Skipping misaligned entity: (2080, 2086, Degree)
Skipping misaligned entity: (2088, 2130, College Name)
Skipping overlapping entity: (38, 44, Companies worked at)
Skipping misaligned entity: (1789, 1827, Email Address)
Skipping misaligned entity: (20, 40, Designation)




Skipping misaligned entity: (973, 1703, Skills)
Skipping misaligned entity: (2022, 2061, Email Address)
Skipping overlapping entity: (2128, 2143, Companies worked at)




Skipping misaligned entity: (872, 875, Degree)
Skipping misaligned entity: (877, 895, College Name)
Skipping misaligned entity: (971, 1015, Email Address)




Skipping misaligned entity: (5795, 5812, Companies worked at)
Skipping misaligned entity: (5853, 5870, Companies worked at)
Skipping misaligned entity: (5913, 5930, Companies worked at)
Skipping misaligned entity: (5995, 6012, Companies worked at)
Skipping overlapping entity: (15, 34, Designation)
Skipping misaligned entity: (2999, 3043, Email Address)
Skipping overlapping entity: (4708, 4717, Companies worked at)




Skipping misaligned entity: (5167, 5186, Designation)
Skipping misaligned entity: (10352, 10361, Companies worked at)
Skipping misaligned entity: (10409, 10418, Companies worked at)
Skipping misaligned entity: (10611, 10620, Companies worked at)
Skipping misaligned entity: (11438, 11447, Companies worked at)
Skipping misaligned entity: (14240, 14249, Companies worked at)
Skipping misaligned entity: (17215, 17234, Designation)
Skipping misaligned entity: (17772, 17791, Designation)




Skipping misaligned entity: (18315, 18334, Designation)
Skipping misaligned entity: (19383, 19402, Designation)




Skipping misaligned entity: (2176, 2215, Email Address)




Skipping misaligned entity: (1877, 1890, Location)
Skipping misaligned entity: (4058, 4101, Degree)
Skipping misaligned entity: (4232, 4330, Skills)




Skipping misaligned entity: (174, 183, Degree)
Skipping overlapping entity: (43, 50, Companies worked at)
Skipping overlapping entity: (996, 1003, Companies worked at)
Skipping overlapping entity: (1106, 1113, Companies worked at)
Skipping overlapping entity: (729, 735, Location)
Assigned label ['Degree'] to annotation: {'label': [], 'points': [{'start': 7878, 'end': 7882, 'text': 'B.B.M'}]}




Skipping misaligned entity: (1432, 1470, Designation)
Skipping misaligned entity: (1471, 1478, Companies worked at)
Skipping misaligned entity: (2090, 2137, Email Address)
Skipping misaligned entity: (5840, 5847, Companies worked at)
Skipping misaligned entity: (7878, 7883, Degree)
Skipping overlapping entity: (4231, 4238, Companies worked at)




Skipping misaligned entity: (126, 132, Years of Experience)
Skipping misaligned entity: (368, 409, Email Address)




Skipping misaligned entity: (700, 715, Designation)
Skipping misaligned entity: (718, 759, College Name)
Skipping misaligned entity: (763, 769, Location)
Skipping misaligned entity: (4359, 4365, Location)
Skipping misaligned entity: (6897, 6934, Degree)
Skipping misaligned entity: (6936, 6972, College Name)
Skipping misaligned entity: (6994, 7349, Skills)
Skipping misaligned entity: (1234, 1277, Email Address)
Assigned label ['Companies worked at'] to annotation: {'label': [], 'points': [{'start': 2585, 'end': 2590, 'text': 'Oracle'}]}
Skipping overlapping entity: (60, 66, Companies worked at)
Skipping overlapping entity: (263, 269, Companies worked at)
Skipping overlapping entity: (556, 562, Companies worked at)
Skipping overlapping entity: (2184, 2190, Companies worked at)
Skipping overlapping entity: (2407, 2413, Companies worked at)
Skipping overlapping entity: (2562, 2568, Skills)
Skipping overlapping entity: (2585, 2591, Companies worked at)
Skipping overlapping entity: (2665, 26



Skipping misaligned entity: (370, 405, Designation)
Skipping misaligned entity: (409, 416, Companies worked at)
Skipping misaligned entity: (515, 550, Designation)
Skipping misaligned entity: (554, 561, Companies worked at)
Skipping misaligned entity: (3484, 3499, College Name)
Skipping misaligned entity: (3533, 3537, Graduation Year)
Skipping misaligned entity: (3539, 3544, Degree)
Skipping misaligned entity: (3546, 3559, College Name)
Skipping misaligned entity: (3598, 3602, Graduation Year)
Skipping misaligned entity: (3660, 3664, Graduation Year)




Skipping misaligned entity: (2272, 2316, Email Address)
Skipping misaligned entity: (962, 1095, Skills)
Skipping overlapping entity: (6861, 6870, Companies worked at)




Skipping misaligned entity: (35, 48, Years of Experience)
Skipping misaligned entity: (1345, 1354, Companies worked at)
Skipping misaligned entity: (2302, 2320, Designation)
Skipping misaligned entity: (2322, 2331, Companies worked at)
Skipping misaligned entity: (2396, 2399, Degree)
Skipping misaligned entity: (2416, 2493, Skills)
Skipping misaligned entity: (2537, 2756, Skills)
Skipping misaligned entity: (2537, 2546, Companies worked at)
Skipping misaligned entity: (2733, 2742, Companies worked at)




Skipping misaligned entity: (2391, 2433, Email Address)
Skipping duplicate entity: (10, 32, Designation)
Skipping misaligned entity: (396, 418, Designation)
Skipping misaligned entity: (420, 449, Companies worked at)
Skipping misaligned entity: (552, 610, Degree)
Skipping misaligned entity: (612, 640, College Name)
Skipping misaligned entity: (676, 680, Graduation Year)
Skipping misaligned entity: (800, 858, Skills)
Skipping misaligned entity: (872, 911, Email Address)
Skipping overlapping entity: (295, 304, Companies worked at)




Skipping misaligned entity: (1143, 1163, Designation)
Skipping misaligned entity: (1167, 1182, Companies worked at)
Skipping misaligned entity: (3373, 3419, Degree)
Skipping misaligned entity: (3421, 3462, College Name)
Skipping misaligned entity: (3501, 3557, Degree)
Skipping misaligned entity: (3559, 3589, College Name)
Skipping misaligned entity: (3615, 3673, Degree)
Skipping misaligned entity: (3675, 3704, College Name)
Skipping misaligned entity: (3753, 3794, College Name)
Skipping misaligned entity: (3828, 3931, Skills)
Skipping overlapping entity: (1258, 1262, Location)
Skipping misaligned entity: (1308, 1349, Email Address)




Skipping misaligned entity: (1656, 1698, Email Address)
Skipping overlapping entity: (116, 122, Companies worked at)
Skipping overlapping entity: (463, 469, Companies worked at)




Skipping misaligned entity: (3813, 3814, Skills)
Skipping misaligned entity: (4068, 4069, Skills)




Skipping misaligned entity: (1945, 1989, Email Address)




Skipping misaligned entity: (2473, 2498, Designation)
Skipping misaligned entity: (277, 328, Email Address)




Skipping misaligned entity: (3111, 3152, Email Address)




Skipping misaligned entity: (1361, 1408, Email Address)




Skipping misaligned entity: (1103, 1131, Degree)
Skipping misaligned entity: (1133, 1145, College Name)
Skipping misaligned entity: (1467, 1476, Companies worked at)
Skipping misaligned entity: (1750, 1759, Companies worked at)
Skipping misaligned entity: (2035, 2077, Email Address)




Skipping misaligned entity: (2907, 2938, Designation)
Skipping misaligned entity: (2907, 2937, Designation)
Skipping misaligned entity: (3078, 3082, Location)
Skipping misaligned entity: (4490, 4494, Location)
Skipping misaligned entity: (5268, 5272, Location)
Skipping misaligned entity: (10893, 10953, Degree)
Skipping misaligned entity: (10955, 10999, College Name)
Skipping misaligned entity: (11015, 11048, Degree)
Skipping misaligned entity: (11050, 11085, College Name)
Skipping misaligned entity: (11132, 11234, Skills)
Skipping misaligned entity: (265, 307, Email Address)




Skipping misaligned entity: (1369, 1413, Email Address)
Skipping misaligned entity: (3996, 3999, Skills)




Skipping misaligned entity: (50, 56, Companies worked at)
Skipping duplicate entity: (667, 670, Skills)
Skipping duplicate entity: (3243, 3246, Skills)
Skipping duplicate entity: (11578, 11581, Skills)
Skipping duplicate entity: (13883, 13886, Skills)




Skipping misaligned entity: (1305, 1347, Email Address)




Skipping misaligned entity: (5670, 5780, Skills)




Skipping misaligned entity: (268, 288, Companies worked at)
Skipping overlapping entity: (1342, 1348, Companies worked at)




Skipping misaligned entity: (4368, 4396, Degree)
Skipping misaligned entity: (4398, 4410, College Name)
Skipping misaligned entity: (4431, 4443, College Name)
Skipping misaligned entity: (4445, 4449, Designation)
Skipping misaligned entity: (4459, 4959, Skills)




Skipping misaligned entity: (2105, 2148, Email Address)




Skipping misaligned entity: (34, 45, Companies worked at)
Skipping misaligned entity: (1718, 1729, Companies worked at)
Skipping misaligned entity: (1960, 1971, Companies worked at)
Skipping misaligned entity: (2178, 2223, Email Address)
Skipping overlapping entity: (497, 506, Companies worked at)




Skipping misaligned entity: (1155, 1199, Email Address)
Skipping overlapping entity: (203, 207, Skills)
Skipping overlapping entity: (210, 213, Skills)
Skipping overlapping entity: (819, 835, Designation)
Skipping misaligned entity: (1085, 1125, Email Address)




Skipping misaligned entity: (1943, 2050, Skills)
Skipping overlapping entity: (1438, 1447, Companies worked at)
Skipping overlapping entity: (1476, 1485, Companies worked at)
Skipping misaligned entity: (2562, 2597, Skills)




Skipping misaligned entity: (1522, 1566, Email Address)




Skipping misaligned entity: (948, 1179, Skills)
Skipping overlapping entity: (1865, 1868, Skills)
Skipping overlapping entity: (2058, 2074, Skills)
Skipping overlapping entity: (2076, 2079, Skills)




Skipping misaligned entity: (330, 368, Designation)
Skipping misaligned entity: (370, 387, Companies worked at)
Skipping misaligned entity: (1622, 1639, Companies worked at)
Skipping misaligned entity: (2311, 2331, Designation)
Skipping misaligned entity: (2333, 2340, Companies worked at)
Skipping misaligned entity: (3054, 3363, Skills)
Skipping overlapping entity: (1209, 1215, Companies worked at)
Skipping overlapping entity: (1417, 1423, Companies worked at)
Skipping overlapping entity: (1696, 1702, Companies worked at)
Skipping overlapping entity: (1749, 1755, Companies worked at)
Skipping misaligned entity: (723, 797, Skills)
Skipping misaligned entity: (814, 842, College Name)
Skipping misaligned entity: (975, 1020, Skills)




Skipping misaligned entity: (774, 896, Skills)
Skipping overlapping entity: (4774, 4778, Location)
Skipping misaligned entity: (793, 810, College Name)
Skipping misaligned entity: (870, 894, Skills)




Skipping misaligned entity: (7767, 7803, Degree)
Skipping misaligned entity: (7805, 7823, College Name)
Skipping misaligned entity: (7849, 7868, Degree)
Skipping misaligned entity: (7872, 7890, College Name)
Skipping misaligned entity: (7924, 8040, Skills)
Skipping misaligned entity: (2392, 2434, Email Address)
Skipping misaligned entity: (8133, 8136, Degree)
Skipping misaligned entity: (8133, 8136, Degree)
Skipping misaligned entity: (8138, 8165, College Name)
Skipping overlapping entity: (0, 4, Location)
Skipping misaligned entity: (1563, 1608, Email Address)
Skipping misaligned entity: (1576, 1580, Location)




Skipping misaligned entity: (1200, 1209, Location)
Skipping misaligned entity: (2586, 2595, Location)
Skipping misaligned entity: (2671, 2680, Location)
Skipping misaligned entity: (2803, 2812, Location)
Skipping misaligned entity: (3299, 3308, Location)
Skipping misaligned entity: (3345, 3380, College Name)
Skipping overlapping entity: (3, 8, Location)
Skipping overlapping entity: (67, 72, Location)
Skipping misaligned entity: (372, 380, Designation)
Skipping misaligned entity: (382, 391, Companies worked at)
Skipping misaligned entity: (626, 630, Degree)
Skipping misaligned entity: (632, 650, College Name)
Skipping misaligned entity: (863, 868, Location)
Skipping misaligned entity: (970, 1002, Skills)
Skipping misaligned entity: (983, 992, Companies worked at)




Skipping misaligned entity: (2088, 2132, Email Address)
Skipping overlapping entity: (3461, 3465, Graduation Year)
Skipping overlapping entity: (3847, 3851, Graduation Year)
Skipping misaligned entity: (1334, 1377, Email Address)




Skipping misaligned entity: (1130, 1174, Email Address)
Skipping overlapping entity: (7777, 7784, Years of Experience)
Skipping overlapping entity: (7795, 7802, Years of Experience)
Skipping overlapping entity: (7810, 7817, Years of Experience)
Skipping overlapping entity: (7829, 7836, Years of Experience)
Skipping misaligned entity: (523, 562, Email Address)




Skipping misaligned entity: (998, 1038, Email Address)
Skipping duplicate entity: (15, 49, Designation)
Skipping misaligned entity: (1708, 1752, Email Address)




Skipping misaligned entity: (2198, 2239, Email Address)
Skipping misaligned entity: (3081, 3104, College Name)
Skipping misaligned entity: (3144, 3495, Skills)
Skipping misaligned entity: (937, 980, Email Address)




Skipping misaligned entity: (1704, 1746, Email Address)




Skipping misaligned entity: (2284, 2288, Graduation Year)
Skipping misaligned entity: (3644, 3682, Degree)
Skipping misaligned entity: (3684, 3725, College Name)
Skipping misaligned entity: (3750, 3754, Graduation Year)
Skipping misaligned entity: (3756, 3813, Degree)
Skipping misaligned entity: (3815, 3859, College Name)
Skipping misaligned entity: (3880, 3884, Graduation Year)
Skipping misaligned entity: (3913, 4370, Skills)
Skipping overlapping entity: (1803, 1809, Companies worked at)
Skipping misaligned entity: (3883, 3907, Degree)
Skipping misaligned entity: (3909, 3931, College Name)




Skipping misaligned entity: (1584, 1588, Graduation Year)




Skipping misaligned entity: (0, 17, Name)
Skipping misaligned entity: (363, 411, Email Address)




Skipping misaligned entity: (2784, 2829, Email Address)
Skipping misaligned entity: (7632, 7636, Graduation Year)
Skipping misaligned entity: (11328, 11332, Graduation Year)




Skipping misaligned entity: (1586, 1592, Companies worked at)
Skipping overlapping entity: (3385, 3391, Companies worked at)
Skipping overlapping entity: (3427, 3433, Companies worked at)
Skipping overlapping entity: (3532, 3538, Companies worked at)
Skipping overlapping entity: (3630, 3636, Companies worked at)
Skipping overlapping entity: (3758, 3764, Companies worked at)




Skipping misaligned entity: (1341, 1384, Email Address)
Skipping overlapping entity: (651, 656, Location)
Skipping overlapping entity: (658, 663, Location)
Skipping overlapping entity: (707, 712, Location)
Skipping overlapping entity: (714, 719, Location)




Skipping misaligned entity: (2210, 2251, Email Address)
Skipping overlapping entity: (52, 55, Designation)




Skipping misaligned entity: (4729, 4733, Graduation Year)




Skipping misaligned entity: (2643, 2676, Degree)
Skipping misaligned entity: (2678, 2719, College Name)
Skipping misaligned entity: (2721, 2725, UNKNOWN)
Skipping misaligned entity: (2912, 3288, Skills)




Skipping misaligned entity: (1369, 1402, Designation)
Skipping misaligned entity: (1404, 1415, Companies worked at)
Skipping misaligned entity: (4838, 4841, Degree)
Skipping misaligned entity: (4843, 4861, College Name)
Skipping misaligned entity: (4901, 4910, Location)




Skipping misaligned entity: (61, 105, Email Address)
Skipping example due to error: [E024] Could not find an optimal move to supervise the parser. Usually, this means that the model can't be updated in a way that's valid and satisfies the correct annotations specified in the GoldParse. For example, are all labels added to the model? If you're training a named entity recognizer, also make sure that none of your annotated entity spans have leading or trailing whitespace or punctuation. You can also use the `debug data` command to validate your JSON-formatted training data. For details, run:
python -m spacy debug data --help
Skipping example due to error: [E024] Could not find an optimal move to supervise the parser. Usually, this means that the model can't be updated in a way that's valid and satisfies the correct annotations specified in the GoldParse. For example, are all labels added to the model? If you're training a named entity recognizer, also make sure that none of your annotated