# *Required Libraries*

In [1]:
import spacy
import json
import random
import pickle
from spacy.util import filter_spans
from spacy.tokens import DocBin
from tqdm import tqdm

# *Loading Data*

In [2]:
train_data=pickle.load(open("data\\train\\train_data.pkl","rb"))

# *Building the Model*

In [3]:
# Initialize a blank English NLP model
nlp = spacy.blank('en')
doc_bin = DocBin()

# Iterate through the training data
for text, annotations in tqdm(train_data):
    # Create a Doc object
    doc = nlp.make_doc(text)
    
    # Create spans for the entities
    ents = []
    for start, end, label in annotations['entities']:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity: ({start}, {end}, {label})")
        else:
            ents.append(span)
    
    # Filter overlapping spans
    filtered_ents = filter_spans(ents)
    
    # Set the filtered entities in the Doc
    doc.ents = filtered_ents
    
    # Add the Doc to the DocBin
    doc_bin.add(doc)

# Save the DocBin to disk
doc_bin.to_disk('train.spacy')


  6%|▌         | 11/200 [00:00<00:01, 97.60it/s]

Skipping entity: (1435, 1480, Email Address)
Skipping entity: (2022, 2061, Email Address)
Skipping entity: (1210, 1247, Email Address)
Skipping entity: (1394, 1437, Email Address)


 20%|██        | 40/200 [00:00<00:01, 86.23it/s]

Skipping entity: (1358, 1400, Email Address)
Skipping entity: (1089, 1093, Graduation Year)
Skipping entity: (1027, 1031, Graduation Year)
Skipping entity: (389, 393, Graduation Year)
Skipping entity: (1704, 1746, Email Address)
Skipping entity: (1656, 1698, Email Address)
Skipping entity: (1754, 1792, Email Address)
Skipping entity: (1085, 1125, Email Address)
Skipping entity: (210, 213, Skills)
Skipping entity: (203, 207, Skills)
Skipping entity: (50, 56, Companies worked at)
Skipping entity: (1586, 1592, Companies worked at)
Skipping entity: (2176, 2215, Email Address)
Skipping entity: (3996, 3999, Skills)
Skipping entity: (2109, 2151, Email Address)
Skipping entity: (1341, 1384, Email Address)
Skipping entity: (1192, 1234, Email Address)
Skipping entity: (1667, 1705, Email Address)


 36%|███▌      | 72/200 [00:00<00:01, 94.61it/s]

Skipping entity: (265, 307, Email Address)
Skipping entity: (2211, 2254, Email Address)
Skipping entity: (1801, 1842, Email Address)
Skipping entity: (4067, 4069, Skills)
Skipping entity: (3812, 3814, Skills)
Skipping entity: (1077, 1120, Email Address)
Skipping entity: (4167, 4176, Companies worked at)
Skipping entity: (1865, 1868, Skills)
Skipping entity: (3111, 3152, Email Address)
Skipping entity: (1454, 1499, Email Address)
Skipping entity: (1945, 1989, Email Address)


 48%|████▊     | 96/200 [00:01<00:01, 102.10it/s]

Skipping entity: (3847, 3851, Graduation Year)
Skipping entity: (2272, 2316, Email Address)
Skipping entity: (1708, 1752, Email Address)
Skipping entity: (61, 106, Email Address)
Skipping entity: (2088, 2132, Email Address)
Skipping entity: (1522, 1566, Email Address)
Skipping entity: (872, 911, Email Address)
Skipping entity: (1130, 1174, Email Address)
Skipping entity: (4901, 4910, Location)
Skipping entity: (1811, 1848, Email Address)
Skipping entity: (14240, 14249, Companies worked at)
Skipping entity: (11438, 11447, Companies worked at)
Skipping entity: (2999, 3043, Email Address)


 70%|███████   | 140/200 [00:01<00:00, 154.64it/s]

Skipping entity: (1576, 1580, Location)
Skipping entity: (1563, 1608, Email Address)
Skipping entity: (0, 4, Location)
Skipping entity: (368, 409, Email Address)
Skipping entity: (2474, 2514, Email Address)
Skipping entity: (2284, 2288, Graduation Year)
Skipping entity: (1573, 1578, Graduation Year)
Skipping entity: (4729, 4733, Graduation Year)
Skipping entity: (2684, 2688, Location)
Skipping entity: (863, 868, Location)
Skipping entity: (626, 631, Degree)
Skipping entity: (67, 72, Location)
Skipping entity: (283, 327, Email Address)
Skipping entity: (2396, 2399, Degree)
Skipping entity: (1338, 1345, Location)
Skipping entity: (8133, 8136, Degree)
Skipping entity: (8133, 8136, Degree)
Skipping entity: (2392, 2434, Email Address)
Skipping entity: (5457, 5461, Graduation Year)
Skipping entity: (4215, 4219, Graduation Year)
Skipping entity: (971, 1015, Email Address)
Skipping entity: (872, 875, Degree)
Skipping entity: (1234, 1277, Email Address)
Skipping entity: (3108, 3150, Email Addre

 93%|█████████▎| 186/200 [00:01<00:00, 131.99it/s]

Skipping entity: (363, 411, Email Address)
Skipping entity: (1305, 1347, Email Address)
Skipping entity: (1584, 1588, Graduation Year)
Skipping entity: (2080, 2086, Degree)
Skipping entity: (1094, 1134, Email Address)
Skipping entity: (3660, 3664, Graduation Year)
Skipping entity: (3539, 3545, Degree)
Skipping entity: (2105, 2148, Email Address)
Skipping entity: (2198, 2239, Email Address)
Skipping entity: (2178, 2223, Email Address)
Skipping entity: (1334, 1377, Email Address)
Skipping entity: (998, 1038, Email Address)
Skipping entity: (277, 328, Email Address)
Skipping entity: (1308, 1349, Email Address)


100%|██████████| 200/200 [00:01<00:00, 116.25it/s]


Skipping entity: (1369, 1413, Email Address)
Skipping entity: (937, 980, Email Address)
Skipping entity: (523, 562, Email Address)
Skipping entity: (2163, 2205, Email Address)


In [4]:
! python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [5]:
! python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
[38;5;3m⚠ Aborting and saving the final best model. Encountered exception:
ValueError("[E024] Could not find an optimal move to supervise the parser.
Usually, this means that the model can't be updated in a way that's valid and
satisfies the correct annotations specified in the GoldParse. For example, are
all labels added to the model? If you're training a named entity recognizer,
also make sure that none of your annotated entity spans have leading or trailing
whitespace or punctuation. You can also use the `debug data` command to validate
your JSON-formatted training data. For details, run:\npython -m spacy debug data
--help")[0m


Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Mitesh Gupta\AppData\Local\Programs\Python\Python311\Lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "c:\Users\Mitesh Gupta\AppData\Local\Programs\Python\Python311\Lib\site-packages\spacy\cli\_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "c:\Users\Mitesh Gupta\AppData\Local\Programs\Python\Python311\Lib\site-packages\click\core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Mitesh Gupta\AppData\Local\Programs\Python\Python311\Lib\site-packages\typer\core.py", line 723, in main
    return _main(
           ^^^^^^
  File "c:\Users\Mitesh Gupta\AppData\Local\Programs\Python\Python311\Lib\site-packages\typer\core.py", line 193, in _main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "c:\Users\M