## Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Extract Dataset to List

In [49]:
def convert_ner_format(df):
    sentences = []
    labels = []

    current_sentence = []
    current_labels = []

    for _, row in df.iterrows():
        sentence_marker = row["Sentence #"]

        if isinstance(sentence_marker, str) and sentence_marker.startswith("Sentence:"):
            # A new sentence starts, save the previous one if not empty
            if current_sentence:
                sentences.append(current_sentence)
                labels.append(current_labels)

            # Reset for the new sentence
            current_sentence = []
            current_labels = []

        # Add words and labels to the current sentence
        current_sentence.append(row["Word"])
        current_labels.append(row["Tag"])

    # Append the last sentence if not empty
    if current_sentence:
        sentences.append(current_sentence)
        labels.append(current_labels)

    return sentences, labels

sentences, labels = convert_ner_format(df)

## Check Sample

In [65]:
print(' '.join(sentences[5000]))
print(labels[5000])

Separately , officials say a policeman was killed in Mosul when he tried to move a decapitated body that was rigged with explosives .
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


## Convert List to Spacy's Dataset Format

In [53]:
import spacy
from spacy.tokens import DocBin

db = DocBin()
nlp = spacy.load('en_core_web_sm')

for i in range(len(sentences[:1000])):
  text = ' '.join([str(word) for word in sentences[i]])
  print(text)
  print('-----------')
  doc = nlp.make_doc(text)
  ents = []
  current_char_index = 0
  for y in range(len(labels[i])):
    tag = labels[i][y]
    word = sentences[i][y]

    if tag != 'O':
      start_char = text.index(word, current_char_index)
      end_char = start_char + len(word)
      span = doc.char_span(start_char, end_char, label=tag, alignment_mode='contract')
      if span is not None: #checking to make sure span is not None to avoid errors
        # Check for overlaps with existing entities
        valid_span = True
        for existing_span in ents:
          if existing_span.end_char > span.start_char and existing_span.start_char < span.end_char:
            valid_span = False # Overlap found, mark as invalid
            print(f"Skipping overlapping entity: {span} overlaps with {existing_span}") #Print statement for debugging to tell which spans overlap
            break  # Exit the inner loop as soon as an overlap is found
        if valid_span: #append span if there are no overlapping entities or if the span is valid
          ents.append(span)
          current_char_index = end_char
      else:
            print("Skipping entity")

  doc.ents = ents
  db.add(doc)

db.to_disk('./train.spacy') #Changed to_dis to to_disk

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
-----------
Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "
-----------
They marched from the Houses of Parliament to a rally in Hyde Park .
-----------
Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .
-----------
The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .
-----------
The party is divided over Britain 's participation in the Iraq conflict and the continued deployment of 8,500 British troops in that country .
-----------
The London march came ahead of anti-war protests today in other cities , including Rome , Paris , and Madrid .
-----------
The International Atomic Energy Agency is to hold secon

In [54]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     54.09    0.76    0.85    0.68    0.01
  0     200         83.74   3346.85   65.50   65.63   65.37    0.65
  1     400        257.52   1847.20   83.54   83.58   83.50    0.84
  3     600        249.13   1270.42   91.71   92.34   91.09    0.92
  4     800        446.21   1114.89   96.20   96.08   96.32    0.96
  7    1000        580.90    894.27   97.45   97.43   97.46    0.97
  9    1200        405.22    695.01   98.31   98.30   98.33    0.98
 12    1400        840.66    596.30   99.24   99.29   99.20    0.99
 16    1600        349.58    431.39   99.60   99.63   99.57    1.00
 21    1800        486.58    402.88   99.77   

Mapping Label

In [61]:
nlp1 = spacy.load(r"/content/output/model-best") #load the best model
doc = nlp1("Separately , officials say a policeman was killed in Mosul when he tried to move a decapitated body that was rigged with explosives .") # input sample text

spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter



In [62]:
import joblib

joblib.dump(nlp1, "ner_model.joblib")

['ner_model.joblib']

In [64]:
# Load the model back
nlp = joblib.load("ner_model.joblib")

# Test the loaded model
doc = nlp("Separately , officials say a policeman was killed in Mosul when he tried to move a decapitated body that was rigged with explosives .")
for ent in doc.ents:
    print(ent.text, ent.label_)

Mosul GPE
