In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Import libraries

In [2]:
#Download packages
!python3 -m spacy download en
!python3 -m spacy download en_core_web_sm

#Import libraries
import json 
import spacy
import random
import en_core_web_sm
from spacy import displacy



Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 3.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 3.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


Load training data

In [None]:
TRAIN_DATA_EXAMPLE = [
    ('Who is Elon Musk?', {
        'entities': [(7, 15, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]


In [None]:
# Load the training data
example_path='/content/drive/MyDrive/training_data1.json'
with open(example_path) as fp:
     training_data = json.load(fp)


In [None]:
training_data

{'annotations': [['Villages on the east shore of Abbadan island from the bifurcation of the Karun and Bahmanshir rivers to the mouth of the latter on the Persian Gulf.',
   {'entities': [[30, 37, 'GPE_CUSTOM'],
     [73, 78, 'LOC_CUSTOM'],
     [83, 93, 'LOC_CUSTOM'],
     [135, 147, 'LOC_CUSTOM']]}],
  ['', {'entities': []}],
  ['150 mud houses; opposite Muhammareh Town. There are about 15 shops; Ballams and sailing craft are built, and water-pots, jars and Abas are manufactured. Kut-ash-Shaikh was formerly called Kut Farsis and belonged to the Kaab Shaikhs in the time of their supremacy.',
   {'entities': [[25, 41, 'GPE_CUSTOM'], [219, 231, 'TRIBE']]}],
  ['1 mile below Shakhat Haji Is-haq.',
   {'entities': [[13, 32, 'LOC_CUSTOM']]}],
  ['A term sometimes applied by mariners to the entire coastal region between the towns of Kuwait and Qatif in Eastern Arabia. Bedouins, however, seem to be unaware of any such general application of the name; and among them, as among the settled popul

Pretrained model setup


In [None]:
#Load pretrained model loaded from spacy
nlp_pretrained = en_core_web_sm.load()
print("Loaded model : %s" % nlp_pretrained)


Loaded model : <spacy.lang.en.English object at 0x7fb04ad4bf90>


In [None]:
#Retrieve the existing ner component
ner_pretrained=nlp.get_pipe('ner')
#Print exisiting pipeline components
print("Pipeline objects: %s" % nlp_pretrained.pipe_names)

Pipeline objects: ['tagger', 'parser', 'ner']


Blank model setup


In [None]:
# Prepare an empty model to train
nlp_blank = spacy.blank('en') 
# Assign name to the component
nlp_blank.vocab.vectors.name = 'demo' 
print("Loaded model : %s" % nlp_blank)

Loaded model : <spacy.lang.en.English object at 0x7fb04ad465d0>


In [None]:
#Create a new component 'ner' to the factory
ner_blank= nlp_blank.create_pipe('ner')
#Component ner is added last to the pipeline
nlp_blank.add_pipe(ner_blank, last = True)
#Print exisiting pipeline components
print("Pipeline objects: %s" % nlp_blank.pipe_names)

Pipeline objects: ['ner']


Add custom Named Entity labels: Blank model

In [None]:
# Existing entity labels for the blank model
blank_previous_ents = ner_blank.labels 
print('[Blank model: existing entities]')
if (len(ner_blank.labels))==0:
  print("No Named Entity labels")
for name in ner_blank.labels:
  print(name)


[Blank model: existing entities]
No Named Entity labels


In [None]:
# Add new Named Entity labels to the NER pipeline for the blank model
for label in training_data["classes"]:
  ner_blank.add_label(label)

In [None]:
# Updated entity labels for the pretrained model
blank_new_ents = ner_blank.labels 
print('[Blank model: new entities]')
for name in ner_blank.labels:
  print(name)

[Blank model: new entities]
GPE_CUSTOM
LOC_CUSTOM
TRIBE


Add custom Named Entity labels: Pretrained model

In [None]:
# Existing entity labels for the model
pretrained_previous_ents = ner_pretrained.labels
print('[Pretrained model: existing entities]')
for name in ner_pretrained.labels[::-1]: 
  print(name) # print starting from last element

[Pretrained model: existing entities]
WORK_OF_ART
TRIBE
TIME
QUANTITY
PRODUCT
PERSON
PERCENT
ORG
ORDINAL
NORP
MONEY
LOC_CUSTOM
LOC
LAW
LANGUAGE
GPE_CUSTOM
GPE
FAC
EVENT
DATE
CARDINAL


In [None]:
#Description of the Named Entity label
spacy.explain("GPE")

'Countries, cities, states'

In [None]:
# Add new Named Entity labels to the NER pipeline for the blank model
for label in training_data["classes"]:
  ner_pretrained.add_label(label)

In [None]:
# Updated entity labels for the pretrained model
pretrained_new_ents = ner_pretrained.labels
print('[Pretrained model: new entities]')
for name in ner_pretrained.labels[::-1]: 
  print(name) # print starting from last element

[Pretrained model: new entities]
WORK_OF_ART
TRIBE
TIME
QUANTITY
PRODUCT
PERSON
PERCENT
ORG
ORDINAL
NORP
MONEY
LOC_CUSTOM
LOC
LAW
LANGUAGE
GPE_CUSTOM
GPE
FAC
EVENT
DATE
CARDINAL


Create an optimizer



In [None]:
#Create a new optimizer for the blank model
optimizer_blank = nlp_blank.begin_training()
#Train the existing optimizer for the pretrained model
optimizer_pretrained = nlp_pretrained.resume_training()


Train the blank and pretrained models

In [None]:
#Disable other pipelines to train only the ner pipeline
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER component
#Iterate through several instances to have more training data
  for iter in range(50):
    losses={}
    random.shuffle(training_data["annotations"])
    #For each text and annotations
    for text, annotations in training_data["annotations"]:
      #Skip the segments of the text that were not annotated--skip empty entities
        if len(text) > 0:
          # Update model parameters with text and annotations for that text
          # Text, annotations are required and optimizer,losses are optional parameters
            nlp_blank.update([text], [annotations], sgd=optimizer_blank, losses=losses)
            nlp_pretrained.update([text], [annotations], sgd=optimizer_pretrained, losses=losses)
            print(losses)


{'ner': 0.03795467153476584, 'tagger': 0.0008274633437395096, 'parser': 0.0}
{'ner': 5.453621135321458, 'tagger': 0.0008346593067471986, 'parser': 0.0}
{'ner': 7.329643497717046, 'tagger': 0.0008395431068493053, 'parser': 0.0}
{'ner': 44.403811287069985, 'tagger': 0.014930748890037648, 'parser': 0.0}
{'ner': 46.92952604212827, 'tagger': 0.014930751356329042, 'parser': 0.0}
{'ner': 49.97160988535662, 'tagger': 0.01512404834622072, 'parser': 0.0}
{'ner': 62.250827437785716, 'tagger': 0.015519725747904944, 'parser': 0.0}
{'ner': 68.24050309163832, 'tagger': 0.10929934180577772, 'parser': 0.0}
{'ner': 241.5872773651969, 'tagger': 4.195588270791373, 'parser': 0.0}
{'ner': 245.7979194487946, 'tagger': 4.195591418638483, 'parser': 0.0}
{'ner': 250.19195198193052, 'tagger': 4.195591950858298, 'parser': 0.0}
{'ner': 253.7771119045019, 'tagger': 4.277363875950042, 'parser': 0.0}
{'ner': 257.3324151734339, 'tagger': 4.277364051425335, 'parser': 0.0}
{'ner': 259.5988415954563, 'tagger': 4.27751243

In [None]:
#which labels have more/smaller loss

Load the test data

In [None]:
#read the combined txt file as input file (same as training)
test_txt = open("/content/drive/MyDrive/abbas_bandar.txt")
text=test_txt.read()
text=text.replace("\n", " ")

In [None]:
text

'ABBAS (BANDAR) (3076 words) بندر عبّاس   An important town on the Persian Coast at the entrance of the Persian Gulf; it constitutes in itself an administrative district and is surrounded, except on the south side which is to the sea, by the district of Shamil. Bandar Abbas is situated about 280 miles north-northwest of Masqat and 96 miles east-north-east of Lingeh Town.  Site and buildings.— Bandar Abbas stands at the foot of a bay upon a low, sandy, shelving beach; the buildings approach within 100 yards of the waters edge and at high spring tides the sea washes their walls. The town looks out, between the islands of Hormuz and Larak , upon the junction off the Ruus-al-Jibal promontory of the Gulfs of Persia and Oman: its background, as viewed from the sea, is formed by the massive pile of Kuh-i-Ginau, which at a distance of only 18 miles north by west of the town attains an altitude of 7783 feet. The immediate surroundings of Bandar Abbas are utterly bare; even garden cultivation an

In [None]:
len(text)


18049

Test the blank model

In [None]:
# Named Entity Recognition of the test data for the blank model
doc = nlp_blank(text)
# Display entities
doc.ents

(Persian Coast,
 Persian Gulf,
 Bandar Abbas,
 Masqat,
 Lingeh Town,
 Persia,
 Bandar Abbas,
 Kulah-i-Farangi2كلاه,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar,
 Bandar Abbas,
 Bandar Abbas,
 Sistan,
 Khurasan,
 Bandar Abbas,
 Bombay,
 Karachi,
 Bandar Abbas,
 Bandar Abbas,
 Persia,
 Bandar Abbas,
 Persia,
 Tabriz,
 Qiyas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Persian,
 Persian Deputy-Governor,
 Persian Government,
 Bandar Abbas,
 Bandar Abbas,
 Hindu,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Persia,
 Persian Gulf)

In [None]:
#Display Named Entities with their names and labels for the blank model
for ent in doc.ents:
    print(ent.text,ent.label_)
    print()

Persian Coast LOC_CUSTOM

Persian Gulf LOC_CUSTOM

Bandar Abbas GPE_CUSTOM

Masqat GPE_CUSTOM

Lingeh Town GPE_CUSTOM

Persia GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Kulah-i-Farangi2كلاه LOC_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas TRIBE

Bandar Abbas TRIBE

Bandar TRIBE

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Sistan GPE_CUSTOM

Khurasan LOC_CUSTOM

Bandar Abbas GPE_CUSTOM

Bombay GPE_CUSTOM

Karachi GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Persia GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Persia TRIBE

Tabriz TRIBE

Qiyas TRIBE

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas TRIBE

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas TRIBE

Bandar Abbas GPE_CUSTOM

Bandar Abbas TRIBE

Persian TRIBE

Persian Deputy-Governor LOC_CUSTOM

Persian Government LOC_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas TRIBE

Hindu GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas TRIBE

Bandar Abba

In [None]:
# Named Entity Recognition of the test data for the blank model
doc = nlp_pretrained(text)
# Display entities
doc.ents

(Persian Coast,
 Persian Gulf,
 Bandar Abbas,
 Masqat,
 Lingeh Town,
 Bandar Abbas,
 Hormuz,
 Ruus-al-Jibal,
 Persia,
 Oman,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Naiband,
 Bandar Abbas,
 Bandar Abbas,
 Shafi,
 Avazis,
 Bandar Abbas,
 Bandar Abbas,
 Southern Persia,
 Shiraz,
 Bandar Abbas,
 Sistan,
 Khurasan,
 Bandar Abbas,
 China,
 Britain,
 France,
 Bombay,
 Bandar Abbas,
 Bandar Abbas,
 Persia,
 Belgium,
 Yakshahi,
 Bandar Abbas,
 Persia,
 Shipping,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Bandar Abbas,
 Gulf Ports,
 Kalantar,
 Ziyarat,
 Kalantar,
 Maidani,
 Bandar Abbas,
 Customs,
 Gulf Ports,
 Persian Coast,
 Bandar Abbas,
 Bandar Abbas,
 Haji Husain,
 Bandar Abbas,
 Customs,
 Customs,
 Customs.,
 Bandar Abbas,
 Bandar Abbas,
 British Consulate,
 Naiband,
 Bandar Abbas,
 Russia,
 Consulate,
 Bandar Abbas,
 Bandar Abbas,
 Eastern Caliphate,
 Bandar Abbas,
 Persian Gulf,
 Upper India)

In [None]:
#Display Named Entities with their names and labels for the pretrained model
for ent in doc_pretrained.ents:
    print(ent.text,ent.label_)
    print()

Persian Coast LOC_CUSTOM

Persian Gulf LOC_CUSTOM

Bandar Abbas GPE_CUSTOM

Masqat GPE_CUSTOM

Lingeh Town GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Hormuz GPE_CUSTOM

Ruus-al-Jibal GPE_CUSTOM

Persia GPE_CUSTOM

Oman GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Naiband GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Shafi TRIBE

Avazis LOC_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Southern Persia GPE_CUSTOM

Shiraz TRIBE

Bandar Abbas GPE_CUSTOM

Sistan GPE_CUSTOM

Khurasan GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

China GPE_CUSTOM

Britain GPE_CUSTOM

France GPE_CUSTOM

Bombay GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Persia GPE_CUSTOM

Belgium GPE_CUSTOM

Yakshahi TRIBE

Bandar Abbas GPE_CUSTOM

Persia GPE_CUSTOM

Shipping GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Bandar Abbas GPE_CUSTOM

Gulf Ports GPE_CUSTO

End of the pipeline

Default ner pipeline for en_core_web_sm.load

Future: 'catastrophic forgetting problem'

https://spacy.io/api/language#resume_training

https://github.com/explosion/spaCy/issues/2124


In [3]:
# Load English tokenizer, tagger, parser and NER
nlp = en_core_web_sm.load()
#Load the en_core_web library and create the nlp pipeline
ner=nlp.get_pipe('ner')
#Read the test text
test_txt = open("/content/drive/MyDrive/abbas_bandar.txt")
text=test_txt.read()
text=text.replace("\n", " ")
#Process the text and recognize Named Entities
doc = nlp(text)


[Default Entities] =  ('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')


In [4]:
#Visualize the NER
displacy.render(doc, style="ent", jupyter=True)

In [None]:
#Print all entities
for ent in doc.ents:
    print(ent.text,ent.label_)


3076 CARDINAL
بندر PERSON
the Persian Coast LOC
the Persian Gulf LOC
Shamil GPE
Bandar Abbas PERSON
about 280 miles QUANTITY
Masqat GPE
96 miles QUANTITY
Lingeh Town GPE
Bandar Abbas PERSON
100 yards QUANTITY
Hormuz GPE
Larak PERSON
the Gulfs of Persia ORG
Oman GPE
Kuh PERSON
only 18 miles QUANTITY
7783 feet QUANTITY
Bandar Abbas PERSON
up to 18 feet QUANTITY
2½ miles QUANTITY
the south-east LOC
as much as 4 miles QUANTITY
about ¼ CARDINAL
as much as 100 yards QUANTITY
100 yards QUANTITY
20 CARDINAL
6 feet QUANTITY
Gach PERSON
Portuguese NORP
Dutch NORP
about half a mile QUANTITY
English LANGUAGE
four CARDINAL
4 CARDINAL
Bandar Abbas PERSON
the quarters DATE
Dutch NORP
the Kulah-i-Farangi2كلاه PRODUCT
the Imperial Customs Department ORG
Bandar Abbas ORG
summer DATE
the depths of winter DATE
the morning of the 29th of January 19054 TIME
The months of January and February DATE
March DATE
April DATE
May DATE
June, July DATE
August DATE
September DATE
October, November and December DATE
th

In [None]:
#Print only GPE, LOC, ORG entities
for ent in doc.ents:
    if (ent.label_=="GPE" or ent.label_=="ORG" or ent.label_=="LOC" ):
      print(ent.text,ent.label_)

the Persian Coast LOC
the Persian Gulf LOC
Shamil GPE
Masqat GPE
Lingeh Town GPE
Hormuz GPE
the Gulfs of Persia ORG
Oman GPE
the south-east LOC
the Imperial Customs Department ORG
Bandar Abbas ORG
Baluchi GPE
Avaz ORG
Bastak ORG
Shafi GPE
Laris GPE
Shamil GPE
Southern Persia LOC
India GPE
the United Kingdom GPE
medicines(1⅓ GPE
China GPE
India GPE
India GPE
the United Kingdom GPE
tea(13¾ ORG
Britain GPE
France GPE
India GPE
the Imperial Persian Customs Department ORG
Bombay GPE
Karachi GPE
Bandar Abbas ORG
Persia GPE
Belgium GPE
Dupul GPE
Persia GPE
عباسي GPE
Chaharak GPE
شاه ORG
Shipping GPE
Baghlahs ORG
Ghunchahs ORG
Mashuwahs ORG
Horis GPE
TOTAL ORG
Shamil GPE
the Shamil District LOC
Hanjam island LOC
the Gulf Ports LOC
the Kalantar of the Shamil District ORG
Ziyarat GPE
Kalantar ORG
the Imperial Persian Customs ORG
½ a Qran LOC
Rahdari ORG
Customs ORG
Powers ORG
the Gulf Ports LOC
the Persian Coast LOC
The Persian Government ORG
the Imperial Customs Department ORG
the Persian Forei

In [None]:
for ent in doc.ents:
    print(ent.text,ent.label_)
