# Spacy NER Model

In [1]:
import spacy
import json
from spacy.scorer import Scorer
from spacy.training.example import Example
from pathlib import Path
import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load data
with open('/content/Relabeled_Training_Data.json') as fp:
  training_data = json.load(fp)

In [3]:
# Data
print("Classes: ", training_data['classes'])
print("Train Data Samples: ", len(training_data['annotations']))
print("Train Data (2 samples): ", training_data['annotations'][:-2])

Classes:  ['NAME', 'TIME', 'DATE', 'AREA', 'CITY', 'DISTRICT', 'STATE', 'COUNTRY', 'ORGANISATION', 'INCIDENT']
Train Data Samples:  460
Train Data (2 samples):  [['Wednesday morning 01-08-07 at 11 am local time, Mujahideen of Islamic Emirate of Afghanistan attacked American Christian kafirs terrorists occupation military base in Trara area Managi district of Kunar province. In the attack a firefight started which lasted for one hour, in which Mujahideen killed four American Christian kafirs terrorists occupation military terrorists and wounded a number of them, after the attack Mujahideen return to their positions without any difficulties. Later out of frustration the American Christian kafirs terrorists occupation military planes bombarded the area martyring and wounding civilians, destroyed houses and fields. Reported by Zabihuallah Mujahid\r', {'entities': [[0, 17, 'TIME'], [18, 26, 'DATE'], [30, 35, 'TIME'], [48, 58, 'NAME'], [167, 177, 'AREA'], [178, 193, 'DISTRICT'], [197, 212, '

In [4]:
# Shuffle the data
import random
random.shuffle(training_data['annotations'])

# Train - Test Split (90 - 10)
m = len(training_data['annotations'])
train_data = training_data['annotations'][:int(m * 0.9)]
test_data = training_data['annotations'][int(m * 0.9): ]

print("Length of Train data", len(train_data))
print("Length of Test data", len(test_data))

Length of Train data 414
Length of Test data 46


In [5]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [23]:
# Create a model
#nlp = spacy.blank('en')
#nlp.vocab.vectors.name = 'demo'
#nlp.add_pipe('ner')

# Pre-trained model
#nlp = spacy.load("en_core_web_md")

# From drive
!unzip '/content/drive/MyDrive/Trained_Model.zip' -d '/content/Trained_Model'

Archive:  /content/drive/MyDrive/Trained_Model.zip
   creating: /content/Trained_Model/content/Trained Model/
   creating: /content/Trained_Model/content/Trained Model/attribute_ruler/
  inflating: /content/Trained_Model/content/Trained Model/attribute_ruler/patterns  
   creating: /content/Trained_Model/content/Trained Model/lemmatizer/
   creating: /content/Trained_Model/content/Trained Model/lemmatizer/lookups/
  inflating: /content/Trained_Model/content/Trained Model/lemmatizer/lookups/lookups.bin  
   creating: /content/Trained_Model/content/Trained Model/tagger/
  inflating: /content/Trained_Model/content/Trained Model/tagger/model  
  inflating: /content/Trained_Model/content/Trained Model/tagger/cfg  
  inflating: /content/Trained_Model/content/Trained Model/config.cfg  
   creating: /content/Trained_Model/content/Trained Model/senter/
  inflating: /content/Trained_Model/content/Trained Model/senter/model  
 extracting: /content/Trained_Model/content/Trained Model/senter/cfg  


In [24]:
nlp = spacy.load('/content/Trained_Model/content/Trained Model')



In [9]:
# Add entity tags
labels = {"NAME":[], "DATE":[], "TIME":[], "AREA":[], "CITY": [],
           "DISTRICT":[], "STATE":[], "COUNTRY":[], "ORGANISATION":[], "INCIDENT":[]}
for label in labels.keys():
  nlp.get_pipe('ner').add_label(label)

In [10]:
# Pre-trained Optimizer
optimizer = nlp.create_optimizer()

# Blank Model
# optimizer = nlp.begin_training()

In [12]:
# Train
EPOCHS = 30
for i in range(EPOCHS):
  losses = {}
  random.shuffle(train_data)

  # Training batches
  for batch in spacy.util.minibatch(train_data, size=8):
    for text, annotations in batch:
      # create Example
      doc = nlp.make_doc(text)
      example = Example.from_dict(doc, annotations)
      # Update the model
      nlp.update([example], sgd=optimizer, losses=losses, drop=0.5)

  if i % 10 == 0:
    print("Epoch {0} -> {1}".format(i, losses['ner']))



Epoch 0 -> 6091.613408949928
Epoch 10 -> 3047.69962381572
Epoch 20 -> 2551.033155390408


In [13]:
# Metrics
def evaluate(ner_model, examples):
    scorer = Scorer()
    example = []
    preds = []
    for input_, annotations in examples:
        pred = ner_model(input_)
        preds.append([(ent.text, ent.label_) for ent in pred.ents])
        temp = Example.from_dict(pred, annotations)
        example.append(temp)

    scores = scorer.score(example)
    return scores, preds

In [14]:
# Evaluation
result, preds = evaluate(nlp, test_data)



In [15]:
def annotate(test):
  out = []
  for x in test:
    text = x[0]
    ents = x[1]
    res = []
    for e in ents['entities']:
      res.append((text[e[0] : e[1]], e[2]))
    out.append(res)
  return out

true_labels = annotate(test_data)

In [16]:
def performance(true, pred):
  tp, fn = 0, 0  # tp -> True Positive; fn -> False Negative
  fp, tn = 0, 0  # fp -> False Positive; tn -> True Negative

  assert len(true) == len(pred)

  m = len(true)

  for i in range(m):
    true_sample = true[i]
    pred_sample = pred[i]

    # True Positive
    for ent in true_sample:
      if ent in pred_sample:
        tp += 1
      else:
        fn += 1

    # True Negative
    for ent in pred_sample:
      if not ent in true_sample:
        fp += 1

  # Precision, Recall
  precision = tp / (tp + fp)
  recall = tp / (tp + fn)
  f1_score = (2 * precision * recall) / (precision + recall)

  print("Preformance:")
  print("Precision: ", precision)
  print("Recall: ", recall)
  print("F1 Score: ", f1_score)

  return f1_score, precision, recall

performance(true_labels, preds)

Preformance:
Precision:  0.7432712215320911
Recall:  0.6773584905660377
F1 Score:  0.7087857847976308


(0.7087857847976308, 0.7432712215320911, 0.6773584905660377)

In [None]:
# Save the model
output_dir = Path('/content/Trained Model')
if not output_dir.exists():
  output_dir.mkdir()
nlp.to_disk(output_dir)
print("Saved")

# Upload the folder into drive
!zip -r '/content/Trained_Model.zip' '/content/Trained Model'
!mv '/content/Trained_Model.zip' '/content/drive/MyDrive/'
print("Uploaded")

Saved
  adding: content/Trained Model/ (stored 0%)
  adding: content/Trained Model/attribute_ruler/ (stored 0%)
  adding: content/Trained Model/attribute_ruler/patterns (deflated 84%)
  adding: content/Trained Model/lemmatizer/ (stored 0%)
  adding: content/Trained Model/lemmatizer/lookups/ (stored 0%)
  adding: content/Trained Model/lemmatizer/lookups/lookups.bin (deflated 56%)
  adding: content/Trained Model/tagger/ (stored 0%)
  adding: content/Trained Model/tagger/model (deflated 8%)
  adding: content/Trained Model/tagger/cfg (deflated 65%)
  adding: content/Trained Model/config.cfg (deflated 73%)
  adding: content/Trained Model/senter/ (stored 0%)
  adding: content/Trained Model/senter/model (deflated 9%)
  adding: content/Trained Model/senter/cfg (stored 0%)
  adding: content/Trained Model/tok2vec/ (stored 0%)
  adding: content/Trained Model/tok2vec/model (deflated 7%)
  adding: content/Trained Model/tok2vec/cfg (stored 0%)
  adding: content/Trained Model/vocab/ (stored 0%)
  add

In [17]:
# Test on known
doc = nlp(test_data[-1][0])
print(test_data[-1][0])
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Saturday 25-08-07 at 4 pm local time, a Mujahid of Islamic Emirate of Afghanistan Mulwi Muhammad, carried out a martyrdom attack against a convoy of NATO Christian kafirs terrorists occupation militaries terrorists in on oli Charkhi highway in east of Kabul city. The martyrdom attack destroyed two vehicles, after ISAF Christian kafirs terrorists occupation militaries terrorists blocked the highway and transferred the terrorists died bodies by helicopter from area. Reported by Zabihuallah Mujahid
Entities [('25-08-07', 'DATE'), ('4 pm', 'TIME'), ('Mujahid', 'NAME'), ('Islamic Emirate of Afghanistan', 'ORGANISATION'), ('Mulwi Muhammad', 'NAME'), ('NATO Christian kafirs', 'ORGANISATION'), ('oli Charkhi highway', 'AREA'), ('Kabul city.', 'CITY'), ('martyrdom attack destroyed two vehicles', 'INCIDENT'), ('ISAF Christian kafirs', 'ORGANISATION'), ('Zabihuallah Mujahid', 'NAME')]


In [18]:
# Create a output table
def create_table(model, data, name, save=True, hasLabels=True):
  """
  Gives out a csv file with Name, Date, Time, Area, City, District, State,
  Country as Column names
  """
  # Dictionary to store data
  table = {"NAME":[], "DATE":[], "TIME":[], "AREA":[], "CITY": [],
           "DISTRICT":[], "STATE":[], "COUNTRY":[], "ORGANISATION":[], "INCIDENT":[]}

  if hasLabels:
    for d in data:
      # Single sample
      row = {"NAME":[], "DATE":[], "TIME":[], "AREA":[], "CITY": [],
           "DISTRICT":[], "STATE":[], "COUNTRY":[], "ORGANISATION":[], "INCIDENT":[]}

      text = d
      pred = model(text)
      for ent in pred.ents:
        row[ent.label_].append(ent.text)

      # Append sample to table
      for k in table:
        table[k].append(row[k])
  else:
    for d in data:
      # Single sample
      row = {"NAME":[], "DATE":[], "TIME":[], "AREA":[], "CITY": [],
           "DISTRICT":[], "STATE":[], "COUNTRY":[], "ORGANISATION":[], "INCIDENT":[]}

      pred = model(d)
      for ent in pred.ents:
        row[ent.label_].append(ent.text)

      # Append sample to table
      for k in table:
        table[k].append(row[k])

  df = pd.DataFrame(table)

  # Save csv
  if save:
    df.to_csv(name)
  return df

#df = create_table(nlp, test_data, "Test_data_output.csv")
#df.sample(5)

In [19]:
# For the entire data
df = create_table(nlp, training_data["annotations"], "Outputs_Entire_Data.csv")
df.sample(5)

ValueError: ignored

# Geocoding

In [20]:
# cleaning csv file
import pandas as pd

#df = pd.read_csv('/content/Outputs_Entire_Data.csv')

def destrip_table(df):
  #Removing "[]" from table
  df = df.apply(lambda x: x.astype(str).str.strip("[]"))
  return df

def deduplicate(cell):
  #Removing dulplicate values in each cell
  cell_list = cell.split(',')
  deduplicated_list = list(set(cell_list))
  return ','.join(deduplicated_list)

def clean_data(df):
  df = destrip_table(df)
  df = df.applymap(deduplicate)
  return df

##df.to_csv('deduplicated_data.csv', index=False)

In [21]:
import geopy
from geopy.geocoders import Nominatim
import requests
import csv

In [22]:
# Geocoding of CSV by concatenating different entities
def geocode(in_file):
  with open(in_file, 'r') as f_in:
    csv_reader = csv.reader(f_in)

    # we are using OpenStreetmaps Geocoder
    geolocator = Nominatim(user_agent="Garudaltics")

    # Removing heading
    header = next(csv_reader)
    with open('geocoded_data.csv', 'w', newline='') as f_out:
      csv_writer = csv.writer(f_out)

      # Adding lat, long headings
      header.append("latitude")
      header.append("longitude")
      csv_writer.writerow(header)

      # Concatenating Area, city, district, State, and Country
      for row in csv_reader:
        loc = row[4]
        loc = loc  + row[5]
        loc = loc  + row[6]
        loc = loc  + row[7]
        loc = loc + row[8]

        print(loc)

        # Geocoder on loc
        location = geolocator.geocode(loc, timeout=10000)
        # Removing low level entities one by one to get geocoding
        if location == None:
          loc1 = row[5]
          loc1 = loc1 + row[6]
          loc1 = loc1 + row[7]
          location = geolocator.geocode(loc1, timeout=10000)

          if location == None:
            loc2 = row[6]
            loc2 = loc2 + row[7]
            location = geolocator.geocode(loc2, timeout=10000)

            if location == None:
              loc3 = row[7]
              location = geolocator.geocode(loc3, timeout=10000)

              if location == None:
                print("Location not found")

              else:
                print((location.latitude, location.longitude))

                # Adding lat, long values to same column
                row.append(location.latitude)
                row.append(location.longitude)
                csv_writer.writerow(row)
            else:
              print((location.latitude, location.longitude))

              row.append(location.latitude)
              row.append(location.longitude)
              csv_writer.writerow(row)
          else:
            print((location.latitude, location.longitude))
            row.append(location.latitude)
            row.append(location.longitude)
            csv_writer.writerow(row)
        else:
          print((location.latitude, location.longitude))
          row.append(location.latitude)
          row.append(location.longitude)
          csv_writer.writerow(row)
  print("Done.")

# Map visualization

In [None]:
!pip install --upgrade plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting plotly
  Downloading plotly-5.13.0-py2.py3-none-any.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.5.0
    Uninstalling plotly-5.5.0:
      Successfully uninstalled plotly-5.5.0
Successfully installed plotly-5.13.0


In [None]:
# load csv into dataframe
import pandas as pd
import plotly.express as px

# df_map = pd.read_csv('/content/geocoded_test_data_1.csv')

In [None]:
def plot_on_map(df_map):
  fig = px.scatter_mapbox(df_map, lat="latitude", lon="longitude",
                        color_continuous_scale=px.colors.cyclical.IceFire,
                        size_max=15, zoom=5, mapbox_style="open-street-map",
                        hover_name="NAME", hover_data=["DATE", "TIME"])
  fig.update_traces(marker=dict(size=10, color="black"))
  fig.show()

#Export to shapefile

In [None]:
!pip install geopandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geopandas
  Downloading geopandas-0.12.2-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
Collecting fiona>=1.8
  Downloading Fiona-1.8.22-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.6/16.6 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyproj>=2.6.1.post1
  Downloading pyproj-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-

In [None]:
import geopandas as gpd

def export(infile):
  df = pd.read_csv("geocoded_data.csv")

  # convert the DataFrame to a GeoDataFrame
  gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
  gdf.crs = "EPSG:4326"
  gdf.to_file("point_data.shp", driver="ESRI Shapefile")

#gdf.to_file("point_data.geojson", driver="GeoJSON")

DriverError: ignored

## Inference - Sample Procedure

In [None]:
# Load the data
with open("/content/report_96_5_05-11-07.txt", "r") as infile:
  data = infile.read()
  data = data.split("\n")
print(data)

['Sunday night 4-11-07 at 9 pm local time, Mujahideen of Islamic Emirate of Afghanistan ambushed an enemy convoy in Pashi Band area of Sewri district of Zabul province. In the attack a firefight started which lasted for half an hour, in which three tanks were destroyed and more than 12 Christian terrorists were killed and a number of them were wounded. Later occupation military blocked the road between district Qallat capital city of province and bombarded the area but their were no causalities on Mujahideen side. Reported by Qari Muhammad Yousuf', 'Sunday morning 4-11-07 at 10 am local time, NATO terrorists attacked a Mujahideen post in Khol abad and Chini area of Kajaki district of Helmand province. In the attack firefight started in which Mujahideen killed 10 terrorists and enemy attack was defeated.', 'Mujahideen of Islamic Emirate of Afghanistan have blocked Kandahar and Uruzgan highway from the use of enemy in Buragamn area of Shah Walikot district of Kandahar province. Saturday 

In [None]:
# Load the model
nlp = spacy.load('/content/Trained_Model/content/Trained Model')

In [None]:
ner = nlp.get_pipe("ner")
labels = ner.labels
print(labels)

('AREA', 'CARDINAL', 'CITY', 'COUNTRY', 'DATE', 'DISTRICT', 'EVENT', 'FAC', 'GPE', 'INCIDENT', 'LANGUAGE', 'LAW', 'LOC', 'LOCATION', 'MONEY', 'NAME', 'NORP', 'ORDINAL', 'ORG', 'ORGANISATION', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'STATE', 'TIME', 'WORK_OF_ART')


In [None]:
# Get output table
df = create_table(nlp, data, "Outputs.csv", save=False)
df.head(3)

Unnamed: 0,NAME,DATE,TIME,AREA,CITY,DISTRICT,STATE,COUNTRY,ORGANISATION,INCIDENT
0,"[Mujahideen, Christian, Mujahideen, Qari Muham...",[4-11-07],"[Sunday night, 9 pm, 12]",[Pashi Band area],[Qallat capital city],[Sewri],[Zabul],[],[Islamic Emirate of Afghanistan],"[ambushed an enemy convoy, three tanks were de..."
1,"[Mujahideen, Mujahideen]",[4-11-07],"[Sunday morning, 10 am]",[Chini area],[],"[Khol abad, Kajaki district]",[Helmand province.],[],[NATO terrorists],[]
2,"[Mujahideen, Mujahideen, Mujahideen]",[],[],"[Kandahar and Uruzgan highway, Buragamn area]",[],[Shah Walikot district],[Kandahar province.],[],[Islamic Emirate of Afghanistan],[]


In [None]:
# Clean the table
df = clean_data(df)
df.head(3)

# Save the output
df.to_csv("Outputs.csv")

In [None]:
# Geocode
geocode("Outputs.csv")

'Pashi Band area''Qallat capital city''Sewri''Zabul'
(32.3068942, 67.129904)
'Chini area''Khol abad', 'Kajaki district''Helmand province.'
(31.0, 64.0)
 'Buragamn area','Kandahar and Uruzgan highway''Shah Walikot district''Kandahar province.'
(30.9910033, 65.7068525)
 'Fareab','Meadan Shar airport'
Location not found
'Khaton area''Ghormach''Badghis province.'
(35.720631, 63.779506)
'Boragan and Kata Sang''Kandahar''Shahwalikot district''Kandahar province'
(30.9910033, 65.7068525)
 'Herat and','Ashigho area''Kandahar''Zhari''Kandahar'
(31.6161203, 65.5755008)
'Baladeh area''Gardize capital city''Paktia province.'
(33.705724, 69.4083555)
 'Ghol azbak area','Torjan area''Khaton''Ghormach', 'Sang Atash' 'Badghis province.','Badghis province.'
(34.7582024, 63.0203661)

Location not found
Done.


In [None]:
# Visualize
df_map = pd.read_csv("geocoded_data.csv")
plot_on_map(df_map)