In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
import re
import spacy
import warnings
import string
warnings.filterwarnings('ignore')

# nltk.download('average_perceptron_tagger')
# nltk.download('punkt_tab')

# Data Loading

In [2]:
def read_data(path):
    sentences = []
    sentence = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # sentence boundary
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line.startswith("-DOCSTART-"):
                continue
            parts = line.split()
            word, pos, chunk, ner = parts
            sentence.append({"word": word, "pos": pos, "chunk": chunk, "ner": ner})
    if sentence:
        sentences.append(sentence)
    return sentences # Returns a list of sentences, inside each sentence a list of dictionaries for each word.

In [3]:
train_sentences = read_data("Dataset/train.txt")
test_sentences  = read_data("Dataset/test.txt")
valid_sentences = read_data("Dataset/valid.txt")

# I think we'll perform NER on the test sentences

In [4]:
full_train_sentences = ' '.join(word for sentence in train_sentences for word in [d['word'] for d in sentence])

# Model-Based NER

In [8]:
# Using (probably) largest model.
nlp = spacy.load('en_core_web_trf')
nlp.max_length = 1100000
doc = nlp(full_train_sentences)
spacy.displacy.render(nlp(full_train_sentences[:2000]), style='ent', jupyter=True)

In [10]:
# Using small model
nlp_small = spacy.load('en_core_web_sm')
nlp_small.max_length = 1100000
doc_small = nlp_small(full_train_sentences)
spacy.displacy.render(nlp_small(full_train_sentences[:2000]), style='ent', jupyter=True)

# Rule-Based NER

In [11]:
# Preparation for rule-based NER
train_sentences_tokenized = word_tokenize(full_train_sentences)
#tests_tokenized_no_puncutations = [words for words in train_sentences_tokenized if words not in string.punctuation]
tagged_train_data = pos_tag(train_sentences_tokenized)
tagged_train_data

[('EU', 'NNP'),
 ('rejects', 'VBZ'),
 ('German', 'JJ'),
 ('call', 'NN'),
 ('to', 'TO'),
 ('boycott', 'VB'),
 ('British', 'JJ'),
 ('lamb', 'NN'),
 ('.', '.'),
 ('Peter', 'NNP'),
 ('Blackburn', 'NNP'),
 ('BRUSSELS', 'NNP'),
 ('1996-08-22', 'CD'),
 ('The', 'DT'),
 ('European', 'JJ'),
 ('Commission', 'NNP'),
 ('said', 'VBD'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('it', 'PRP'),
 ('disagreed', 'VBD'),
 ('with', 'IN'),
 ('German', 'JJ'),
 ('advice', 'NN'),
 ('to', 'TO'),
 ('consumers', 'NNS'),
 ('to', 'TO'),
 ('shun', 'VB'),
 ('British', 'JJ'),
 ('lamb', 'NNS'),
 ('until', 'IN'),
 ('scientists', 'NNS'),
 ('determine', 'VBP'),
 ('whether', 'IN'),
 ('mad', 'JJ'),
 ('cow', 'JJ'),
 ('disease', 'NN'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('transmitted', 'VBN'),
 ('to', 'TO'),
 ('sheep', 'VB'),
 ('.', '.'),
 ('Germany', 'NNP'),
 ("'s", 'POS'),
 ('representative', 'NN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('European', 'NNP'),
 ('Union', 'NNP'),
 ("'s", 'POS'),
 ('veterinary', 'JJ'),
 ('committee', 'NN'),
 

In [12]:
import json

organizations = {pd.read_csv('Dictionaries/Organizations.csv')['orgLabel'].str}
locations = {val for entry in json.load(open("Dictionaries/Cities, Countries, States.json", encoding="utf-8")) for val in (entry["country_name"], entry["state_name"], entry["name"])}
person_first_names = {name.strip() for name in open("Dictionaries/First Names.txt", encoding="utf-8")}
person_last_names = set(pd.read_csv('Dictionaries/Last Names.csv')['name'].str.lower().str.capitalize())

def rule_based_ner(tagged_text):
    entities = []

    # Iteration index
    i = 0
    while i < len(tagged_text):
        token, pos = tagged_text[i]

        # Rule 1: Detect multi-word locations (e.g., "Washington D.C.")
        # This rule checks for a two-word capitalized sequence that is in our locations dictionary.
        if i + 1 < len(tagged_text):
            next_token, next_pos = tagged_text[i+1]
            candidate = f"{token} {next_token}"
            if pos == 'NNP' and next_pos == 'NNP' and candidate in locations:
                entities.append((candidate, "LOCATION"))
                i += 2
                continue

        # Rule 2: Detect person names
        # This rule checks for a two-word sequence of proper nouns (NNP)
        # where the first word is a known first name and the second is a known last name.
        if i + 1 < len(tagged_text):
            next_token, next_pos = tagged_text[i+1]
            if pos == 'NNP' and next_pos == 'NNP' and token in person_first_names and next_token in person_last_names:
                entities.append((f"{token} {next_token}", "PERSON"))
                i += 2
                continue

        # Rule 3: Detect organizations
        # This rule checks for a single capitalized word that is in our organizations dictionary.
        if pos == 'NNP' and token in organizations:
            entities.append((token, "ORGANIZATION"))
            i += 1
            continue

        # Rule 4: Detect single-word locations
        # This rule checks for a single capitalized word that is in our locations dictionary.
        # This should be checked after multi-word locations to avoid partial matches.
        if pos == 'NNP' and token in locations:
            entities.append((token, "LOCATION"))
            i += 1
            continue

        # Default increment
        i += 1

    return entities

In [13]:
found_entities = rule_based_ner(tagged_train_data)
print(found_entities)

[('Peter Blackburn', 'PERSON'), ('Germany', 'LOCATION'), ('Union', 'LOCATION'), ('Union', 'LOCATION'), ('France', 'LOCATION'), ('Loyola', 'LOCATION'), ('France', 'LOCATION'), ('Germany', 'LOCATION'), ('Welsh', 'LOCATION'), ('Union', 'LOCATION'), ('John Lloyd', 'PERSON'), ('Jones', 'LOCATION'), ('Bonn', 'LOCATION'), ('March', 'LOCATION'), ('Germany', 'LOCATION'), ('Jimi', 'LOCATION'), ('Florida', 'LOCATION'), ('London', 'LOCATION'), ('English', 'LOCATION'), ('Nottingham', 'LOCATION'), ('China', 'LOCATION'), ('Taiwan', 'LOCATION'), ('China', 'LOCATION'), ('Taipei', 'LOCATION'), ('Taiwan', 'LOCATION'), ('Vice', 'LOCATION'), ('Lien Chan', 'PERSON'), ('Beijing', 'LOCATION'), ('Taiwan', 'LOCATION'), ('Taiwan', 'LOCATION'), ('China', 'LOCATION'), ('Taipei', 'LOCATION'), ('Taiwan', 'LOCATION'), ('Taiwan', 'LOCATION'), ('Beijing', 'LOCATION'), ('China', 'LOCATION'), ('Taipei', 'LOCATION'), ('Vice', 'LOCATION'), ('China', 'LOCATION'), ('Taiwan', 'LOCATION'), ('China', 'LOCATION'), ('Taiwan', 'LO

In [24]:
ner_results_rulebased = pd.DataFrame(found_entities, columns=['Identified', 'Entity'])
ner_results_rulebased.head()

Unnamed: 0,Identified,Entity
0,Peter Blackburn,PERSON
1,Germany,LOCATION
2,Union,LOCATION
3,Union,LOCATION
4,France,LOCATION


In [25]:
ner_results_rulebased['Entity'].value_counts(normalize=True)

Entity
LOCATION    0.828451
PERSON      0.171549
Name: proportion, dtype: float64

In [16]:
ner_results_large_spacy = pd.DataFrame([(e.label_, e.text) for e in doc.ents], columns=['Entity', 'Identified'])
ner_results_large_spacy.head()

Unnamed: 0,Entity,Identified
0,ORG,EU
1,NORP,German
2,NORP,British
3,PERSON,Peter Blackburn
4,GPE,BRUSSELS


In [26]:
ner_results_large_spacy['Entity'].value_counts(normalize=True)

Entity
GPE            0.198610
PERSON         0.190622
CARDINAL       0.157352
ORG            0.139554
DATE           0.136274
NORP           0.058804
ORDINAL        0.032990
MONEY          0.019984
TIME           0.012977
QUANTITY       0.012333
EVENT          0.012108
PERCENT        0.010427
FAC            0.006306
LOC            0.005325
PRODUCT        0.003924
WORK_OF_ART    0.001429
LAW            0.000561
LANGUAGE       0.000420
Name: proportion, dtype: float64

In [27]:
ner_results_small_spacy = pd.DataFrame([(e.label_, e.text) for e in doc_small.ents], columns=['Entity', 'Identified'])
ner_results_small_spacy.head()

Unnamed: 0,Entity,Identified
0,ORG,EU
1,NORP,German
2,NORP,British
3,PERSON,Peter Blackburn
4,DATE,1996-08-22


In [28]:
ner_results_small_spacy['Entity'].value_counts(normalize=True)

Entity
DATE           0.185113
GPE            0.174123
PERSON         0.170817
CARDINAL       0.167808
ORG            0.127718
NORP           0.059451
ORDINAL        0.029398
MONEY          0.018348
QUANTITY       0.012808
TIME           0.012450
PERCENT        0.009889
PRODUCT        0.009740
LOC            0.006463
EVENT          0.005272
FAC            0.005004
WORK_OF_ART    0.002770
LAW            0.001996
LANGUAGE       0.000834
Name: proportion, dtype: float64