Named Entity Recognition

In [None]:
# read file

def read_file(filepath):

    sentences = []
    sentence = []

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            parts = line.split()
            if len(parts) != 4:
                continue
            token, pos, chunk, ner = parts
            sentence.append((token, pos, chunk, ner))

        if sentence:
            sentences.append(sentence)

    return sentences

In [None]:
# extracting entity patterns (rule-based)

def extract_entity_patterns(sentences):

    patterns = []
    phrase = []
    current_label = None

    for sentence in sentences:
        for token, pos, chunk, ner in sentence:
            if ner.startswith('B-'):
                if phrase:
                    patterns.append((' '.join(phrase), current_label))
                phrase = [token]
                current_label = ner[2:]

            elif ner.startswith('I-') and current_label:
                phrase.append(token)

            else:
                if phrase:
                    patterns.append((' '.join(phrase), current_label))
                phrase = []
                current_label = None

        if phrase:
            patterns.append((' '.join(phrase), current_label))
            phrase = []
            current_label = None

    return patterns

In [None]:
# get entity types

def get_entity_types(filepath):

    entity_types = set()

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) != 4:
                continue
            ner = parts[3]
            if ner != 'O':
                entity_types.add(ner[2:])

    return sorted(entity_types)

file_path = '/content/train.txt'
entity_types = get_entity_types(file_path)

print("Entity types:")
print(entity_types)


Entity types:
['LOC', 'MISC', 'ORG', 'PER']


In [None]:
# normalization of labels

label_map = {
    "PER": "PERSON",
    "PERSON": "PERSON",
    "LOC": "LOC",
    "GPE": "LOC",
    "ORG": "ORG",
    "FAC": "ORG",
    "MISC": "MISC",
    "PRODUCT": "MISC",
    "EVENT": "MISC",
    "WORK_OF_ART": "MISC",
    "LAW": "MISC",
    "LANGUAGE": "MISC",
    "DATE": "MISC",
    "TIME": "MISC",
    "PERCENT": "MISC",
    "MONEY": "MISC",
    "QUANTITY": "MISC",
    "ORDINAL": "MISC",
    "CARDINAL": "MISC",
    "NORP": "MISC",
}

def normalize_labels(labels):
    normalized = []
    for seq in labels:
        normalized_seq = []
        for label in seq:
            if label == 'O':
                normalized_seq.append(label)
            else:
                prefix = label[:2]  # B- or I-
                ent = label[2:]
                mapped_ent = label_map.get(ent, ent)
                normalized_seq.append(prefix + mapped_ent)
        normalized.append(normalized_seq)
    return normalized

In [None]:
# convert the extracted entity patterns to spacy patterns (rule-based)

def convert_to_spacy_patterns(patterns):

    spacy_patterns = []
    added = set()

    for phrase, label in patterns:
        phrase_lower = phrase.lower()
        if phrase_lower not in added:
            spacy_patterns.append({"label": label, "pattern": phrase})
            added.add(phrase_lower)

    return spacy_patterns

In [None]:
# create spacy model (rule-based)

import spacy
from spacy.pipeline import EntityRuler

def create_spacy_model(spacy_patterns):

    nlp = spacy.blank('en')
    ruler = nlp.add_pipe('entity_ruler')
    ruler.add_patterns(spacy_patterns)
    return nlp

In [None]:
import seqeval
from seqeval.metrics import classification_report, f1_score

def evaluate(nlp, test_sentences):
    gold_labels = []
    pred_labels = []

    total_tokens = 0
    correct_tokens = 0

    for sentence in test_sentences:
        tokens = [token for token, pos, chunk, ner in sentence]
        golds = [ner for token, pos, chunk, ner in sentence]
        text = ' '.join(tokens)
        doc = nlp(text)

        preds = ['O'] * len(tokens)

        for ent in doc.ents:
            ent_tokens = ent.text.split()
            ent_len = len(ent_tokens)
            for i in range(len(tokens) - ent_len + 1):
                window = tokens[i:i+ent_len]
                if [w.lower() for w in window] == [t.lower() for t in ent_tokens]:
                    preds[i] = f"B-{ent.label_}"
                    for j in range(1, ent_len):
                        preds[i+j] = f"I-{ent.label_}"

        gold_labels.append(golds)
        pred_labels.append(preds)

        for g, p in zip(golds, preds):
            total_tokens += 1
            if g == p:
                correct_tokens += 1

    gold_labels = normalize_labels(gold_labels)
    pred_labels = normalize_labels(pred_labels)

    print(classification_report(gold_labels, pred_labels))
    print("F1 Score:", f1_score(gold_labels, pred_labels))


In [None]:
# file paths

train_file = "/content/train.txt"
test_file = "/content/test.txt"

In [None]:
# NER approachs

train_sentences = read_file(train_file) # rule-based
entity_patterns = extract_entity_patterns(train_sentences) # rule-based
spacy_patterns = convert_to_spacy_patterns(entity_patterns) # rule-based

nlp_rule_based = create_spacy_model(spacy_patterns) # rule-based
nlp_md = spacy.load('en_core_web_md') # spacy model md
nlp_lg = spacy.load('en_core_web_lg') # spacy model lg

In [None]:
# adding PER and MISC to model because PER and MISC labels doesn't exist in spacy models

nlp_md.get_pipe("ner").add_label("PER")
nlp_md.get_pipe("ner").add_label("MISC")

nlp_lg.get_pipe("ner").add_label("PER")
nlp_lg.get_pipe("ner").add_label("MISC")

1

In [None]:
# bonus: testing, comparing and evaluating using rule-based and 2 spacy models (md and lg)

test_sentences = read_file(test_file)

print("Rule-Based NER:")
evaluate(nlp_rule_based, test_sentences)

print("\n\nSpacy Model (MD) NER:")
evaluate(nlp_md, test_sentences)

print("\n\nSpacy Model (LG) NER:")
evaluate(nlp_lg, test_sentences)

Rule-Based NER:
              precision    recall  f1-score   support

         LOC       0.61      0.69      0.65      1668
        MISC       0.76      0.57      0.65       702
         ORG       0.71      0.48      0.58      1661
      PERSON       0.57      0.14      0.23      1617

   micro avg       0.66      0.46      0.54      5648
   macro avg       0.66      0.47      0.53      5648
weighted avg       0.65      0.46      0.51      5648

F1 Score: 0.5402250937890788


Spacy Model (MD) NER:
              precision    recall  f1-score   support

         LOC       0.68      0.77      0.72      1668
        MISC       0.11      0.59      0.18       702
         ORG       0.37      0.33      0.35      1661
      PERSON       0.78      0.66      0.71      1617

   micro avg       0.39      0.59      0.47      5648
   macro avg       0.48      0.59      0.49      5648
weighted avg       0.55      0.59      0.54      5648

F1 Score: 0.4675782351282774


Spacy Model (LG) NER:
        

In [None]:
# bonus: visualizing the extracted entities

from spacy import displacy

for i, sentence in enumerate(test_sentences[:20]):
    tokens = [token for token, pos, chunk, ner in sentence]
    text = " ".join(tokens)
    doc = nlp_lg(text)
    displacy.render(doc, style="ent", jupyter=True)

