# Creating a NER model

## 1. Test

In [25]:
from flair.data import Sentence
from flair.models import SequenceTagger
import nltk

In [2]:
# Sample text mentioning a person, a city and a year, wrapped in a flair
# Sentence so that it can be handed to the tagger.
raw_sentence = "Hi! My name is Paul Déchorgnat and I am an engineer based in Paris since 2014"
sentence = Sentence(raw_sentence)

In [3]:
# Load the pretrained tagger registered under the name "ner"; the log below
# shows it resolving to a cached ner-english model file.
tagger = SequenceTagger.load("ner")



2022-10-04 12:13:31,992 loading file /home/paul/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-10-04 12:13:33,954 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [4]:
# Run the tagger on the sentence; the predicted spans are read back from
# `sentence` in the following cells.
tagger.predict(sentence)

In [5]:
# Print every "ner" span found on the sentence (text, label and score).
for entity in sentence.get_spans("ner"):
    print(entity)

Span[5:7]: "Paul Déchorgnat" → PER (0.9972)
Span[14:15]: "Paris" → LOC (0.9996)


In [6]:
# `entity` is the loop variable left over from the previous cell, i.e. the
# last span that was printed.
entity.tag, entity.text


('LOC', 'Paris')

In [7]:
# Collect the predicted spans as {entity text: tag}. A text that appears in
# several spans keeps the tag of the last one.
entities = {}

for entity in sentence.get_spans("ner"):
    entities[entity.text] = entity.tag
    
entities

{'Paul Déchorgnat': 'PER', 'Paris': 'LOC'}

In [8]:
# Prototype of the anonymization step: swap each LOC / PER entity for a
# numbered placeholder such as PER_0 or LOC_0.
tag_counter = {}  # next free placeholder index for each tag

new_sentence = raw_sentence

for text, tag in entities.items():
    if tag in ["LOC", "PER"]:
        counter = tag_counter.get(tag, 0)
        # str.replace substitutes every occurrence of the entity text.
        new_sentence = new_sentence.replace(text, f"{tag}_{counter}")
        tag_counter[tag] = counter + 1
print(new_sentence)

Hi! My name is PER_0 and I am an engineer based in LOC_0 since 2014


## 2. Applying to data

In [11]:
import os

In [12]:
os.listdir("../data/")

['CNN_Articels_clean', 'CNN_Articels_clean_2']

In [13]:
import pandas as pd

In [14]:
# Load the CNN articles CSV into a DataFrame.
df = pd.read_csv("../data/CNN_Articels_clean/CNN_Articels_clean.csv")

In [15]:
df.head()

Unnamed: 0,Index,Author,Date published,Category,Section,Url,Headline,Description,Keywords,Second headline,Article text
0,0,"Jacopo Prisco, CNN",2021-07-15 02:46:59,news,world,https://www.cnn.com/2021/07/14/world/tusimple-...,"There's a shortage of truckers, but TuSimple t...",The e-commerce boom has exacerbated a global t...,"world, There's a shortage of truckers, but TuS...","There's a shortage of truckers, but TuSimple t...","(CNN)Right now, there's a shortage of truck d..."
1,2,"Stephanie Bailey, CNN",2021-05-12 07:52:09,news,world,https://www.cnn.com/2021/05/12/world/ironhand-...,Bioservo's robotic 'Ironhand' could protect fa...,Working in a factory can mean doing the same t...,"world, Bioservo's robotic 'Ironhand' could pro...",A robotic 'Ironhand' could protect factory wor...,(CNN)Working in a factory or warehouse can me...
2,3,"Words by Stephanie Bailey, video by Zahra Jamshed",2021-06-16 02:51:30,news,asia,https://www.cnn.com/2021/06/15/asia/swarm-robo...,This swarm of robots gets smarter the more it ...,"In a Hong Kong warehouse, a swarm of autonomou...","asia, This swarm of robots gets smarter the mo...",This swarm of robots gets smarter the more it ...,"(CNN)In a Hong Kong warehouse, a swarm of aut..."
3,4,"Paul R. La Monica, CNN Business",2022-03-15 09:57:36,business,investing,https://www.cnn.com/2022/03/15/investing/brics...,Russia is no longer an option for investors. T...,"For many years, the world's most popular emerg...","investing, Russia is no longer an option for i...",Russia is no longer an option for investors. T...,"New York (CNN Business)For many years, the wor..."
4,7,Reuters,2022-03-15 11:27:02,business,business,https://www.cnn.com/2022/03/15/business/russia...,Russian energy investment ban part of new EU s...,The European Union formally approved on Tuesda...,"business, Russian energy investment ban part o...",EU bans investment in Russian energy in new sa...,The European Union formally approved on Tuesda...


In [30]:
# NOTE: `anonymize_sentence` is defined further down, in section 3. That cell
# must have been executed before this one (the execution counts are out of
# order), so this cell fails on a fresh kernel run from top to bottom.
anonymize_sentence(tagger, df["Article text"][0])

' (CNN)Right now, there\'s a shortage of truck drivers in the LOC_0 and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple.Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the LOC_0, LOC_1 and LOC_2. Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world\'s largest truck manufacturers -- Navistar in the LOC_0 and Traton, Volkswagen\'s trucking business, in LOC_2 -- to design and build fully autonomous models, which it hopes to launch by 2024. Photos: The Yara Birkeland is what its builders call the world\'s first zero-emission, autonomous cargo ship. The ship is scheduled to make its first journey between two Norwegian towns before the end of the year. Click through

In [19]:
sentence = Sentence(df["Article text"][0])
%time tagger.predict(sentence)

CPU times: user 6min 29s, sys: 532 ms, total: 6min 30s
Wall time: 1min 37s


In [31]:
sentence = [Sentence(i) for i in nltk.sent_tokenize(df["Article text"][0])]
%time tagger.predict(sentence) # https://github.com/flairNLP/flair/issues/7#issuecomment-406067806 was right

CPU times: user 1min 8s, sys: 756 ms, total: 1min 9s
Wall time: 17.5 s


In [23]:
sentence = Sentence("France Martin was a sweet child. She lived in Paris, France.")
%time tagger.predict(sentence)

CPU times: user 1.97 s, sys: 7.92 ms, total: 1.97 s
Wall time: 504 ms


In [24]:
for entity in sentence.get_spans("ner"):
    print(entity)

Span[0:1]: "France" → LOC (0.9634)
Span[1:2]: "Martin" → PER (0.5726)
Span[10:11]: "Paris" → LOC (0.9971)
Span[12:13]: "France" → LOC (0.9999)


## 3. Defining functions

In [36]:
def get_entities(tagger, raw_sentence, tags_to_anonymize=("LOC", "PER", "ORG")):
    """Map every entity of ``raw_sentence`` that must be anonymized to a placeholder.

    The text is split with ``nltk.sent_tokenize`` and the resulting sentences
    are tagged together in a single ``tagger.predict`` call.

    Args:
        tagger: Object exposing ``predict(sentences)``; after the call each
            sentence must expose its spans through ``get_spans("ner")``
            (the notebook passes a flair ``SequenceTagger``).
        raw_sentence: Text to analyse; may contain several sentences.
        tags_to_anonymize: Entity tags to replace. Entities carrying any other
            tag are left out of the result, so they are never substituted.

    Returns:
        dict: ``{entity text: "<TAG>_<n>"}`` where ``n`` counts, per tag and
        starting at 0, the distinct entity texts in order of first appearance.
        When the same text is tagged several times, the last tag seen wins.
    """
    sentences = [Sentence(s) for s in nltk.sent_tokenize(raw_sentence)]
    if not sentences:
        # Nothing to tag: skip the model call altogether.
        return {}

    tagger.predict(sentences)

    # First pass: remember the tag of every distinct entity text.
    tag_by_text = {}
    for s in sentences:
        for entity in s.get_spans("ner"):
            tag_by_text[entity.text] = entity.tag

    # Second pass: build a fresh dict holding only the entities to anonymize.
    # Keeping other entities mapped to their bare tag (e.g. "MISC") would make
    # the replacement step overwrite them with that tag.
    placeholders = {}
    tag_counter = {}
    for text, tag in tag_by_text.items():
        if tag not in tags_to_anonymize:
            continue
        index = tag_counter.get(tag, 0)
        placeholders[text] = f"{tag}_{index}"
        tag_counter[tag] = index + 1
    return placeholders

def replace_text(raw_sentence, entities):
    """Return ``raw_sentence`` with every key of ``entities`` replaced by its value.

    All substitutions happen in a single left-to-right pass, so text that was
    just inserted is never rewritten by another entity. When several entity
    texts match at the same position, the longest one is used (so
    "Paris Hilton" is not broken up by "Paris").

    Args:
        raw_sentence: Text in which to substitute.
        entities: Mapping ``{text to find: replacement text}``.

    Returns:
        str: The substituted text; ``raw_sentence`` unchanged when
        ``entities`` has no non-empty key.
    """
    import re  # local import so this definition does not rely on another cell

    # An empty key would match between every pair of characters: ignore it.
    targets = [text for text in entities if text]
    if not targets:
        return raw_sentence

    # Alternation tries its branches in order, so longest-first gives the
    # longest match at each position.
    targets.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(text) for text in targets))
    return pattern.sub(lambda match: entities[match.group(0)], raw_sentence)


def anonymize_sentence(tagger, raw_sentence, tags_to_anonymize=("LOC", "PER", "ORG")):
    """Replace the named entities of ``raw_sentence`` with numbered placeholders.

    Thin wrapper: ``get_entities`` builds the ``{entity text: placeholder}``
    mapping and ``replace_text`` applies it to the original text.

    Args:
        tagger: Tagger forwarded to ``get_entities``.
        raw_sentence: Text to anonymize; may contain several sentences.
        tags_to_anonymize: Entity tags forwarded to ``get_entities``. The
            default is a tuple so that it cannot be mutated between calls.

    Returns:
        The value returned by ``replace_text(raw_sentence, entities)``.
    """
    entities = get_entities(
        tagger=tagger,
        raw_sentence=raw_sentence,
        tags_to_anonymize=tags_to_anonymize,
    )
    return replace_text(raw_sentence=raw_sentence, entities=entities)

In [37]:
# Smoke test on a two-sentence text with two people sharing a surname and
# two different cities.
anonymize_sentence(tagger, raw_sentence="Hi ! This is Paul Déchorgnat and this is Alain Déchorgnat. One is living in Paris and the other is living in Rennes")