## Create a ner for customer name recognition

### Data

In [467]:
import pandas as pd
import random
from sklearn.model_selection  import train_test_split
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

In [297]:
def construct_customer_name_data_set(in_file_path: str, out_file_path: str, sexe: str='m') -> None:
    """
    Construct a customer name dataset from a base of morrocan names.
    
    :param in_file_path: str
        input file path
    :param out_file_path: str
        output file path
    :param sexe: str
        Sexe, 'f' for female, 'm' for male.
    :return: None
    """
    
    # Read the text files
    in_file = open(in_file_path, 'r')
    out_file = open(out_file_path, 'r')
    
    # Customer name prefix
    if sexe == 'm':
        pre_customer_name = ['M', 'Mr']
    
    if sexe == 'f':
        pre_customer_name = ['Mme']
    
    # Open files an damp cleaned data to out file
    with open(in_file_path, 'r') as in_file:
        with open(out_file_path, 'a') as out_file:
            # Loop through lines
            for line in in_file:
                # join names over '\n' 
                name = '\n'.join(line.split())
                pre = random.choice(pre_customer_name)

                # Take a random predix from prefix array
                out_file.write(pre + ' ' + name)
                out_file.write('\n')

In [295]:
construct_customer_name_data_set('data_m.txt', 'cleaned_m.txt')

In [296]:
construct_customer_name_data_set('data_f.txt', 'cleaned_f.txt', 'f')

In [598]:
def annotate_data(sexe: str='m') -> None:
    """
    Annotate cleaned data.
    
    :param sexe: str
        Sexe, 'f' for female, 'm' for male.
    :return: None
    """
    
    # Holds names with its prefix if present
    clauses = []
    
    tr_data = []

    # Entities, typically will be customer_name
    entities = []
    
    # label
    label = 'PER' 

    # Choose the input file based on the sexe argument
    if sexe == 'm':
        in_file_path = 'cleaned_m.txt'
    else:
        in_file_path = 'cleaned_f.txt'
    
    # Open input file in read mode
    with open(in_file_path) as in_file:
        
        for line in in_file:
            
            line_content = line.split(' ')
            
            # Names without prefixes
            if len(line_content) < 2:
                name = line_content[0].strip('\n')
                name_length = len(name)
                entities.append((0, name_length, label))
                tr_data.append([name, {'entities' : entities}])

            # Name with prefixes 
            elif len(line_content) == 2:
                name = line_content[1].strip('\n')
                prefix = line_content[0]
                
                name_length = len(name)
                prefix_length = len(prefix)
                entities.append((prefix_length+1, prefix_length + name_length + 1, label))
                tr_data.append([line.strip('\n'), {'entities' : entities}])
            # Nevermind
            else:
                pass

            # Initialize entities list
            entities = []
    return tr_data
        

In [599]:
f_set = annotate_data('f')
m_set = annotate_data('m')

In [608]:
def shuffle_combine_data(m_set, f_set):
    """
    Shuffle the data.
    """
    
    # Full data
    data = []
    
    for m_record, f_record in zip(m_set, f_set):
        data.append(m_record)
        data.append(f_record)
    
    # create train, and test sets
    train_set, test_set = train_test_split(data, train_size = 0.7, test_size = 0.3)
    
    # create valid set
    valid_set, test_set = train_test_split(test_set, train_size = 0.5, test_size = 0.5)
    
    return train_set, test_set, valid_set

In [609]:
train_set, test_set, valid_set = shuffle_combine_data(f_set, m_set)

In [610]:
train_set[:10]

[['Nehal', {'entities': [(0, 5, 'PER')]}],
 ['M Chouaib', {'entities': [(2, 9, 'PER')]}],
 ['Hani', {'entities': [(0, 4, 'PER')]}],
 ['Mme Elbatoul', {'entities': [(4, 12, 'PER')]}],
 ['Souhir', {'entities': [(0, 6, 'PER')]}],
 ['Israe', {'entities': [(0, 5, 'PER')]}],
 ['Yassira', {'entities': [(0, 7, 'PER')]}],
 ['Majd', {'entities': [(0, 4, 'PER')]}],
 ['Chaib', {'entities': [(0, 5, 'PER')]}],
 ['Mme Maazouza', {'entities': [(4, 12, 'PER')]}]]

In [611]:
# From : https://stackoverflow.com/questions/67407433/using-spacy-3-0-to-convert-data-from-old-spacy-v2-format-to-the-brand-new-spacy

def convert_spacy_v3(data, out_file = "./train.spacy"):
    nlp = spacy.load("fr_core_news_sm") # load a new spacy model
    db = DocBin() # create a DocBin object

    for text, annot in tqdm(train_set): # data in previous format
        doc = nlp.make_doc(text) # create doc object from text
        ents = []
        for start, end, label in annot["entities"]: # add character indexes
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # label the text with the ents
        db.add(doc)

    db.to_disk(out_file) # save the docbin object

    db = DocBin()

In [632]:
convert_spacy_v3(train_set, out_file = "./train.spacy")
convert_spacy_v3(valid_set, out_file = "./valid.spacy")

 76%|██████████████████████████████████████████████████████████████████████████████████▎                         | 1361/1786 [00:00<00:00, 4228.83it/s]

Skipping entity
Skipping entity


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1786/1786 [00:00<00:00, 4204.12it/s]
 91%|██████████████████████████████████████████████████████████████████████████████████████████████████▍         | 1628/1786 [00:00<00:00, 4046.37it/s]

Skipping entity
Skipping entity


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1786/1786 [00:00<00:00, 3911.39it/s]


In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --verbose --output ./customer_name_ner/training/ --paths.train train.spacy --paths.dev valid.spacy

[i] Saving to output directory: customer_name_ner\training


[2022-03-09 21:59:47,119] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2022-03-09 21:59:50,660] [INFO] Set up nlp object from config
[2022-03-09 21:59:50,660] [DEBUG] Loading corpus from path: valid.spacy
[2022-03-09 21:59:50,660] [DEBUG] Loading corpus from path: train.spacy
[2022-03-09 21:59:50,660] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-03-09 21:59:50,676] [INFO] Created vocabulary
[2022-03-09 21:59:50,676] [INFO] Finished initializing nlp object

[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     83.33   88.30   79.44   99.38    0.88
  9     200          4.03    478.68  100.00  100.00  100.00    1.00
 21     400          0.00      0.00  100.00  100.00  100.00    1.00
 35     600          0.00      0.00  100.00  100.00  100.00    1.00
 53     800          0.00      0.00  100.00  100.00  100.00    1.00
 74    1000          0.00      0.00  100.00  100.00  100.00    1.00
100    1200          0.00      0.00  100.00  100.00  100.00    1.00
131    1400          0.00      0.00  100.00  100.00  100.00    1.00
169    1600          0.00      0.00  100.00  100.00  100.00    1.00
215    1800          0.00      0.00  100.00  100.00  100.00    1.00
[+] Saved pipeline to output directory
customer_name_ner\training\model-


[2022-03-09 21:59:51,740] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[2022-03-09 21:59:51,740] [DEBUG] Loading corpus from path: valid.spacy
[2022-03-09 21:59:51,740] [DEBUG] Loading corpus from path: train.spacy
[2022-03-09 21:59:51,756] [DEBUG] Removed existing output directory: customer_name_ner\training\model-best
[2022-03-09 21:59:51,756] [DEBUG] Removed existing output directory: customer_name_ner\training\model-last


In [653]:
ner = spacy.load("./customer_name_ner/training/model-best") #load the best model

test_sentences = ["""Relové d'ldentts
bancalre

COMMERCI
M ABDELHAK ESSADIQI

POSTE MAROC"""]
for x in test_sentences:
    doc = ner(x)
    for ent in doc.ents:
        spacy.displacy.render(doc, jupyter=True, style = "ent")

In [636]:
# Load pre-existing spacy model
import spacy
nlp=spacy.load('fr_core_news_sm')

# Getting the pipeline component
ner=nlp.get_pipe("ner")

In [None]:
# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [637]:
ner

<spacy.pipeline.ner.EntityRecognizer at 0x1a057f83140>