## Retrain with 3 entities - PERSON, ORG, GPE

In [1]:
import json
import spacy
import numpy as np
# Load the pretrained 'ja_core_news_lg' pipeline. spacy_retrain() below reads this
# notebook-level `nlp` object and updates it in place.
nlp = spacy.load('ja_core_news_lg')

In [None]:
def spacy_retrain(train_data_file, new_model_location, n_iter=50, drop=0.5, seed=999):
    """Fine-tune the ``ner`` component of the notebook-level ``nlp`` pipeline and save it.

    The training file is JSON with a top-level ``"train_data"`` key holding a
    list of ``(text, annotations)`` pairs. The annotations are handed to
    ``nlp.update`` unchanged (presumably spaCy v2 style
    ``{"entities": [[start, end, label], ...]}`` dicts -- confirm against the
    training file).

    Note: the notebook-level ``nlp`` object is updated in place, so calling
    this twice without reloading the base model keeps training the
    already-updated pipeline.

    Args:
        train_data_file: Path to the UTF-8 encoded JSON training file.
        new_model_location: Directory passed to ``nlp.to_disk``.
        n_iter: Number of passes over the training data (default 50).
        drop: Dropout rate passed to ``nlp.update`` (default 0.5).
        seed: Seed applied to NumPy and spaCy for repeatable runs (default 999).

    Raises:
        KeyError: If the JSON document has no ``"train_data"`` key.
    """
    with open(train_data_file, mode='r', encoding='utf8') as train_file:
        train_data = json.load(train_file)["train_data"]

    # Fix the seeds so repeated runs produce the same results.
    np.random.seed(seed)
    spacy.util.fix_random_seed(seed)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    print(f'[OtherPipes] = {other_pipes} will be disabled')

    texts = [text for text, _ in train_data]
    annotations = [annotation for _, annotation in train_data]

    with nlp.disable_pipes(*other_pipes):  # only train ner
        # The pipeline is always an already-loaded model here, so training is
        # resumed. The previous `begin_training()` branch could never run
        # (it was guarded by `model is None` right after `model = nlp`).
        optimizer = nlp.resume_training()
        for _ in range(n_iter):
            losses = {}
            # The whole training set is passed as one batch on every pass.
            nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)
            print("Losses", losses)

    nlp.to_disk(new_model_location)


# Run the retraining on the 3-entity training file and save the result.
# NOTE(review): train_file is an absolute path on the author's machine; adjust it
# before re-running elsewhere.
train_file = r'/home/kritika/Downloads/Uniphore/Training_set/FA_train_spacy3e_format.json'
new_model_loc = r'./model'  # relative path, resolved against the current working directory
spacy_retrain(train_file, new_model_loc)

## Retrain with 10 entities (2 new - "EMAIL", "URL")

In [None]:
import json
import spacy
import numpy as np
# Reload the pretrained 'ja_core_news_lg' pipeline so the retraining below starts
# from a fresh `nlp` object; spacy_retrain() updates it in place.
nlp = spacy.load('ja_core_news_lg')

In [None]:
def spacy_retrain(train_data_file, new_model_location, n_iter=50, drop=0.5, seed=999):
    """Fine-tune the ``ner`` component of the notebook-level ``nlp`` pipeline and save it.

    Every entity label found in the training data is registered on the
    ``ner`` pipe before training, so labels the base model does not know yet
    can be learned.

    The training file is JSON with a top-level ``"train_data"`` key holding a
    list of ``(text, annotations)`` pairs. Each ``annotations`` is a dict whose
    optional ``"entities"`` value is a list of items with the label at index 2
    (presumably ``[start, end, label]`` -- confirm against the training file).

    Note: the notebook-level ``nlp`` object is updated in place, so calling
    this twice without reloading the base model keeps training the
    already-updated pipeline.

    Args:
        train_data_file: Path to the UTF-8 encoded JSON training file.
        new_model_location: Directory passed to ``nlp.to_disk``.
        n_iter: Number of passes over the training data (default 50).
        drop: Dropout rate passed to ``nlp.update`` (default 0.5).
        seed: Seed applied to NumPy and spaCy for repeatable runs (default 999).

    Raises:
        KeyError: If the JSON document has no ``"train_data"`` key.
    """
    with open(train_data_file, mode='r', encoding='utf8') as train_file:
        train_data = json.load(train_file)["train_data"]

    # Fix the seeds so repeated runs produce the same results.
    np.random.seed(seed)
    spacy.util.fix_random_seed(seed)

    # Collect each distinct label once, in order of first appearance. Examples
    # without an "entities" entry are tolerated instead of raising TypeError.
    labels = {}
    for _, annotation in train_data:
        for ent in annotation.get("entities") or ():
            labels.setdefault(ent[2], None)
    ner = nlp.get_pipe('ner')
    for label in labels:
        ner.add_label(label)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    print(f'[OtherPipes] = {other_pipes} will be disabled')

    texts = [text for text, _ in train_data]
    annotations = [annotation for _, annotation in train_data]

    with nlp.disable_pipes(*other_pipes):  # only train ner
        # The pipeline is always an already-loaded model here, so training is
        # resumed. The previous `begin_training()` branch could never run
        # (it was guarded by `model is None` right after `model = nlp`).
        optimizer = nlp.resume_training()
        for _ in range(n_iter):
            losses = {}
            # The whole training set is passed as one batch on every pass.
            nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)
            print("Losses", losses)

    nlp.to_disk(new_model_location)


# Run the retraining on the 10-entity training file and save the result.
# NOTE(review): train_file is an absolute path on the author's machine; adjust it
# before re-running elsewhere.
train_file = r'/home/kritika/Downloads/Uniphore/Training_set/FA_train_spacy10e_format.json'
new_model_loc = r'./model'  # relative path, resolved against the current working directory
spacy_retrain(train_file, new_model_loc)

[OtherPipes] = ['parser'] will be disabled


In [1]:
import json
import spacy
import numpy as np

# Load a saved pipeline from disk as `nlp2` for the prediction checks below.
# NOTE(review): the training cells save to the relative path './model', while this
# loads an absolute path -- confirm both point at the same directory.
output_dir = r'/home/kritika/Downloads/Uniphore/Training_set/model'
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

Loading from /home/kritika/Downloads/Uniphore/model


In [14]:
# Sample input; the three comma-separated items are: Shinchan, a building name,
# and Kiichiro Toyoda.
test_text = "しんちゃん, 出雲大社, 豊田喜一郎"
doc2 = nlp2(test_text)
# Print every predicted entity as "<label> <text>".
for ent in doc2.ents:
    print(ent.label_, ent.text)

PERSON しん
GPE 出雲大社
PERSON 豊田喜一郎


In [11]:
# Address sample written in katakana: 248-1005, Tangocho Ienotani, Kyotango-shi, Kyoto.
test_text = "キョウトフ, キョウタンゴシ, タンゴチョウイエノタニ, 248-1005"
doc2 = nlp2(test_text)
# Print every predicted entity as "<label> <text>".
for ent in doc2.ents:
    print(ent.label_, ent.text)    # observed: "キョウトフ" (Kyoto) is predicted as a PERSON name

PERSON キョウトフ
GPE キョウタンゴシ
GPE タンゴチョウイエノタニ


In [4]:
# Offset check for a hand-written entity annotation: print the character index
# where "AE-1-V-6-C" starts inside the sentence, then the number of characters it spans.
a = "はい。AE-1-V-6-C。"
print(a.find("AE-1-V-6-C"), len("AE-1-V-6-C"), sep="\n")

3
10
