In [1]:
from ast import literal_eval
import os
from pathlib import Path
import pickle

import augmenty
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [2]:
# Directory that receives the {mode}.spacy corpora written by create_spacy_object.
spacypath = Path("../../data/ner/augmenty")

# Pickled dict of entity values; passed as `ent_dict` to the "ents_replace.v1" augmenter.
entities_for_augmenty = Path("../../data/ner/augmenty/entities_values.pickle")

# Baseline train/valid/test workbooks that are read with pandas further down.
train_filepath = Path("../../data/ner/baseline/train.xlsx")
valid_filepath = Path("../../data/ner/baseline/valid.xlsx")
test_filepath = Path("../../data/ner/baseline/test.xlsx")

# Augmenty 

In [3]:
# augmenters = augmenty.augmenters()

# for augmenter in augmenters:
#     print(augmenter)

# help(augmenty.augmenters()["ents_replace.v1"]) 

In [4]:
# Load augmenters


def _load_with_original(registry_name, **options):
    """Load an augmenty augmenter and wrap it with `augmenty.yield_original`.

    `registry_name` is the augmenter name given to `augmenty.load`; `options`
    are forwarded to `augmenty.load` unchanged.
    """
    return augmenty.yield_original(augmenty.load(registry_name, **options))


# NOTE(review): every augmenter below is wrapped in `augmenty.yield_original`,
# which presumably makes it emit the unmodified input next to the augmented
# one -- confirm that repeating the original document once per augmenter is
# intended, since create_spacy_object already stores the original itself.

# Character-level noise.
char_replace_random_augmenter = _load_with_original("char_replace_random.v1", level=0.5)
keystroke_error_augmenter = _load_with_original("keystroke_error.v1", level=0.3, keyboard="ru.v1")
char_swap_augmenter = _load_with_original("char_swap.v1", level=0.3)

paragraph_subset_augmenter = _load_with_original("paragraph_subset_augmenter.v1")

# Digit substitution table: every digit may be replaced by any of the other
# nine digits. Swapping the digit for "0" inside "123456789" yields exactly
# that list of candidates (for "0" itself the string is left untouched).
replace_dict = {
    digit: list("123456789".replace(digit, "0"))
    for digit in "0123456789"
}
char_replace_augmenter = _load_with_original("char_replace.v1", level=0.2, replace=replace_dict)

# Helper dict with all entity values, used as the replacement pool below.
# SECURITY: pickle.load can execute arbitrary code -- only load this file if
# it comes from a trusted source.
with entities_for_augmenty.open("rb") as f:
    entity_values_dict = pickle.load(f)

ents_replace_augmenter = _load_with_original("ents_replace.v1", level=0.5, ent_dict=entity_values_dict)

# Load the pre-split data and prepare it

In [5]:
# Read the three baseline splits with identical reader settings.
train, valid, test = [
    pd.read_excel(split_path, engine="openpyxl")
    for split_path in (train_filepath, valid_filepath, test_filepath)
]

In [6]:
# Maps the labels found in the markup column to the label names stored on the
# spaCy entities. create_data skips every annotation whose label is not a key
# of this dict. Several source labels collapse onto the same target label
# (e.g. "ПоРаботам" and "ИтоговаяСуммаПоРаботам").
label_mapping = {
    "ИтоговаяСумма": "ИтоговаяСумма",
    "ИтоговаяСуммаПоРаботам": "ИтоговаяСуммаПоРаботам",
    "ИтоговаяСуммаПоДеталям": "ИтоговаяСуммаПоДеталям",
    "Скидки": "ИтоговаяСуммаСкидки",  # short form -> full label
    "ИтоговаяСуммаПоДеталямСоСкидкой": "ИтоговаяСуммаПоДеталямСоСкидкой",
    "ИтоговаяСуммаПоДеталямСкидки": "ИтоговаяСуммаСкидкиПоДеталям",  # word order differs in the target
    "ПоРаботам": "ИтоговаяСуммаПоРаботам",  # short form -> full label
    "ПоДеталям": "ИтоговаяСуммаПоДеталям",  # short form -> full label
    "ИтоговаяСуммаСоСкидкой": "ИтоговаяСуммаСоСкидкой",
    "ИтоговаяСуммаСкидки": "ИтоговаяСуммаСкидки",
    "ИтоговаяСуммаПоРаботамСоСкидкой": "ИтоговаяСуммаПоРаботамСоСкидкой"
}

In [7]:
def create_data(df, textcol="text", labelcol="markup_classicNER"):
    """Turn an annotated DataFrame into a list of `(text, entities)` pairs.

    For every row the `labelcol` cell is parsed with `ast.literal_eval` into a
    sequence of entities; the first three items of each entity are read as
    start, end and label. Entities whose label is not a key of the
    module-level `label_mapping` are dropped, the others are kept as
    `(start, end, mapped_label)` tuples.

    Args:
        df: DataFrame containing the text and markup columns.
        textcol: Name of the column holding the text.
        labelcol: Name of the column holding the string-encoded entity list.

    Returns:
        A list with one `(text, entities)` tuple per row, in row order.
    """
    samples = []

    for _, record in tqdm(df.iterrows(), total=len(df)):
        text = record[textcol]
        raw_entities = literal_eval(record[labelcol])

        # Keep only annotations with a known label and rename them on the way.
        kept_entities = [
            (ent[0], ent[1], label_mapping[ent[2]])
            for ent in raw_entities
            if ent[2] in label_mapping
        ]

        samples.append((text, kept_entities))

    return samples

In [8]:
# Convert each split into (text, entities) pairs; one progress bar per split.
train_data, valid_data, test_data = [
    create_data(split_frame) for split_frame in (train, valid, test)
]

100%|███████████████████████████████████████| 814/814 [00:00<00:00, 6980.59it/s]
100%|███████████████████████████████████████| 283/283 [00:00<00:00, 6436.50it/s]
100%|███████████████████████████████████████| 130/130 [00:00<00:00, 7479.86it/s]


In [9]:
# Number of samples per split (train, valid, test).
tuple(len(split_samples) for split_samples in (train_data, valid_data, test_data))

(814, 283, 130)

In [12]:
# Blank Russian spaCy pipeline with a sentencizer component added. It is used
# both to build the documents and as the `nlp` argument of augmenty.docs.
# NOTE(review): presumably the sentencizer is there so that sentence-based
# augmenters (e.g. paragraph_subset_augmenter) see sentence boundaries -- confirm.
nlp = spacy.blank("ru")
nlp.add_pipe('sentencizer')

# Augmenters handed to do_augmentations by create_spacy_object; each one is
# applied independently to the original document, in this order.
AUGMENTERS = [
    char_replace_random_augmenter,
    keystroke_error_augmenter,
    char_swap_augmenter,
    paragraph_subset_augmenter,
    char_replace_augmenter,
    ents_replace_augmenter
    
]

# Augmentation helper (create_spacy_object calls it for the train and valid modes)
def do_augmentations(doc, augmenters):
    """Run `doc` through every augmenter and collect all resulting documents.

    Each augmenter is applied independently to the same input document via
    `augmenty.docs`, using the module-level `nlp` pipeline. A `ValueError`
    raised while collecting one augmenter's output is swallowed and the loop
    carries on with the next augmenter.

    Args:
        doc: The spaCy document to augment.
        augmenters: Iterable of augmenty augmenters.

    Returns:
        A list with the documents produced by all augmenters, in augmenter order.
    """
    collected = []

    for augment in augmenters:
        produced = augmenty.docs([doc], augmenter=augment, nlp=nlp)
        try:
            # The guard wraps the consumption of `produced`, not the call above.
            # NOTE(review): if `produced` is lazy, documents yielded before the
            # error stay in `collected` -- confirm that this is acceptable.
            collected.extend(produced)
        except ValueError:
            pass

    return collected



def create_spacy_object(data, savepath, mode):
    """Convert `(text, annotations)` pairs into a spaCy `DocBin` and save it.

    Every string text is run through the module-level `nlp` pipeline and its
    annotations are attached as entities. Documents whose entities cannot be
    set (`doc.ents` raises `ValueError`) are skipped and counted. For the
    "train" and "valid" modes the augmented variants returned by
    `do_augmentations` (with the module-level `AUGMENTERS`) are stored next to
    the original document. Items whose text is not a string are printed and
    ignored.

    Args:
        data: Iterable of `(text, annotations)` pairs, where `annotations` is
            an iterable of `(start, end, label)` character-offset triples.
        savepath: Directory for the output file; created if it is missing.
        mode: Split name ("train", "valid", "test", ...). It selects whether
            augmentation is applied and names the output file `{mode}.spacy`.

    Returns:
        The populated `DocBin` (also written to `{savepath}/{mode}.spacy`).
    """
    db = DocBin()
    skipped_docs = 0   # documents dropped because doc.ents rejected their entities
    dropped_spans = 0  # annotations for which char_span produced no usable span

    for text, annotations in data:
        if not isinstance(text, str):
            # Non-string rows cannot be tokenized; show them and move on.
            print(text, annotations)
            continue

        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
            else:
                # Previously these annotations vanished without a trace.
                dropped_spans += 1

        try:
            doc.ents = ents
        except ValueError:
            skipped_docs += 1
            continue

        db.add(doc)
        # NOTE(review): the validation split is augmented as well -- confirm
        # that evaluating on augmented documents is intended.
        if mode in ("train", "valid"):
            # A separate loop name keeps the original `doc` from being shadowed.
            for augmented_doc in do_augmentations(doc, augmenters=AUGMENTERS):
                db.add(augmented_doc)

    # Make sure the target directory exists before writing the corpus.
    Path(savepath).mkdir(parents=True, exist_ok=True)
    dbpath = f"{savepath}/{mode}.spacy"
    db.to_disk(dbpath)
    print(f"Saved to {dbpath}. {skipped_docs} Docs were not processed")
    if dropped_spans:
        print(f"{dropped_spans} annotations could not be mapped to a token span and were dropped")
    return db

In [13]:
# Build and save one DocBin per split, in the order train -> valid -> test.
db_train, db_valid, db_test = [
    create_spacy_object(data=split_data, savepath=spacypath, mode=split_name)
    for split_data, split_name in (
        (train_data, "train"),
        (valid_data, "valid"),
        (test_data, "test"),
    )
]

Saved to ../../data/ner/augmenty/train.spacy. 71 Docs were not processed
Saved to ../../data/ner/augmenty/valid.spacy. 25 Docs were not processed
Saved to ../../data/ner/augmenty/test.spacy. 10 Docs were not processed


In [14]:
# Number of stored documents per DocBin (train, valid, test).
tuple(len(doc_bin) for doc_bin in (db_train, db_valid, db_test))

(9503, 3274, 120)

# Create config

In [None]:
%%bash

# Expand base_config.cfg into the complete config.cfg that the training command reads.
python -m spacy init fill-config base_config.cfg config.cfg

# Run training (better from the command line)

In [None]:
%%bash

# Train on the corpora this notebook writes via create_spacy_object
# (../../data/ner/augmenty/{train,valid}.spacy); the previous ./spacy/... paths
# did not point at those files. Adjust the paths when running from another directory.
python -m spacy train config.cfg --gpu-id 0 --output ./output \
    --paths.train ../../data/ner/augmenty/train.spacy \
    --paths.dev ../../data/ner/augmenty/valid.spacy