In [1]:
from ast import literal_eval
import os
from pathlib import Path 

import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [22]:
# Directory with the baseline NER data; it is also passed as the save
# location for the generated .spacy files.
spacypath = Path("../../data/ner/baseline")

# Excel workbooks, one per split, located directly inside the base directory.
train_filepath, valid_filepath, test_filepath = (
    spacypath / f"{split}.xlsx" for split in ("train", "valid", "test")
)

# Prepare data and split it

In [3]:
# Load the three workbooks in train / valid / test order.
train, valid, test = (
    pd.read_excel(workbook, engine="openpyxl")
    for workbook in (train_filepath, valid_filepath, test_filepath)
)

In [4]:
# Raw annotation label -> label written to the training data.
# create_data skips every entity whose raw label is not a key here.
label_mapping = {
    # Labels kept under their own name.
    **{
        name: name
        for name in (
            "ИтоговаяСумма",
            "ИтоговаяСуммаПоРаботам",
            "ИтоговаяСуммаПоДеталям",
            "ИтоговаяСуммаПоДеталямСоСкидкой",
            "ИтоговаяСуммаСоСкидкой",
            "ИтоговаяСуммаСкидки",
            "ИтоговаяСуммаПоРаботамСоСкидкой",
        )
    },
    # Labels renamed to a different output label.
    "Скидки": "ИтоговаяСуммаСкидки",
    "ИтоговаяСуммаПоДеталямСкидки": "ИтоговаяСуммаСкидкиПоДеталям",
    "ПоРаботам": "ИтоговаяСуммаПоРаботам",
    "ПоДеталям": "ИтоговаяСуммаПоДеталям",
}

In [16]:
def create_data(df, textcol="text", labelcol="markup_classicNER", mapping=None):
    """Convert an annotated DataFrame into ``(text, entities)`` pairs.

    Args:
        df: DataFrame with one document per row.
        textcol: Name of the column holding the document text.
        labelcol: Name of the column holding the markup, stored as a string
            containing a Python literal: a sequence of items whose first
            three elements are ``start``, ``end`` and ``label``.
        mapping: Optional dict from raw label to output label. When omitted,
            the notebook-level ``label_mapping`` is used.

    Returns:
        A list with one ``(text, entities)`` tuple per row, where ``entities``
        is a list of ``(start, end, mapped_label)`` tuples. Entities whose raw
        label is not a key of the mapping are dropped.

    Raises:
        ValueError / SyntaxError: propagated from ``literal_eval`` when a
            markup cell is present but is not a valid Python literal.
    """
    if mapping is None:
        mapping = label_mapping

    data = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        line = row[textcol]

        raw_markup = row[labelcol]
        # An empty Excel cell is read as NaN (a float), which literal_eval
        # rejects; treat a missing markup cell as "no entities" instead.
        markup_missing = raw_markup is None or (
            isinstance(raw_markup, float) and pd.isna(raw_markup)
        )
        entities = [] if markup_missing else literal_eval(raw_markup)

        entities_filtered = []
        for entity in entities:
            # Index instead of unpacking: only the first three elements are used.
            start, end, label = entity[0], entity[1], entity[2]

            if label not in mapping:
                continue

            entities_filtered.append((start, end, mapping[label]))

        data.append((line, entities_filtered))

    return data

In [6]:
# Build (text, entities) pairs for each split, in train / valid / test order.
train_data, valid_data, test_data = (
    create_data(frame) for frame in (train, valid, test)
)

100%|███████████████████████████████████████| 814/814 [00:00<00:00, 4011.61it/s]
100%|███████████████████████████████████████| 283/283 [00:00<00:00, 9696.74it/s]
100%|███████████████████████████████████████| 130/130 [00:00<00:00, 7393.45it/s]


In [7]:
# Number of documents per split (train, valid, test).
tuple(map(len, (train_data, valid_data, test_data)))

(814, 283, 130)

In [20]:
# Blank Russian pipeline; used below only to turn raw text into Doc objects.
nlp = spacy.blank("ru")

def create_spacy_object(data, savepath, mode):
    """Serialize ``(text, annotations)`` pairs into ``<savepath>/<mode>.spacy``.

    Args:
        data: Iterable of ``(text, annotations)`` pairs, where ``annotations``
            is an iterable of ``(start, end, label)`` character-offset triples.
        savepath: Directory to write into (created if it does not exist).
        mode: File stem of the output, e.g. ``"train"``.

    Returns:
        The ``DocBin`` that was written to disk.

    Notes:
        * Items whose text is not a string are printed and skipped.
        * Annotations for which ``doc.char_span`` yields no span are dropped
          and counted; the count is reported if it is non-zero.
        * A document is skipped entirely when assigning ``doc.ents`` raises
          ``ValueError``; the number of such documents is reported.
    """
    db = DocBin()
    rejected_docs = 0
    dropped_spans = 0

    for text, annotations in data:
        # isinstance also accepts str subclasses, unlike an exact type comparison.
        if not isinstance(text, str):
            print(text, annotations)
            continue

        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
            else:
                # No usable span for these character offsets: keep track of
                # it instead of discarding the annotation silently.
                dropped_spans += 1

        try:
            doc.ents = ents
        except ValueError:
            # spaCy refused this set of entities, so the whole document is
            # left out of the DocBin.
            rejected_docs += 1
            continue

        db.add(doc)

    # Make sure the target directory exists before writing.
    Path(savepath).mkdir(parents=True, exist_ok=True)
    dbpath = f"{savepath}/{mode}.spacy"
    db.to_disk(dbpath)
    print(f"Saved to {dbpath}. {rejected_docs} Docs were not processed")
    if dropped_spans:
        print(f"{dropped_spans} entity spans could not be aligned to tokens and were dropped ({mode})")
    return db

In [23]:
# Serialize every split into spacypath, in train / valid / test order.
db_train, db_valid, db_test = (
    create_spacy_object(data=split_data, savepath=spacypath, mode=split_name)
    for split_name, split_data in (
        ("train", train_data),
        ("valid", valid_data),
        ("test", test_data),
    )
)

Saved to ../../data/ner/baseline/train.spacy. 71 Docs were not processed
Saved to ../../data/ner/baseline/valid.spacy. 25 Docs were not processed
Saved to ../../data/ner/baseline/test.spacy. 10 Docs were not processed


# Create config

In [None]:
%%bash

# Generate config.cfg from base_config.cfg using spaCy's fill-config command.
# base_config.cfg is expected in the notebook's working directory.
python -m spacy init fill-config base_config.cfg config.cfg

# Run training (better run from the command line)

In [None]:
%%bash

# Train with config.cfg on GPU 0 and write the model to ./output.
# The corpora are read from ../../data/ner/baseline, the directory where this
# notebook saves train.spacy and valid.spacy (see `spacypath`).
python -m spacy train config.cfg --gpu-id 0 --output ./output --paths.train ../../data/ner/baseline/train.spacy --paths.dev ../../data/ner/baseline/valid.spacy