# Solution 2 (best solution)

In [7]:
%%capture
# Download spaCy's ru_core_web_sm package; %%capture hides the installer output.
# NOTE(review): the visible cells build documents with spacy.blank("ru") and later
# load "model-best" -- confirm that base_config.cfg actually needs ru_core_web_sm.
!python -m spacy download ru_core_web_sm

In [30]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
from sklearn.model_selection import train_test_split
import shutil
import warnings

warnings.filterwarnings("ignore")

In [31]:
# Load the annotated training corpus (JSON Lines: one record per line).
train_original_df = pd.read_json("../data/train.jsonl", lines=True)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_df, val_df = train_test_split(
    train_original_df,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Collect every distinct entity label found in the full training data.
# Each annotation is indexed as ner[2] for its label.
labels = {
    ner[2]
    for _, row in train_original_df.iterrows()
    for ner in row["ners"]
}

# Number the labels in alphabetical order: index -> label name.
label_mapping = dict(enumerate(sorted(labels)))

label_mapping

{0: 'AGE',
 1: 'AWARD',
 2: 'CITY',
 3: 'COUNTRY',
 4: 'CRIME',
 5: 'DATE',
 6: 'DISEASE',
 7: 'DISTRICT',
 8: 'EVENT',
 9: 'FACILITY',
 10: 'FAMILY',
 11: 'IDEOLOGY',
 12: 'LANGUAGE',
 13: 'LAW',
 14: 'LOCATION',
 15: 'MONEY',
 16: 'NATIONALITY',
 17: 'NUMBER',
 18: 'ORDINAL',
 19: 'ORGANIZATION',
 20: 'PENALTY',
 21: 'PERCENT',
 22: 'PERSON',
 23: 'PRODUCT',
 24: 'PROFESSION',
 25: 'RELIGION',
 26: 'STATE_OR_PROVINCE',
 27: 'TIME',
 28: 'WORK_OF_ART'}

In [33]:
def dataframe_to_spacy(df, output_file_path):
    """Convert annotated rows of ``df`` into a spaCy ``DocBin`` file.

    Every row must provide the raw text in ``"sentences"`` and a list of
    ``(start, end, label)`` annotations in ``"ners"``. One ``Doc`` is created
    per row with a blank Russian pipeline, its entities are set from the
    usable annotations, and all docs are written to ``output_file_path``.

    Args:
        df: DataFrame with ``"sentences"`` and ``"ners"`` columns.
        output_file_path: Where the serialized ``DocBin`` is saved.
    """
    # A blank pipeline is enough here: it is only used to tokenize the text.
    blank_nlp = spacy.blank("ru")
    collected_docs = DocBin()

    for _, record in tqdm(df.iterrows(), total=len(df)):
        doc = blank_nlp.make_doc(record["sentences"])

        # `end + 1` turns the annotation's end offset into the end offset
        # expected by char_span (presumably the dataset uses inclusive ends --
        # TODO confirm). With alignment_mode="contract" the span is shrunk to
        # the tokens that lie fully inside the character range.
        candidate_spans = (
            doc.char_span(start, end + 1, label=label, alignment_mode="contract")
            for start, end, label in record["ners"]
        )

        # Drop annotations that could not be aligned to tokens (None) and
        # spans whose text carries leading/trailing whitespace.
        usable_spans = [
            span
            for span in candidate_spans
            if span is not None and span.text == span.text.strip()
        ]

        # doc.ents must not contain overlapping spans; filter_spans resolves
        # the overlaps before the entities are assigned.
        doc.ents = filter_spans(usable_spans)
        collected_docs.add(doc)

    # Persist all docs in a single file.
    collected_docs.to_disk(output_file_path)

In [34]:
# Serialize the training split; this file is passed to `spacy train` as --paths.train.
dataframe_to_spacy(train_df, "training_data.spacy")

100%|██████████| 415/415 [00:01<00:00, 244.67it/s]


In [35]:
# Serialize the validation split; this file is passed to `spacy train` as --paths.dev.
dataframe_to_spacy(val_df, "validation_data.spacy")

100%|██████████| 104/104 [00:00<00:00, 203.93it/s]


In [36]:
# Expand base_config.cfg into a complete training config (config.cfg) with all values filled in.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# Train with config.cfg on the two .spacy files created by dataframe_to_spacy.
# --output ./ writes the resulting model directories into the current folder;
# --gpu-id 0 selects GPU 0 for training.
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./validation_data.spacy --gpu-id 0

In [38]:
# Remove the last model (save only the best model): "model-best" is the one
# loaded for prediction. ignore_errors=True keeps this cell from failing when
# the "model-last" directory does not exist (e.g. on a re-run).
shutil.rmtree('model-last', ignore_errors=True)

In [39]:
# Read dev and test data (JSON Lines); both frames are only used for prediction.
# NOTE(review): in these files the text column appears to be spelled "senences"
# (the name predict_ner_df looks up), unlike "sentences" in train.jsonl -- confirm.
dev_df = pd.read_json("../data/dev.jsonl", lines=True)
test_df = pd.read_json("../data/test.jsonl", lines=True)

In [40]:
# Load the trained model from the "model-best" directory.
# To use an already trained model instead, unzip it from the ../models folder
# into the current folder (solution-2).
nlp_ner = spacy.load("model-best")

In [41]:
def predict_ner(text):
    """Run the loaded NER pipeline (``nlp_ner``) on ``text``.

    Returns:
        A list with one ``(start_char, end_char, label)`` tuple per entity
        in the processed document, using the offsets reported by spaCy.
    """
    entities = []
    for entity in nlp_ner(text).ents:
        entities.append((entity.start_char, entity.end_char, entity.label_))
    return entities

In [42]:
def predict_ner_df(df, output_file_path, text_column=None):
    """Predict entities for every row of ``df`` and save them as JSON Lines.

    Args:
        df: DataFrame holding one text per row. It is not modified; the
            predictions are added to a copy.
        output_file_path: Destination of the JSON Lines file (one record per
            row, with the predictions in a ``"ners"`` column).
        text_column: Name of the column that holds the text. When ``None``
            (the default) the column is auto-detected: ``"senences"`` -- the
            spelling this function has always read -- is used if present,
            otherwise ``"sentences"``, the spelling ``dataframe_to_spacy``
            reads from the training data.

    Raises:
        KeyError: If the text column cannot be found in ``df``.
    """
    if text_column is None:
        # Previously only the misspelled "senences" column was supported, so
        # frames shaped like the training data ("sentences") raised KeyError.
        # The old name is tried first so existing inputs behave as before.
        text_column = next(
            (name for name in ("senences", "sentences") if name in df.columns),
            None,
        )
        if text_column is None:
            raise KeyError(
                "no text column found: expected 'senences' or 'sentences', "
                f"got columns {list(df.columns)}"
            )
    elif text_column not in df.columns:
        raise KeyError(text_column)

    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    ners_col = []
    for text in tqdm(df[text_column], total=len(df)):
        # predict_ner returns spaCy's end_char; subtracting 1 mirrors the
        # `end + 1` applied in dataframe_to_spacy, so the saved offsets use
        # the same convention as the training annotations.
        ners_col.append(
            [(start, end - 1, label) for start, end, label in predict_ner(text)]
        )

    # Add the predicted NER tags to the dataframe
    df["ners"] = ners_col

    # Save the dataframe as a jsonl file
    df.to_json(output_file_path, orient="records", lines=True)

In [43]:
# Write the model's predictions for the dev set to dev_predictions.jsonl.
predict_ner_df(dev_df, "dev_predictions.jsonl")

100%|██████████| 65/65 [01:11<00:00,  1.10s/it]


In [44]:
# Write the model's predictions for the test set to test_predictions.jsonl.
predict_ner_df(test_df, "test_predictions.jsonl")

100%|██████████| 65/65 [01:01<00:00,  1.05it/s]
