In [None]:
import spacy;
import json;
import tqdm;
import sklearn;
import text_formatter;

from text_formatter import cleaned_text_getter;
from spacy.tokens import DocBin;
from tqdm import tqdm;
from sklearn.model_selection import train_test_split;

# Load and clean text data
clean_text = cleaned_text_getter()

# Load annotated data from JSON file. The context manager closes the handle
# as soon as parsing is done instead of leaving it open for the rest of the run.
with open("annotations.json", "r", encoding="utf8") as annotations_file:
    data = json.load(annotations_file)

def get_spacy_doc(file, data):
    """
    Converts annotated text data into a spaCy-compatible DocBin object.

    Entities of a sample are processed in the order given. An entity whose
    character range overlaps one seen earlier in the same sample is dropped.
    An entity whose offsets do not yield a span in "strict" alignment mode is
    written to the error log instead of being attached to the document.

    Parameters:
    - file: A writable text file object for logging errors.
    - data: An iterable of (text, annotation) pairs, where annotation is a
      mapping whose "entities" value holds (start, end, label) triples with
      start/end being character offsets into text.

    Returns:
    - doc_bin: A spaCy DocBin object containing all processed documents
      (empty when data is empty).
    """

    nlp = spacy.blank("hr")  # Initialize a blank Croatian NLP model
    doc_bin = DocBin()  # Container for serialized spaCy documents

    for text, anott in tqdm(data):
        doc = nlp.make_doc(text)  # Create a spaCy Doc object
        ents = []
        occupied = set()  # Character positions already claimed by an entity

        for start, end, label in anott["entities"]:
            positions = range(start, end)

            # Skip the entity if any part of it overlaps an earlier entity
            if not occupied.isdisjoint(positions):
                continue

            # Positions are claimed before the span is validated, so an entity
            # that fails below still blocks later overlapping entities.
            occupied.update(positions)

            try:
                # Create a character span for the entity
                span = doc.char_span(start, end, label=label, alignment_mode="strict")
            except Exception as err:
                # Best effort: record the failure and move on to the next entity
                file.write(f"{[start, end]}   {text}   ({type(err).__name__}: {err})\n")
                continue

            if span is None:
                # Log problematic entities for debugging
                file.write(str([start, end]) + "   " + str(text) + "\n")
            else:
                ents.append(span)

        try:
            doc.ents = ents  # Assign entities to the document
            doc_bin.add(doc)  # Add the document to the DocBin container
        except Exception as err:
            # Best effort: the document is left out, but the reason is recorded
            file.write(f"document skipped   {text}   ({type(err).__name__}: {err})\n")

    # Return only after every sample has been processed
    return doc_bin

# Split the dataset into training (80%) and testing (20%)
train, test = train_test_split(data, test_size=0.2)

# Open the annotation error log for writing (the default mode is read-only,
# which makes every file.write() in get_spacy_doc fail). Each run starts with
# a fresh log, and the context manager closes it even if conversion raises.
with open("trenirani_modeli/ispis.txt", "w", encoding="utf8") as file:
    # Convert training data to spaCy format and save it
    doc_bin = get_spacy_doc(file, train)
    doc_bin.to_disk("trenirani_modeli/trenirani_podaci.spacy")

    # Convert testing data to spaCy format and save it
    doc_bin = get_spacy_doc(file, test)
    doc_bin.to_disk("trenirani_modeli/testni_podaci.spacy")

# Command to train the spaCy model using the prepared data
# (it must have been run before the model below can be loaded):
#python -m spacy train konfiguracija.cfg  --output trenirani_modeli/output  --paths.train trenirani_modeli/trenirani_podaci.spacy  --paths.dev trenirani_modeli/testni_podaci.spacy

# Load the trained model
nlp = spacy.load("trenirani_modeli/output/model-best")

# Process the cleaned text with the trained model
doc = nlp(clean_text)

# Print recognized entities along with their explanations
for e in doc.ents:
    print(e.text, e.label_, spacy.explain(e.label_))

  0%|          | 0/18 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s]


Marselj ORG Companies, agencies, institutions, etc.
Jaques ORG Companies, agencies, institutions, etc.
Konsulatu ORG Companies, agencies, institutions, etc.
Često ORG Companies, agencies, institutions, etc.
Beograd LOC Non-GPE locations, mountain ranges, bodies of water
Ministarstvo ORG Companies, agencies, institutions, etc.
Beogradu LOC Non-GPE locations, mountain ranges, bodies of water
Pozdravite LOC Non-GPE locations, mountain ranges, bodies of water
Molim ORG Companies, agencies, institutions, etc.
Putniku LOC Non-GPE locations, mountain ranges, bodies of water
) ORG Companies, agencies, institutions, etc.
Ventimilja ORG Companies, agencies, institutions, etc.
