In [None]:
# install any necessary packages
!pip install -U spacy
!pip install spacy_transformers

# Install below packages if error occurs while installing above packages
#!pip install -U pip setuptools
# !pip install typing-extensions==4.6.1

In [None]:
# Change the working directory to the project folder
%cd "/content/drive/MyDrive/Doktorat/new_model"

# Import required libraries and install any necessary packages
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Check the installed version of spaCy
spacy.__version__

# Check GPU information
!nvidia-smi

In [5]:
# Load the annotated data from a JSON file. The context manager closes the
# handle, and the explicit encoding keeps the Polish text independent of the
# platform's default locale.
with open('/content/drive/MyDrive/Doktorat/new_model/annotations_dataset/annotations_weird.json', 'r', encoding='utf-8') as annotations_file:
  cv_data = json.load(annotations_file)

# Display the number of items in the dataset (printed, because only the last
# expression of a cell is rendered automatically)
print(len(cv_data))

# Display the first item in the dataset
cv_data[0]

['BIBLIOGRAFIA l* teoria literat ury OGÓLNE', {'entities': []}]

In [None]:
# Generate config/config.cfg from config/base_config.cfg with spaCy's `init fill-config` command
!python -m spacy init fill-config /content/drive/MyDrive/Doktorat/new_model/config/base_config.cfg /content/drive/MyDrive/Doktorat/new_model/config/config.cfg

In [8]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert annotated examples into a spaCy ``DocBin``.

  Args:
    file: Writable text handle. One line is written for every annotation or
      document that has to be dropped, so nothing is discarded silently.
    data: Iterable of ``(text, {'entities': [(start, end, label), ...]})``
      items, where ``start``/``end`` are character offsets into ``text``.

  Returns:
    A ``DocBin`` with one ``Doc`` per example whose entities could be set.
  """
  # A blank Polish pipeline is enough here: only its tokenizer is used
  nlp = spacy.blank('pl')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already covered by an accepted entity. A set keeps the
    # overlap check cheap even for long texts with many annotations.
    claimed_indices = set()

    # Extract entities from the annotations
    for start, end, label in annot['entities']:
      span_indices = range(start, end)

      # Skip annotations that overlap an entity accepted earlier
      if not claimed_indices.isdisjoint(span_indices):
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as err:
        # Best effort: drop the annotation, but leave a trace in the log.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        file.write(str([start, end]) + "    " + str(text) + "    " + repr(err) + "\n")
        continue

      if span is None:
        # Log errors for annotations that couldn't be processed
        # (offsets do not line up with token boundaries)
        file.write(str([start, end]) + "    " + str(text) + "\n")
        continue

      # Reserve the offsets only once the span is valid, so a rejected
      # annotation cannot block a later, well-formed one
      claimed_indices.update(span_indices)
      ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as err:
      # Keep going with the remaining documents, but record the dropped one
      file.write("document skipped    " + str(text) + "    " + repr(err) + "\n")

  return db

In [12]:
# Split the annotated data into training and testing sets.
# A fixed random_state makes the split (and therefore the reported scores)
# reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
train, test = train_test_split(cv_data, test_size=0.2, random_state=42)

# Display the number of items in the training and testing sets (printed,
# because only the last expression of a cell is rendered automatically)
print(len(train), len(test))

# Open a file to log errors during annotation processing. The context manager
# closes it even if the conversion raises, and the explicit encoding keeps the
# logged Polish text independent of the platform's default locale.
with open('/content/drive/MyDrive/Doktorat/new_model/trained_models/train_file.txt', 'w', encoding='utf-8') as file:
  # Create spaCy DocBin objects for training and testing data
  db = get_spacy_doc(file, train)
  db.to_disk('/content/drive/MyDrive/Doktorat/new_model/trained_models/train_data.spacy')

  db = get_spacy_doc(file, test)
  db.to_disk('/content/drive/MyDrive/Doktorat/new_model/trained_models/test_data.spacy')

100%|██████████| 139/139 [00:00<00:00, 787.11it/s]
100%|██████████| 35/35 [00:00<00:00, 1108.41it/s]


In [14]:
# Train with config/config.cfg on GPU 0: train_data.spacy is the training set,
# test_data.spacy is passed as the dev set, and results are written to trained_models/output
!python -m spacy train /content/drive/MyDrive/Doktorat/new_model/config/config.cfg  --output /content/drive/MyDrive/Doktorat/new_model/trained_models/output  --paths.train /content/drive/MyDrive/Doktorat/new_model/trained_models/train_data.spacy  --paths.dev /content/drive/MyDrive/Doktorat/new_model/trained_models/test_data.spacy --gpu-id 0

2024-01-29 18:56:18.035582: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-29 18:56:18.035746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-29 18:56:18.037824: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Saving to output directory:
/content/drive/MyDrive/Doktorat/new_model/trained_models/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  

In [17]:
# Load the pipeline stored in the model-best folder of the training output directory
nlp = spacy.load('/content/drive/MyDrive/Doktorat/new_model/trained_models/output/model-best')

In [22]:
# Sample text to run through the loaded model
text = "Szczęścia szukamy. Po 11:38-9"

In [23]:
# Run the loaded spaCy pipeline over the sample text
doc = nlp(text)

# Report each predicted entity as its surface text followed by its label
for entity in doc.ents:
  print(entity.text, "  ->>>>  ", entity.label_)

Po   ->>>>   CZASOPISMO
11:38-9   ->>>>   NUMER_CZASOPISMA
