In [1]:
!pip install transformers torch

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.0 MB/s[0m eta [36m0:00:0

In [16]:
train_data_path = "/content/srWaC_transformed_train.txt"
test_data_path = "/content/srWaC_transformed_test.txt"

def load_data(data_path):
    with open(data_path, "r", encoding="utf-8") as file:
        lines = file.readlines()
    sentences = [line.strip().split() for line in lines]
    return sentences

train_sentences = load_data(train_data_path)
test_sentences = load_data(test_data_path)

In [17]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

# Koristićemo BERT model posebno treniran za srpski jezik
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_sentences(sentences):
    tokenized = []
    for sentence in sentences:
        tokens = []
        labels = []
        for pair in sentence:
            parts = pair.split("/")
            if len(parts) == 2:  # Očekujemo tačno dva dela
                word, label = parts
                tokens.extend(tokenizer.tokenize(word))
                labels.extend([label] + ["Prazno"] * (len(tokenizer.tokenize(word)) - 1))
            else:
                # Ovde možete rukovati redovima koji ne ispunjavaju očekivanu strukturu
                print("Greška: Red ne ispunjava očekivanu strukturu:", pair)
        tokenized.append((tokens, labels))
    return tokenized

train_tokenized = tokenize_sentences(train_sentences)
test_tokenized = tokenize_sentences(test_sentences)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Greška: Red ne ispunjava očekivanu strukturu: http://www.water-energy-food.org//X
Greška: Red ne ispunjava očekivanu strukturu: //Z
Greška: Red ne ispunjava očekivanu strukturu: //Z
Greška: Red ne ispunjava očekivanu strukturu: http://www.ebay.com/itm/Western-Digital-My-Passport-Edge-Mac-500GB-/310650415709?_trksid=p2050601.m2372&amp;_trkparms=aid%3D111000%26algo%3DREC.CURRENT%26ao%3/X


In [18]:
def prepare_data(tokenized_data, tokenizer, max_length):
    input_ids = []
    attention_masks = []
    labels = []

    for tokens, label_ids in tokenized_data:
        # Provera da li su tokens i label_ids prazni
        if not tokens or not label_ids:
            continue

        # Mapiranje nepostojećih oznaka na "Prazno"
        label_ids = [label if label in labels_srWaC else "Prazno" for label in label_ids]

        encoded = tokenizer.encode_plus(tokens, padding="max_length", max_length=max_length, truncation=True, add_special_tokens=True)
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])

        # Dodajte "Prazno" oznake na kraju liste label_ids kako biste dobili uniformnu dužinu
        while len(label_ids) < max_length:
            label_ids.append("Prazno")

        label_ids = [labels_srWaC.index(label) for label in label_ids]
        labels.append(label_ids)

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    dataset = TensorDataset(input_ids, attention_masks, labels)
    return dataset


# Prvo pronađite maksimalnu dužinu tokena
max_token_length = max(len(tokens) for tokens, _ in train_tokenized + test_tokenized)

# Zatim koristite tu maksimalnu dužinu za tokenizaciju i kodiranje
#train_dataset = prepare_data(train_tokenized, tokenizer, max_length=max_token_length)
test_dataset = prepare_data(test_tokenized, tokenizer, max_length=max_token_length)


batch_size = 8
#train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [22]:
import torch
from transformers import BertForTokenClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Putanja do modela na lokalnom računaru
local_model_path = "/content/bert_custom_ner_model.pth"

labels_srWaC = ['AGA', 'AGD', 'AGG', 'AGI', 'AGL', 'AGN', 'AGV',
    'APA', 'APD', 'APG', 'API', 'APL', 'APN', 'APV',
    'ASA', 'ASD', 'ASG', 'ASI', 'ASL', 'ASN',
    'CC', 'CS',
    'I',
    'MC', 'MCA', 'MCD', 'MCG', 'MCI', 'MCL', 'MCN',
    'MM',
    'MO', 'MOA', 'MOD', 'MOG', 'MOI', 'MOL', 'MON',
    'MS', 'MSI', 'MSL', 'MSN',
    'NA', 'ND', 'NG', 'NI', 'NL', 'NN', 'NV',
    'PNA', 'PND', 'PNG', 'PNI', 'PNL', 'PNN',
    'PPA', 'PPD', 'PPG', 'PPI', 'PPL', 'PPN', 'PPV',
    'PXA', 'PXD', 'PXG', 'PXI', 'PXL',
    'QO', 'QQ', 'QR', 'QZ',
    'RG', 'RR', 'RS',
    'SA', 'SD', 'SG', 'SI', 'SL',
    'VAA', 'VAE', 'VAM', 'VAN', 'VAP', 'VAR',
    'VMA', 'VME', 'VMF', 'VMM', 'VMN', 'VMP', 'VMR',
    'X', 'XF', 'Y', 'Z', 'Prazno']

# Učitajte model bez korišćenja pickle_module
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(labels_srWaC))
model.load_state_dict(torch.load(local_model_path))  # Ovde koristite map_location ako je potrebno

# Prebacite model u "evaluation" režim
model.eval()

# Sada možete izvršiti evaluaciju kao što ste to ranije radili
# Na primer, koristeći test podatke i izračunati metrike tačnosti, preciznosti, odziva i F1-skor
# Prvo tokenizujte i pripremite test podatke kao što ste to uradili za treniranje

# Inicijalizujte prazne liste za stvarne i predviđene oznake
true_labels = []
predicted_labels = []

# Iterirajte kroz test dataloader
for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Izlazi modela su u formatu logita, koristite argmax da biste dobili predviđene oznake
    predicted_batch_labels = torch.argmax(outputs.logits, dim=2).cpu().numpy()
    true_batch_labels = labels.cpu().numpy()

    # Dodajte oznake za ovaj batch u liste
    predicted_labels.extend(predicted_batch_labels)
    true_labels.extend(true_batch_labels)

# Pretvorite listu u numpy niz za računanje tačnosti
true_labels = np.concatenate(true_labels, axis=0)
predicted_labels = np.concatenate(predicted_labels, axis=0)

# Izračunajte tačnost
accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Tačnost modela na test skupu podataka: {accuracy * 100:.2f}%")

# Pretvorite predicted_labels u odgovarajući format za izračunavanje metrika


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tačnost modela na test skupu podataka: 95.39%


In [23]:
# Izračunajte metrike kao što su tačnost, preciznost, odziv i F1-skor
import numpy as np

# True labels su true_labels, a predicted labels su predicted_labels iz prethodnog odgovora

# Tačnost
accuracy = accuracy_score(true_labels, predicted_labels)

# Preciznost
precision = precision_score(true_labels, predicted_labels, average='weighted')

# Odziv
recall = recall_score(true_labels, predicted_labels, average='weighted')

# F1-Skor
f1 = f1_score(true_labels, predicted_labels, average='weighted')

# Matrica konfuzije
confusion = confusion_matrix(true_labels, predicted_labels)

print(f"Tačnost: {accuracy}")
print(f"Preciznost: {precision}")
print(f"Odziv: {recall}")
print(f"F1-Skor: {f1}")
print("Matrica konfuzije:")
print(confusion)


Tačnost: 0.9539354838709677
Preciznost: 0.9415873860120019
Odziv: 0.9539354838709677
F1-Skor: 0.9461468696240154
Matrica konfuzije:
[[     0      0      0 ...      0     58      5]
 [     0      0      0 ...      0      5      1]
 [     0      0      0 ...      0     90      8]
 ...
 [     0      0      0 ...      0     19      3]
 [     0      0      0 ...      0    747    100]
 [     0      0      0 ...      0     40 117222]]


  _warn_prf(average, modifier, msg_start, len(result))
