### Загрузка данных

In [55]:
# Функция для загрузки данных из файла
def load_data(file_path):
    sentences = []  # List to store sentences

    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence = []
        for line in file:
            line = line.strip()  # Remove leading/trailing whitespaces
            if line:
                parts = line.split()  # Split each line by space
                word, label = parts[0], parts[1]
                current_sentence.append((word, label))
            else:
                if current_sentence:
                    sentences.append(current_sentence)
                current_sentence = []

        if current_sentence:
            sentences.append(current_sentence)

    return sentences

ner_data = load_data("data_token/train.txt")
val_data = load_data("data_token/dev.txt")
test_data = load_data("data_token/test.txt")

ner_data[0][0:3], val_data[0][0:3], test_data[0][0:3]

([('"', 'O'), ('Если', 'O'), ('Миронов', 'B-PER')],
 [('как', 'O'), ('акционерный', 'O'), ('коммерческий', 'O')],
 [('Тогда', 'O'), ('замешанные', 'O'), ('в', 'O')])

### Токенизация

In [56]:
import torch
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

word_to_idx = {'<PAD>': 0} 
label_to_idx = {'<PAD>': 0}

def encode_sentence(sentence):
    word_indices = [word_to_idx[word] for word, _ in sentence]
    label_indices = [label_to_idx[label] for _, label in sentence]
    return word_indices, label_indices

def create_dataloader(data):
    # Токенизация (разбиваем на слова и метки)
    tokenized_data = [[(word, label) for word, label in sentence] for sentence in data]

    # Индексируем слова и метки
    for sentence in tokenized_data:
        for word, label in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)
            if label not in label_to_idx:
                label_to_idx[label] = len(label_to_idx)

    encoded_data = [encode_sentence(sentence) for sentence in tokenized_data]

    # Создаем input и target
    input_sequences = [torch.LongTensor(sentence[0]) for sentence in encoded_data]
    target_sequences = [torch.LongTensor(sentence[1]) for sentence in encoded_data]

    # Приводим к единому размеру
    input_sequences_padded = pad_sequence(input_sequences, batch_first=True, padding_value=0)
    target_sequences_padded = pad_sequence(target_sequences, batch_first=True, padding_value=0)

    # Create DataLoader for batching
    res_data = TensorDataset(input_sequences_padded, target_sequences_padded)

    return res_data

batch_size = 2
data_loader = DataLoader(create_dataloader(ner_data), batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(create_dataloader(val_data), batch_size=batch_size, shuffle=False)
test_data_loader = DataLoader(create_dataloader(test_data), batch_size=batch_size, shuffle=False)



### BiLSTM-CRF

In [57]:
# Зададим архитектуру
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim):
        super(BiLSTMCRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tagset_size = tagset_size

        # Embedding layer
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # BiLSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)

        # Linear layer for CRF
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

        # CRF layer
        self.crf = CRF(tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeds(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats
    
def calculate_f1_score(true_labels, predicted_labels):
    precision = precision_score(true_labels, predicted_labels, average='micro')
    recall = recall_score(true_labels, predicted_labels, average='micro')
    f1 = f1_score(true_labels, predicted_labels, average='micro')
    return precision, recall, f1

# Гиперпараметры
vocab_size = len(word_to_idx)
tagset_size = len(label_to_idx)
embedding_dim = 50
hidden_dim = 100
learning_rate = 0.01

model = BiLSTMCRF(vocab_size, tagset_size, embedding_dim, hidden_dim)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Training
def train_epoch(data_loader, model, optimizer):
    model.train()
    total_loss = 0
    for input_batch, target_batch in data_loader:
        optimizer.zero_grad()
        lstm_feats = model(input_batch)
        loss = -model.crf(lstm_feats, target_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss

# Evaluation
def evaluate(data_loader, model):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for input_batch, target_batch in data_loader:
            lstm_feats = model(input_batch)
            loss = -model.crf(lstm_feats, target_batch)
            total_loss += loss.item()
    return total_loss

# Final evaluation
def evaluate_with_metrics(data_loader, model):
    model.eval()
    true_labels = []
    predicted_labels = []
    total_loss = 0
    with torch.no_grad():
        for input_batch, target_batch in data_loader:
            lstm_feats = model(input_batch)
            loss = -model.crf(lstm_feats, target_batch)
            total_loss += loss.item()

            # Собираем истинные и предсказанные метки
            true_labels.extend(target_batch.view(-1).tolist())
            decoded_labels = model.crf.decode(lstm_feats)
            for labels in decoded_labels:
                predicted_labels.extend(labels)
    precision, recall, f1 = calculate_f1_score(true_labels, predicted_labels)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    return total_loss


num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_epoch(data_loader, model, optimizer)
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}")

    val_loss = evaluate(val_data_loader, model)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss:.4f}")

# Оценка модели
test_loss = evaluate_with_metrics(test_data_loader, model)
print(f"Test Loss: {test_loss:.4f}")

Epoch 1, Train Loss: 78009.4929
Epoch 1, Validation Loss: 21365.7323
Epoch 2, Train Loss: 50848.6247
Epoch 2, Validation Loss: 17165.6112
Epoch 3, Train Loss: 37784.0828
Epoch 3, Validation Loss: 15883.2099
Epoch 4, Train Loss: 29112.0662
Epoch 4, Validation Loss: 15072.8291
Epoch 5, Train Loss: 22117.5215
Epoch 5, Validation Loss: 15609.1726
Epoch 6, Train Loss: 16604.4640
Epoch 6, Validation Loss: 15860.2106
Epoch 7, Train Loss: 12221.9402
Epoch 7, Validation Loss: 16354.0611
Epoch 8, Train Loss: 8379.9462
Epoch 8, Validation Loss: 17177.5422
Epoch 9, Train Loss: 5350.5110
Epoch 9, Validation Loss: 18832.9843
Epoch 10, Train Loss: 3162.5885
Epoch 10, Validation Loss: 19290.1949
Precision: 0.8901
Recall: 0.8901
F1-Score: 0.8901
Test Loss: 19268.0990
