# 03LAB - zadanie
### Miłosz Rolewski

## Przygotowanie danych

In [11]:
import torch
from collections import Counter
from torchtext.vocab import vocab
import csv

SPECIALS = ["<unk>", "<pad>", "<bos>", "<eos>"]

def load_data_tsv(path):
    all_tokens = []
    all_tags = []

    with open(path, "r", encoding="utf-8") as f:
        tsv_reader = csv.reader(f, delimiter="\t")
        for row in tsv_reader:
            if len(row) != 2:
                continue
            tags_str, text_str = row
            tags = tags_str.strip().split(" ")
            tokens = text_str.strip().split(" ")
            # usuwamy tagi </S>
            filtered = [(t, tag) for t, tag in zip(tokens, tags) if t != "</S>"]
            if not filtered:
                continue
            tokens, tags = zip(*filtered)
            all_tokens.append(list(tokens))
            all_tags.append(list(tags))

    return all_tokens, all_tags

def build_vocab(tokens_list):
    counter = Counter()
    for tokens in tokens_list:
        counter.update(tokens)
    v = vocab(counter, specials=SPECIALS)
    v.set_default_index(v["<unk>"])
    return v

def build_label_vocab(tag_list):
    unique_tags = sorted(set(tag for tags in tag_list for tag in tags))
    tag2id = {tag: i for i, tag in enumerate(unique_tags)}
    id2tag = {i: tag for tag, i in tag2id.items()}
    return tag2id, id2tag

def vectorize(tokens_list, tags_list, token_vocab, tag_vocab):
    token_ids = []
    tag_ids = []
    for tokens, tags in zip(tokens_list, tags_list):
        token_vec = [token_vocab["<bos>"]] + [token_vocab[t] for t in tokens] + [token_vocab["<eos>"]]
        tag_vec = [tag_vocab["O"]] + [tag_vocab[t] for t in tags] + [tag_vocab["O"]]  # "O" jako neutralny
        token_ids.append(torch.tensor(token_vec, dtype=torch.long))
        tag_ids.append(torch.tensor(tag_vec, dtype=torch.long))
    return token_ids, tag_ids


In [12]:
tokens, tags = load_data_tsv("en-ner-conll-2003/train/train.tsv")
token_vocab = build_vocab(tokens)
tag2id, id2tag = build_label_vocab(tags)
token_ids, tag_ids = vectorize(tokens, tags, token_vocab, tag2id)

print(tokens[0])
print(tags[0])
print(token_ids[0])
print(tag_ids[0])


['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.', 'Peter', 'Blackburn', 'BRUSSELS', '1996-08-22', 'The', 'European', 'Commission', 'said', 'on', 'Thursday', 'it', 'disagreed', 'with', 'German', 'advice', 'to', 'consumers', 'to', 'shun', 'British', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', 'Germany', "'s", 'representative', 'to', 'the', 'European', 'Union', "'s", 'veterinary', 'committee', 'Werner', 'Zwingmann', 'said', 'on', 'Wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'Britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '"', 'We', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'Commission', "'s", 'chief', 'spokesman', 'Nikolaus', 'van', 'der', 'Pas', 'told', 'a', 'news', 'briefing', '.', 'He', 'said', 'further',

## Model

In [13]:
import torch.nn as nn

class LSTM_NER(nn.Module):
    def __init__(self, vocab_size, num_labels, embedding_dim=100, hidden_dim=256):
        super(LSTM_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        x = self.embedding(x)                   # (batch, seq_len, embed_dim)
        x, _ = self.lstm(x)                     # (batch, seq_len, hidden_dim)
        logits = self.classifier(x)             # (batch, seq_len, num_labels)
        return logits


## Przygotowanie modelu

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTM_NER(vocab_size=len(token_vocab), num_labels=len(tag2id)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())


In [15]:
from tqdm import tqdm

EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for tokens_tensor, tags_tensor in tqdm(zip(token_ids, tag_ids), total=len(tag_ids)):
        tokens_tensor = tokens_tensor.unsqueeze(0).to(device)  # batch = 1
        tags_tensor = tags_tensor.unsqueeze(0).to(device)

        optimizer.zero_grad()
        output = model(tokens_tensor)  # (1, seq_len, num_labels)
        loss = criterion(output.squeeze(0), tags_tensor.squeeze(0))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} loss: {total_loss:.4f}")


100%|██████████| 945/945 [00:16<00:00, 57.20it/s]


Epoch 1 loss: 584.1406


100%|██████████| 945/945 [00:16<00:00, 55.72it/s]


Epoch 2 loss: 308.8680


100%|██████████| 945/945 [00:17<00:00, 55.34it/s]


Epoch 3 loss: 182.4637


100%|██████████| 945/945 [00:16<00:00, 56.11it/s]


Epoch 4 loss: 108.3680


100%|██████████| 945/945 [00:17<00:00, 55.16it/s]

Epoch 5 loss: 59.6965





## Ewaluacja

In [16]:
from seqeval.metrics import classification_report, f1_score

def evaluate_model(model, token_ids, tag_ids, tag_vocab, id2tag):
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for tokens_tensor, tags_tensor in zip(token_ids, tag_ids):
            tokens_tensor = tokens_tensor.unsqueeze(0).to(device)
            tags_tensor = tags_tensor.to(device)

            outputs = model(tokens_tensor)  # (1, seq_len, num_labels)
            predictions = torch.argmax(outputs.squeeze(0), dim=1).cpu().tolist()
            true_tags = tags_tensor.cpu().tolist()

            # Pomijamy <bos> i <eos>
            pred_tags = [id2tag[i] for i in predictions[1:-1]]
            true_tags = [id2tag[i] for i in true_tags[1:-1]]

            y_pred.append(pred_tags)
            y_true.append(true_tags)

    print(classification_report(y_true, y_pred))
    print("F1-score:", f1_score(y_true, y_pred))


In [17]:
evaluate_model(model, token_ids, tag_ids, tag2id, id2tag)


              precision    recall  f1-score   support

         LOC       0.92      0.93      0.93      7139
        MISC       0.82      0.88      0.85      3436
         ORG       0.82      0.87      0.85      6317
         PER       0.92      0.88      0.90      6600

   micro avg       0.88      0.89      0.89     23492
   macro avg       0.87      0.89      0.88     23492
weighted avg       0.88      0.89      0.89     23492

F1-score: 0.8861438405338062
