## Task 1: Load and Preprocess Data

In [3]:
import os
from collections import Counter, defaultdict

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set random seed for reproducibility
# Seeds PyTorch's and NumPy's global generators. Python's `random` module is
# not seeded here; the cell that samples test-set examples seeds it itself.
torch.manual_seed(42)
np.random.seed(42)

# Check device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [5]:
def load_conll_file(filepath):
    """
    Read a CoNLL-2003 style file into parallel token / NER-tag sequences.

    Each data line is expected to hold whitespace-separated columns
    (token, pos, chunk, ner_tag). A blank line or a line beginning with
    "-DOCSTART-" closes the sentence being accumulated. Data lines with
    fewer than four columns are skipped.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        (sentences, tags): two lists of equal length; sentences[i] is the
        list of tokens of the i-th sentence and tags[i] the matching list
        of NER tags (fourth column).
    """
    sentences, tags = [], []
    pending_tokens, pending_tags = [], []

    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # Sentence / document boundary: flush whatever has been collected
            if not stripped or stripped.startswith("-DOCSTART-"):
                if pending_tokens:
                    sentences.append(pending_tokens)
                    tags.append(pending_tags)
                    pending_tokens, pending_tags = [], []
                continue

            columns = stripped.split()
            if len(columns) < 4:
                # Not a full token/pos/chunk/ner row; ignore it
                continue

            pending_tokens.append(columns[0])
            pending_tags.append(columns[3])

    # The file may end without a trailing blank line
    if pending_tokens:
        sentences.append(pending_tokens)
        tags.append(pending_tags)

    return sentences, tags

# Load all splits
print("Loading CoNLL-2003 dataset from local files...")
# The dataset location is machine-specific. Allow it to be overridden through
# the CONLL2003_DIR environment variable so the notebook can run elsewhere;
# the original location stays the default when the variable is not set.
DATA_DIR = os.environ.get("CONLL2003_DIR", "e:/NLP/Lab5/part4/data/conll2003_files")

# Each call returns (list of token lists, list of tag lists) for one split.
train_sentences, train_tags = load_conll_file(f"{DATA_DIR}/train.txt")
val_sentences, val_tags = load_conll_file(f"{DATA_DIR}/valid.txt")
test_sentences, test_tags = load_conll_file(f"{DATA_DIR}/test.txt")

print(f"\nDataset loaded successfully!")
print(f"Train size: {len(train_sentences)} sentences")
print(f"Validation size: {len(val_sentences)} sentences")
print(f"Test size: {len(test_sentences)} sentences")

# Show example
# Prints at most the first 10 tokens/tags of the third training sentence.
print(f"\nExample sentence from training set:")
print(f"Tokens: {' '.join(train_sentences[2][:10])}...")
print(f"Tags: {' '.join(train_tags[2][:10])}...")

Loading CoNLL-2003 dataset from local files...

Dataset loaded successfully!
Train size: 14041 sentences
Validation size: 3250 sentences
Test size: 3453 sentences

Example sentence from training set:
Tokens: BRUSSELS 1996-08-22...
Tags: B-LOC O...


In [6]:
# Analyze tag distribution
# Flatten the per-sentence tag lists of the training split into one list and
# count how often each tag occurs.
all_tags = [tag for sent_tags in train_tags for tag in sent_tags]
tag_counts = Counter(all_tags)

# Report each tag (alphabetical order) with its count and share of all tokens.
print("\nNER Tag Distribution in Training Set:")
for tag, count in sorted(tag_counts.items()):
    print(f"  {tag:<12} {count:>8} ({count/len(all_tags)*100:.2f}%)")


NER Tag Distribution in Training Set:
  B-LOC            7140 (3.51%)
  B-MISC           3438 (1.69%)
  B-ORG            6321 (3.10%)
  B-PER            6600 (3.24%)
  I-LOC            1157 (0.57%)
  I-MISC           1155 (0.57%)
  I-ORG            3704 (1.82%)
  I-PER            4528 (2.22%)
  O              169578 (83.28%)


In [6]:
# Build vocabularies
def build_vocab(sentences, tags):
    """
    Build the word -> index and tag -> index lookup tables.

    Args:
        sentences: iterable of token lists.
        tags: iterable of tag lists.

    Returns:
        (word_to_ix, tag_to_ix). word_to_ix reserves index 0 for "<PAD>" and
        index 1 for "<UNK>"; every other word gets the next free index in
        order of first appearance. tag_to_ix numbers the distinct tags with
        'O' first, then the 'B-' tags, then all remaining tags, each group
        in alphabetical order.
    """
    # Special tokens come first so their indices are fixed
    word_to_ix = {"<PAD>": 0, "<UNK>": 1}
    for sentence in sentences:
        for word in sentence:
            # Only assigns an index the first time a word is seen
            word_to_ix.setdefault(word, len(word_to_ix))

    def tag_rank(tag):
        # Group key first (O < B-* < everything else), tag text as tie-breaker
        if tag == 'O':
            group = 0
        elif tag.startswith('B-'):
            group = 1
        else:
            group = 2
        return group, tag

    unique_tags = {tag for sent_tags in tags for tag in sent_tags}

    tag_to_ix = {}
    for position, tag in enumerate(sorted(unique_tags, key=tag_rank)):
        tag_to_ix[tag] = position

    return word_to_ix, tag_to_ix

# Vocabularies are built from the training split only; words that appear only
# in validation/test data are mapped to <UNK> when the datasets index them.
word_to_ix, tag_to_ix = build_vocab(train_sentences, train_tags)

print(f"\nVocabulary sizes:")
print(f"  Word vocabulary: {len(word_to_ix)} words")
print(f"  Tag vocabulary: {len(tag_to_ix)} tags")

# Create reverse mapping for tags
# ix_to_tag turns predicted class indices back into tag strings.
ix_to_tag = {v: k for k, v in tag_to_ix.items()}
print(f"\nTag to index mapping (sorted):")
for tag, idx in sorted(tag_to_ix.items(), key=lambda x: x[1]):
    print(f"  {idx}: {tag}")


Vocabulary sizes:
  Word vocabulary: 23625 words
  Tag vocabulary: 9 tags

Tag to index mapping (sorted):
  0: O
  1: B-LOC
  2: B-MISC
  3: B-ORG
  4: B-PER
  5: I-LOC
  6: I-MISC
  7: I-ORG
  8: I-PER


## Task 2: Create PyTorch Dataset and DataLoader

In [7]:
class NERDataset(Dataset):
    """
    Dataset of (token-index tensor, tag-index tensor) pairs for NER.

    Sentences and tags are kept as lists of strings and converted to
    index tensors lazily, one item at a time.
    """

    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        """Store the raw sentences/tags and the two lookup tables."""
        self.sentences, self.tags = sentences, tags
        self.word_to_ix, self.tag_to_ix = word_to_ix, tag_to_ix

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Return the idx-th sentence as two 1-D long tensors.

        Words missing from word_to_ix are replaced by the "<UNK>" index;
        a tag missing from tag_to_ix raises KeyError.
        """
        vocab = self.word_to_ix
        word_ids = [vocab.get(word, vocab["<UNK>"]) for word in self.sentences[idx]]
        sentence_indices = torch.tensor(word_ids, dtype=torch.long)

        label_ids = [self.tag_to_ix[tag] for tag in self.tags[idx]]
        tag_indices = torch.tensor(label_ids, dtype=torch.long)

        return sentence_indices, tag_indices

# Create datasets
# All three splits share the lookup tables built from the training data.
train_dataset = NERDataset(train_sentences, train_tags, word_to_ix, tag_to_ix)
val_dataset = NERDataset(val_sentences, val_tags, word_to_ix, tag_to_ix)
test_dataset = NERDataset(test_sentences, test_tags, word_to_ix, tag_to_ix)

print(f"Datasets created:")
print(f"  Train: {len(train_dataset)} sentences")
print(f"  Validation: {len(val_dataset)} sentences")
print(f"  Test: {len(test_dataset)} sentences")

Datasets created:
  Train: 14041 sentences
  Validation: 3250 sentences
  Test: 3453 sentences


In [8]:
def collate_fn(batch):
    """
    Combine (sentence, tags) tensor pairs into two padded batch tensors.

    Sentences are right-padded with the "<PAD>" index taken from the
    module-level word_to_ix; tags are right-padded with -1, the value the
    loss is configured to ignore. Both results are batch-first.
    """
    token_seqs, tag_seqs = zip(*batch)
    pad = nn.utils.rnn.pad_sequence

    # Word positions beyond a sentence's length get the <PAD> index
    sentences_padded = pad(token_seqs, batch_first=True, padding_value=word_to_ix["<PAD>"])

    # Tag positions beyond a sentence's length get -1
    tags_padded = pad(tag_seqs, batch_first=True, padding_value=-1)

    return sentences_padded, tags_padded

# Create DataLoaders with fixed seed for reproducibility
BATCH_SIZE = 32
# Dedicated generator so the training shuffle order is driven by its own seed.
generator = torch.Generator()
generator.manual_seed(42)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    collate_fn=collate_fn,
    generator=generator
)

val_loader = DataLoader(
    val_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=False, 
    collate_fn=collate_fn
)

print(f"\nDataLoaders created with batch size {BATCH_SIZE}")
print(f"  Train batches: {len(train_loader)}")
print(f"  Validation batches: {len(val_loader)}")
print(f"  Test batches: {len(test_loader)}")

# Test the DataLoader
# Pull one batch to check the padded shapes. This draws from the shuffling
# generator above.
sample_batch = next(iter(train_loader))
print(f"\nSample batch shapes:")
print(f"  Sentences: {sample_batch[0].shape}")
print(f"  Tags: {sample_batch[1].shape}")


DataLoaders created with batch size 32
  Train batches: 439
  Validation batches: 102
  Test batches: 108

Sample batch shapes:
  Sentences: torch.Size([32, 43])
  Tags: torch.Size([32, 43])


## Task 3: Build RNN Model

In [9]:
class SimpleRNNForNER(nn.Module):
    """
    Token classifier: embedding -> single-layer RNN -> linear projection.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: size of the RNN hidden state.
        output_size: number of tag classes scored per token.
        padding_idx: vocabulary index treated as padding by the embedding.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, padding_idx=0):
        super().__init__()

        # Word index -> dense vector
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )

        # Recurrent encoder over the embedded sequence (batch-first tensors)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )

        # Per-token projection from hidden state to tag scores
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """
        Score every token of every sequence.

        Args:
            x: long tensor of word indices, shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, output_size) with one
            unnormalised score per tag for each token.
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        vectors = self.embedding(x)

        # -> (batch_size, seq_len, hidden_dim); the final hidden state is unused
        hidden_states, _ = self.rnn(vectors)

        # -> (batch_size, seq_len, output_size)
        return self.fc(hidden_states)

# Model hyperparameters
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
# Vocabulary and output sizes follow directly from the lookup tables.
VOCAB_SIZE = len(word_to_ix)
OUTPUT_SIZE = len(tag_to_ix)

# Initialize model
model = SimpleRNNForNER(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_size=OUTPUT_SIZE,
    padding_idx=word_to_ix["<PAD>"]
).to(device)

# Count parameters
# total_params counts every parameter element; trainable_params only those
# with requires_grad set.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Architecture:")
print(model)
print(f"\nModel Parameters:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")


Model Architecture:
SimpleRNNForNER(
  (embedding): Embedding(23625, 100, padding_idx=0)
  (rnn): RNN(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
)

Model Parameters:
  Total parameters: 2,393,101
  Trainable parameters: 2,393,101


## Task 4: Train Model

In [12]:
# Training setup
LEARNING_RATE = 0.001
NUM_EPOCHS = 5

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Use ignore_index=-1 to ignore padding tokens in loss calculation
# (-1 is the value collate_fn uses to pad the tag sequences.)
criterion = nn.CrossEntropyLoss(ignore_index=-1)

print(f"Training Configuration:")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Optimizer: Adam")
print(f"  Loss function: CrossEntropyLoss (ignore_index=-1)")

Training Configuration:
  Learning rate: 0.001
  Epochs: 5
  Optimizer: Adam
  Loss function: CrossEntropyLoss (ignore_index=-1)


In [13]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """
    Run one optimisation pass over `dataloader`.

    Args:
        model: network mapping (batch, seq_len) indices to per-token scores.
        dataloader: yields (sentences, tags) batch tensors.
        optimizer: optimizer updating the model's parameters.
        criterion: loss applied to flattened scores and flattened tags.
        device: device the batches are moved to.

    Returns:
        The mean of the per-batch loss values over the epoch.
    """
    model.train()
    running_loss = 0

    for batch_tokens, batch_tags in tqdm(dataloader, desc="Training"):
        batch_tokens = batch_tokens.to(device)
        batch_tags = batch_tags.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        logits = model(batch_tokens)

        # Flatten to (batch_size * seq_len, num_classes) vs (batch_size * seq_len,)
        num_classes = logits.shape[-1]
        batch_loss = criterion(logits.view(-1, num_classes), batch_tags.view(-1))

        # Backpropagate, then apply the parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    num_batches = len(dataloader)
    return running_loss / num_batches

# Training history
# Filled once per epoch by the training loop. Re-running this cell resets both
# lists.
train_losses = []
val_accuracies = []

print("\n" + "="*50)
print("Starting training...")
print("="*50)


Starting training...


## Task 5: Evaluate Model

In [16]:
def evaluate(model, dataloader, device):
    """
    Compute token-level accuracy of `model` over `dataloader`.

    Positions whose tag is -1 (padding) are excluded from every statistic.

    Args:
        model: network mapping (batch, seq_len) indices to per-token scores.
        dataloader: iterable of (sentences, tags) batch tensors.
        device: device the batches are moved to.

    Returns:
        (accuracy, all_predictions, all_labels): the fraction of non-padding
        tokens predicted correctly (0 when there are none), plus flat lists
        of predicted and gold tag indices for those tokens.
    """
    model.eval()
    n_correct, n_tokens = 0, 0
    all_predictions, all_labels = [], []

    with torch.no_grad():
        for batch_tokens, batch_tags in dataloader:
            batch_tokens = batch_tokens.to(device)
            batch_tags = batch_tags.to(device)

            # Highest-scoring tag per token
            predicted = model(batch_tokens).argmax(dim=-1)

            # True wherever the position holds a real (non-padding) token
            is_real = batch_tags.ne(-1)

            hits = predicted.eq(batch_tags) & is_real
            n_correct += hits.sum().item()
            n_tokens += is_real.sum().item()

            # Keep the unpadded predictions/labels for later analysis
            all_predictions.extend(predicted[is_real].cpu().numpy())
            all_labels.extend(batch_tags[is_real].cpu().numpy())

    if n_tokens > 0:
        accuracy = n_correct / n_tokens
    else:
        accuracy = 0
    return accuracy, all_predictions, all_labels

# Train the model
# Tracks the best validation accuracy seen so far; the matching weights are
# written to 'best_ner_model.pt' so they can be reloaded after training.
best_val_accuracy = 0

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    print("-" * 50)
    
    # Train
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    train_losses.append(train_loss)
    
    # Evaluate on validation set
    # Only the accuracy is needed here; the per-token outputs are discarded.
    val_accuracy, _, _ = evaluate(model, val_loader, device)
    val_accuracies.append(val_accuracy)
    
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy*100:.2f}%")
    
    # Save best model
    # Strict '>' keeps the earliest epoch when two epochs tie.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_ner_model.pt')
        print(f"New best model saved!")

print("\n" + "="*50)
print("Training completed!")
print(f"Best validation accuracy: {best_val_accuracy*100:.2f}%")
print("="*50)


Epoch 1/5
--------------------------------------------------


Training: 100%|██████████| 439/439 [00:07<00:00, 57.53it/s]



Train Loss: 0.0260
Validation Accuracy: 93.36%
New best model saved!

Epoch 2/5
--------------------------------------------------


Training: 100%|██████████| 439/439 [00:07<00:00, 57.11it/s]
Training: 100%|██████████| 439/439 [00:07<00:00, 57.11it/s]


Train Loss: 0.0203
Validation Accuracy: 93.84%
New best model saved!

Epoch 3/5
--------------------------------------------------


Training: 100%|██████████| 439/439 [00:08<00:00, 54.51it/s]



Train Loss: 0.0156
Validation Accuracy: 93.03%

Epoch 4/5
--------------------------------------------------


Training: 100%|██████████| 439/439 [00:08<00:00, 54.59it/s]



Train Loss: 0.0128
Validation Accuracy: 93.56%

Epoch 5/5
--------------------------------------------------


Training: 100%|██████████| 439/439 [00:08<00:00, 54.12it/s]



Train Loss: 0.0105
Validation Accuracy: 93.41%

Training completed!
Best validation accuracy: 93.84%


In [17]:
# Load best model and evaluate on test set
# map_location remaps the checkpoint's tensors onto the active device, so a
# checkpoint written during a GPU run can still be restored on a CPU-only
# machine (and vice versa).
model.load_state_dict(torch.load('best_ner_model.pt', map_location=device))
# test_predictions / test_labels are flat lists over all non-padding tokens.
test_accuracy, test_predictions, test_labels = evaluate(model, test_loader, device)

print(f"\nTest Set Performance:")
print(f"  Accuracy: {test_accuracy*100:.2f}%")


Test Set Performance:
  Accuracy: 90.87%


In [28]:
# Detailed per-tag analysis
def analyze_per_tag(predictions, labels, ix_to_tag):
    """
    Break token-level accuracy down by gold tag.

    Args:
        predictions: iterable of predicted tag indices.
        labels: iterable of gold tag indices, aligned with `predictions`.
        ix_to_tag: mapping from tag index to tag name.

    Returns:
        (tag_accuracies, tag_total): for every gold tag name that occurs,
        the fraction of its tokens predicted correctly, and the number of
        tokens carrying that gold tag.
    """
    hits_by_tag = defaultdict(int)
    tag_total = defaultdict(int)

    for predicted_ix, gold_ix in zip(predictions, labels):
        gold_name = ix_to_tag[gold_ix]
        tag_total[gold_name] += 1
        if predicted_ix == gold_ix:
            hits_by_tag[gold_name] += 1

    # Per-tag accuracy, in order of first occurrence of each gold tag
    tag_accuracies = {
        name: (hits_by_tag[name] / seen if seen > 0 else 0)
        for name, seen in tag_total.items()
    }

    return tag_accuracies, tag_total

# Analyze test set performance
# NOTE(review): `tag_counts` rebinds the name used earlier for the training-set
# tag Counter; from here on it holds per-tag token counts of the test set.
tag_accuracies, tag_counts = analyze_per_tag(test_predictions, test_labels, ix_to_tag)

print("\nPer-Tag Accuracy on Test Set:")
print(f"{'Tag':<12} {'Count':>8} {'Accuracy':>10}")
print("-" * 32)

# Sort by tag name for better readability
for tag in sorted(tag_accuracies.keys()):
    acc = tag_accuracies[tag]
    count = tag_counts[tag]
    print(f"{tag:<12} {count:>8} {acc*100:>9.2f}%")


Per-Tag Accuracy on Test Set:
Tag             Count   Accuracy
--------------------------------
B-LOC            1668     69.96%
B-MISC            702     57.83%
B-ORG            1661     52.50%
B-PER            1617     74.15%
I-LOC             257     57.59%
I-MISC            216     51.85%
I-ORG             835     60.60%
I-PER            1156     83.82%
O               38323     95.51%


In [29]:
# Calculate precision, recall, F1 for entity types (not token-level)
def get_entities(tags):
    """
    Collect entity spans from a sequence of BIO tag strings.

    A span opens at a 'B-<type>' tag and is extended by each directly
    following 'I-<type>' tag of the same type. Any other tag closes the
    open span. An 'I-' tag with no open span of its type does not open a
    new span.

    Args:
        tags: sequence of tag strings such as 'B-PER', 'I-PER', 'O'.

    Returns:
        List of dicts {'type', 'start', 'end'} in order of appearance;
        'start' and 'end' are inclusive token positions.
    """
    spans = []
    open_span = None

    for position, label in enumerate(tags):
        prefix, kind = label[:2], label[2:]

        # Continuation of the currently open span
        if prefix == 'I-' and open_span is not None and open_span['type'] == kind:
            open_span['end'] = position
            continue

        # Every other tag terminates whatever span is open
        if open_span is not None:
            spans.append(open_span)
            open_span = None

        if prefix == 'B-':
            open_span = {'type': kind, 'start': position, 'end': position}

    # A span that runs to the end of the sequence
    if open_span is not None:
        spans.append(open_span)

    return spans

def calculate_entity_f1(pred_tags_list, true_tags_list):
    """
    Calculate entity-level precision, recall and F1 per entity type.

    A predicted entity counts as correct only when a gold entity with the
    same start, end and type exists in the same sentence.

    Args:
        pred_tags_list: iterable of predicted tag-string sequences.
        true_tags_list: iterable of gold tag-string sequences, aligned
            with `pred_tags_list`.

    Returns:
        Dict keyed by 'PER', 'LOC', 'ORG', 'MISC'; each value is a dict with
        'precision', 'recall', 'f1' (0 when undefined) and 'support'
        (number of gold entities of that type).
    """
    entity_types = ['PER', 'LOC', 'ORG', 'MISC']
    counts = {etype: {'tp': 0, 'fp': 0, 'fn': 0} for etype in entity_types}

    # Single pass over the sentences: entities are extracted once per
    # sequence instead of once per (sequence, entity type) combination.
    for pred_tags, true_tags in zip(pred_tags_list, true_tags_list):
        pred_set = {(e['start'], e['end'], e['type']) for e in get_entities(pred_tags)}
        true_set = {(e['start'], e['end'], e['type']) for e in get_entities(true_tags)}

        outcomes = (
            ('tp', pred_set & true_set),   # predicted and present in gold
            ('fp', pred_set - true_set),   # predicted but not in gold
            ('fn', true_set - pred_set),   # in gold but not predicted
        )
        for bucket, spans in outcomes:
            for _, _, etype in spans:
                # Types outside the reported list are ignored
                if etype in counts:
                    counts[etype][bucket] += 1

    results = {}
    for entity_type in entity_types:
        tp = counts[entity_type]['tp']
        fp = counts[entity_type]['fp']
        fn = counts[entity_type]['fn']

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        results[entity_type] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': tp + fn
        }

    return results

# Reconstruct full tag sequences for test set
# evaluate() only returns flat token lists, so the test set is run through the
# model again here to recover one tag sequence per sentence.
print("\nCalculating entity-level metrics...")
model.eval()
pred_sequences = []
true_sequences = []

with torch.no_grad():
    for sentences, tags in test_loader:
        sentences = sentences.to(device)
        outputs = model(sentences)
        predictions = torch.argmax(outputs, dim=-1)
        
        for pred_seq, true_seq in zip(predictions.cpu().numpy(), tags.cpu().numpy()):
            # Remove padding
            # Padding positions are the ones whose gold tag index is -1.
            valid_idx = true_seq != -1
            pred_tags = [ix_to_tag[p] for p in pred_seq[valid_idx]]
            true_tags = [ix_to_tag[t] for t in true_seq[valid_idx]]
            pred_sequences.append(pred_tags)
            true_sequences.append(true_tags)

# Calculate entity-level metrics
entity_results = calculate_entity_f1(pred_sequences, true_sequences)

print("\nEntity-Level Performance (Test Set):")
print(f"{'Entity':<8} {'Precision':>10} {'Recall':>10} {'F1-Score':>10} {'Support':>10}")
print("-" * 52)

for entity_type in ['PER', 'LOC', 'ORG', 'MISC']:
    metrics = entity_results[entity_type]
    print(f"{entity_type:<8} {metrics['precision']*100:>9.2f}% {metrics['recall']*100:>9.2f}% "
          f"{metrics['f1']*100:>9.2f}% {metrics['support']:>10}")

# Calculate overall metrics
# NOTE(review): this is the support-weighted mean of the per-type F1 scores,
# not a micro-averaged F1 computed from pooled TP/FP/FN counts.
overall_support = sum(r['support'] for r in entity_results.values())
overall_f1 = sum(r['f1'] * r['support'] for r in entity_results.values()) / overall_support if overall_support > 0 else 0

print("-" * 52)
print(f"Overall F1-Score: {overall_f1*100:.2f}%")


Calculating entity-level metrics...

Entity-Level Performance (Test Set):
Entity    Precision     Recall   F1-Score    Support
----------------------------------------------------
PER          43.21%     67.53%     52.70%       1617
LOC          81.50%     66.55%     73.27%       1668
ORG          62.08%     48.40%     54.40%       1661
MISC         63.65%     52.14%     57.32%        702
----------------------------------------------------
Overall F1-Score: 59.85%


In [15]:
# Prediction function for new sentences
def predict_sentence(sentence_text, model, word_to_ix, ix_to_tag, device):
    """
    Predict NER tags for a new sentence.

    Args:
        sentence_text: raw sentence; tokens are obtained by whitespace split.
        model: network mapping (batch, seq_len) indices to per-token scores.
        word_to_ix: word -> index mapping containing an "<UNK>" entry, used
            for tokens missing from the mapping.
        ix_to_tag: tag index -> tag name mapping.
        device: device the input tensor is placed on.

    Returns:
        List of (token, predicted_tag) pairs in sentence order; an empty
        list when the text contains no tokens.
    """
    model.eval()

    # Tokenize (simple whitespace split)
    tokens = sentence_text.split()

    # Empty or whitespace-only text has nothing to tag; return early rather
    # than feeding a zero-length sequence to the model.
    if not tokens:
        return []

    # Convert to indices
    token_indices = torch.tensor([
        word_to_ix.get(token, word_to_ix["<UNK>"]) for token in tokens
    ], dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension

    # Predict
    with torch.no_grad():
        outputs = model(token_indices)
        # Drop the batch dimension again: one predicted index per token
        predictions = torch.argmax(outputs, dim=-1).squeeze(0)

    # Convert predictions to tags
    predicted_tags = [ix_to_tag[pred.item()] for pred in predictions]

    return list(zip(tokens, predicted_tags))

# Test prediction function with diverse real-world examples
print("\nExample Predictions on Custom Sentences:")
print("=" * 70)

# Hand-written sentences; predict_sentence splits them on whitespace, so
# punctuation attached to a word stays part of that token.
test_sentences_custom = [
    # Vietnamese entities
    "VNU University is located in Hanoi",
    "Nguyen Huu Thang works at FPT Corporation in Ho Chi Minh City",
    "The Mekong Delta is in southern Vietnam",
    
    # International entities
    "Apple Inc. was founded by Steve Jobs in California",
    "Barack Obama visited Paris last year",
    "Microsoft headquarters is in Redmond Washington",
    
    # Mixed entities
    "The Amazon river flows through Brazil",
    "Google CEO Sundar Pichai announced new products",
    "The United Nations meeting was held in New York",
    
    # Complex entities
    "President Joe Biden met with Prime Minister of Japan in Tokyo",
    "Tesla and SpaceX are both led by Elon Musk",
    "The World Health Organization issued guidelines for COVID-19",
    
    # Edge cases
    "The Wall Street Journal reported on European markets",
    "Harvard University professor John Smith published research",
    "Samsung and Sony compete in Asian markets"
]

for sent in test_sentences_custom:
    predictions = predict_sentence(sent, model, word_to_ix, ix_to_tag, device)
    
    # Only show sentence if there are predicted entities
    has_entity = any(tag != 'O' for _, tag in predictions)
    if has_entity:
        print(f"\nSentence: {sent}")
        print("Predictions:")
        # Tokens tagged 'O' are omitted from the listing.
        for token, tag in predictions:
            if tag != 'O':
                print(f"  {token:20s} -> {tag}")
    else:
        print(f"\n✗ Sentence: {sent}")
        print("  No entities detected")


Example Predictions on Custom Sentences:

Sentence: VNU University is located in Hanoi
Predictions:
  VNU                  -> B-ORG
  University           -> I-ORG
  Hanoi                -> B-PER

Sentence: Nguyen Huu Thang works at FPT Corporation in Ho Chi Minh City
Predictions:
  Nguyen               -> B-PER
  Huu                  -> I-PER
  Thang                -> I-PER
  Corporation          -> I-ORG
  Chi                  -> I-PER
  City                 -> I-ORG

Sentence: The Mekong Delta is in southern Vietnam
Predictions:
  Mekong               -> B-ORG
  Delta                -> I-ORG
  Vietnam              -> B-MISC

Sentence: Apple Inc. was founded by Steve Jobs in California
Predictions:
  Apple                -> B-ORG
  Inc.                 -> I-ORG
  Steve                -> B-PER
  Jobs                 -> I-PER
  California           -> B-LOC

Sentence: Barack Obama visited Paris last year
Predictions:
  Barack               -> B-PER
  Obama                -> I-PER
  Pa

In [18]:
# Show some real examples from test set
print("\nReal Examples from Test Set:")
print("=" * 70)

# Show 5 examples
# NOTE(review): this import sits mid-notebook rather than in the import cell.
import random
# Fixed seed so the same five sentences are sampled on every run.
random.seed(42)
sample_indices = random.sample(range(len(test_sentences)), 5)

for idx in sample_indices:
    tokens = test_sentences[idx]
    true_tags = test_tags[idx]
    
    # predict_sentence expects raw text, so the tokens are re-joined with
    # spaces and split again inside it.
    predictions = predict_sentence(' '.join(tokens), model, word_to_ix, ix_to_tag, device)
    pred_tags = [tag for _, tag in predictions]
    
    print(f"\nSentence: {' '.join(tokens)}")
    print(f"{'Token':<20s} {'True':<12s} {'Predicted':<12s}")
    print("-" * 44)
    
    # Only rows where the gold tag or the predicted tag is an entity tag.
    for token, true_tag, pred_tag in zip(tokens, true_tags, pred_tags):
        if true_tag != 'O' or pred_tag != 'O':
            match = "OK" if true_tag == pred_tag else "DIFF"
            print(f"{token:<20s} {true_tag:<12s} {pred_tag:<12s} {match}")


Real Examples from Test Set:

Sentence: ( 52.76 / 53.18 )
Token                True         Predicted   
--------------------------------------------
52.76                O            B-PER        DIFF
53.18                O            B-PER        DIFF

Sentence: WESTERN CONFERENCE
Token                True         Predicted   
--------------------------------------------
WESTERN              O            B-MISC       DIFF
CONFERENCE           O            B-PER        DIFF

Sentence: Wasim Akram b Harris 4
Token                True         Predicted   
--------------------------------------------
Wasim                B-PER        B-PER        OK
Akram                I-PER        I-PER        OK
Harris               B-PER        B-PER        OK

Sentence: Mansfield 21 5 9 7 21 22 24
Token                True         Predicted   
--------------------------------------------
Mansfield            B-ORG        B-ORG        OK

Sentence: -- New York Commodities Desk , 212-859-1640
Token  

In [30]:
# Final summary
# Everything printed here is read from variables set by earlier cells
# (hyperparameters, best_val_accuracy, test_accuracy, overall_f1,
# entity_results), so the figures reflect whichever runs of those cells
# happened last in the kernel.
print("\n" + "="*70)
print("FINAL RESULTS SUMMARY")
print("="*70)
print(f"\nModel Architecture: SimpleRNN")
print(f"  Embedding dimension: {EMBEDDING_DIM}")
print(f"  Hidden dimension: {HIDDEN_DIM}")
print(f"  Total parameters: {total_params:,}")
print(f"\nDataset:")
print(f"  Training sentences: {len(train_sentences)}")
print(f"  Validation sentences: {len(val_sentences)}")
print(f"  Test sentences: {len(test_sentences)}")
print(f"\nTraining:")
print(f"  Epochs: {NUM_EPOCHS}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"\nPerformance:")
print(f"  Best validation accuracy: {best_val_accuracy*100:.2f}%")
print(f"  Test accuracy: {test_accuracy*100:.2f}%")
print(f"  Overall F1-score: {overall_f1*100:.2f}%")
print("\nPer-Entity F1-Scores:")
for entity_type in ['PER', 'LOC', 'ORG', 'MISC']:
    f1 = entity_results[entity_type]['f1']
    print(f"  {entity_type}: {f1*100:.2f}%")
print("="*70)


FINAL RESULTS SUMMARY

Model Architecture: SimpleRNN
  Embedding dimension: 100
  Hidden dimension: 128
  Total parameters: 2,393,101

Dataset:
  Training sentences: 14041
  Validation sentences: 3250
  Test sentences: 3453

Training:
  Epochs: 5
  Batch size: 32
  Learning rate: 0.001

Performance:
  Best validation accuracy: 93.11%
  Test accuracy: 90.41%
  Overall F1-score: 59.85%

Per-Entity F1-Scores:
  PER: 52.70%
  LOC: 73.27%
  ORG: 54.40%
  MISC: 57.32%
