In [None]:
!pip install --quiet datasets transformers seqeval evaluate

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [None]:
!git clone https://github.com/eblict-gigatech/BanNERD.git

Cloning into 'BanNERD'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 36 (delta 11), reused 23 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (36/36), 10.29 MiB | 2.54 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [None]:
# Import libraries
from datasets import Dataset
from evaluate import load as load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import torch
import numpy as np
import os
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Check GPU info
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
if torch.cuda.is_available():
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Disable wandb to avoid errors
os.environ["WANDB_DISABLED"] = "true"

CUDA available: True
GPU device: Tesla T4
GPU memory: 15.8 GB
Using device: cuda


In [None]:
# Function to parse CoNLL format from .txt files
def load_conll_data(filepath):
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as f:
        tokens, tags = [], []
        for line in f:
            line = line.strip()
            if line == '':
                # Empty line indicates end of sentence
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": tags})
                    tokens, tags = [], []
            else:
                # Split by whitespace - token should be first, tag should be last
                parts = line.split()
                if len(parts) >= 2:
                    token = parts[0]
                    ner_tag = parts[-1]  # Last element is the NER tag
                    tokens.append(token)
                    tags.append(ner_tag)

        # Don't forget the last sentence if file doesn't end with empty line
        if tokens:
            sentences.append({"tokens": tokens, "ner_tags": tags})

    return sentences

In [None]:
# Load BanNERD data from .txt files
print("Loading data...")
train_data = load_conll_data('/content/BanNERD/dataset/coNLL_format/train.txt')
val_data = load_conll_data('/content/BanNERD/dataset/coNLL_format/val.txt')
test_data = load_conll_data('/content/BanNERD/dataset/coNLL_format/test.txt')
print("Data loaded successfully")

Loading data...
Data loaded successfully


In [None]:
print(f"Train samples: {len(train_data)}")
print(f"Val samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")

# Check first example
if train_data:
    print("\nFirst training example:")
    print(f"Tokens: {train_data[0]['tokens']}")
    print(f"Tags: {train_data[0]['ner_tags']}")


Train samples: 71844
Val samples: 4059
Test samples: 9272

First training example:
Tokens: ['একেকটি', 'বই', '৭০', 'থেকে', '৯০', 'টাকা', 'বা', 'এর', 'চেয়েও', 'বেশি', 'দামের', '।']
Tags: ['O', 'O', 'B-NUM', 'O', 'B-NUM', 'B-UNIT', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Get all unique labels from the dataset
def get_label_list(datasets):
    all_labels = set()
    for dataset in datasets:
        for example in dataset:
            all_labels.update(example['ner_tags'])
    return sorted(list(all_labels))

label_list = get_label_list([train_data, val_data, test_data])
print(f"\nLabel list: {label_list}")
print(f"Number of labels: {len(label_list)}")


Label list: ['B-D&T', 'B-EVENT', 'B-GPE', 'B-LOC', 'B-MISC', 'B-NUM', 'B-ORG', 'B-PER', 'B-T&T', 'B-UNIT', 'I-D&T', 'I-EVENT', 'I-GPE', 'I-LOC', 'I-MISC', 'I-NUM', 'I-ORG', 'I-PER', 'I-T&T', 'I-UNIT', 'O']
Number of labels: 21


In [None]:
# Create label mappings
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

print(f"Label to ID mapping: {label_to_id}")

Label to ID mapping: {'B-D&T': 0, 'B-EVENT': 1, 'B-GPE': 2, 'B-LOC': 3, 'B-MISC': 4, 'B-NUM': 5, 'B-ORG': 6, 'B-PER': 7, 'B-T&T': 8, 'B-UNIT': 9, 'I-D&T': 10, 'I-EVENT': 11, 'I-GPE': 12, 'I-LOC': 13, 'I-MISC': 14, 'I-NUM': 15, 'I-ORG': 16, 'I-PER': 17, 'I-T&T': 18, 'I-UNIT': 19, 'O': 20}


In [None]:
# Create Hugging Face datasets
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)


In [None]:
# Check label distribution
def check_label_distribution(dataset, name="Dataset"):
    all_labels = []
    for example in dataset:
        all_labels.extend(example['ner_tags'])

    label_counts = pd.Series(all_labels).value_counts()
    print(f"\n=== {name} Label Distribution ===")
    for label, count in label_counts.items():
        print(f"{label:15}: {count:6} ({count/len(all_labels)*100:.2f}%)")
    print(f"Total tokens: {len(all_labels)}")

check_label_distribution(train_dataset, "Training")
check_label_distribution(test_dataset, "Test")


=== Training Label Distribution ===
O              : 643412 (76.42%)
B-PER          :  56062 (6.66%)
B-NUM          :  26827 (3.19%)
B-ORG          :  17501 (2.08%)
I-PER          :  11188 (1.33%)
B-GPE          :  11081 (1.32%)
I-ORG          :  10460 (1.24%)
I-D&T          :  10227 (1.21%)
B-D&T          :   9547 (1.13%)
B-EVENT        :   7564 (0.90%)
B-LOC          :   7439 (0.88%)
B-UNIT         :   6993 (0.83%)
B-MISC         :   4830 (0.57%)
I-NUM          :   4616 (0.55%)
I-LOC          :   4278 (0.51%)
I-EVENT        :   3723 (0.44%)
I-T&T          :   2293 (0.27%)
B-T&T          :   1972 (0.23%)
I-MISC         :   1293 (0.15%)
I-GPE          :    321 (0.04%)
I-UNIT         :    317 (0.04%)
Total tokens: 841944

=== Test Label Distribution ===
O              :  75258 (76.48%)
B-PER          :   6558 (6.66%)
B-NUM          :   3138 (3.19%)
B-ORG          :   2048 (2.08%)
I-PER          :   1309 (1.33%)
B-GPE          :   1297 (1.32%)
I-ORG          :   1224 (1.24%)
B-D&T      

In [None]:
# Initialize tokenizer and model
model_name = "csebuetnlp/banglabert"
print(f"\nLoading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading model: csebuetnlp/banglabert


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Move model to GPU
model = model.to(device)
print(f"Model moved to: {next(model.parameters()).device}")

Model moved to: cuda:0


In [None]:
# Function to tokenize and align labels
def tokenize_and_align_labels(examples):
    # Tokenize the text
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        padding=False,
        is_split_into_words=True,
        max_length=256,
    )

    labels = []
    for i, label_seq in enumerate(examples['ner_tags']):
        # Get word IDs for this example
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []

        for word_idx in word_ids:
            # Set special tokens to -100
            if word_idx is None:
                label_ids.append(-100)
            else:
                # Set the label for each token
                label_ids.append(label_to_id[label_seq[word_idx]])

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Tokenize datasets
print("\nTokenizing datasets...")
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True)


Tokenizing datasets...


Map:   0%|          | 0/71844 [00:00<?, ? examples/s]

Map:   0%|          | 0/4059 [00:00<?, ? examples/s]

Map:   0%|          | 0/9272 [00:00<?, ? examples/s]

In [None]:
# Debug tokenized data
print("\nDebugging tokenized data...")
print(f"Input IDs length in first example: {len(train_tokenized[0]['input_ids'])}")
print(f"Labels length in first example: {len(train_tokenized[0]['labels'])}")



Debugging tokenized data...
Input IDs length in first example: 14
Labels length in first example: 14


In [None]:
# Remove original columns to avoid tensor conversion issues
columns_to_remove = ['tokens', 'ner_tags']
train_tokenized = train_tokenized.remove_columns(columns_to_remove)
val_tokenized = val_tokenized.remove_columns(columns_to_remove)
test_tokenized = test_tokenized.remove_columns(columns_to_remove)

In [None]:
# Load evaluation metric
metric = load_metric("seqeval")

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Define compute_metrics function for evaluation
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Data collator for dynamic padding
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True
)


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./bannerd-ner-model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=100,
    save_total_limit=2,
    report_to="none",
    remove_unused_columns=True,
    dataloader_pin_memory=True,
    fp16=True,
    warmup_ratio=0.1,
)

In [None]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model
print("\nStarting training...")
trainer.train()


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1613,0.160462,0.849576,0.86769,0.858537,0.950095
2,0.1264,0.146401,0.869887,0.877254,0.873555,0.95504
3,0.0913,0.147264,0.867996,0.884428,0.876135,0.955602
4,0.0626,0.166077,0.865667,0.8918,0.87854,0.956547
5,0.0443,0.179095,0.87143,0.890704,0.880962,0.957351


TrainOutput(global_step=22455, training_loss=0.1452851996233882, metrics={'train_runtime': 1890.7403, 'train_samples_per_second': 189.989, 'train_steps_per_second': 11.876, 'total_flos': 4425779868032160.0, 'train_loss': 0.1452851996233882, 'epoch': 5.0})

In [None]:
# Save the model
trainer.save_model("./bannerd-ner-final")
tokenizer.save_pretrained("./bannerd-ner-final")
print("\nModel saved to './bannerd-ner-final'")


Model saved to './bannerd-ner-final'


In [None]:
# COMPREHENSIVE TESTING ON TEST SET
print("\n" + "="*60)
print("COMPREHENSIVE TESTING ON TEST SET")
print("="*60)


COMPREHENSIVE TESTING ON TEST SET


In [None]:
# 1. Basic evaluation using Trainer
print("\n1. Basic Evaluation using Trainer:")
test_results = trainer.evaluate(test_tokenized)
print("=== BASIC TEST RESULTS ===")
for key, value in test_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")


1. Basic Evaluation using Trainer:


=== BASIC TEST RESULTS ===
eval_loss: 0.1885
eval_precision: 0.8603
eval_recall: 0.8871
eval_f1: 0.8735
eval_accuracy: 0.9558
eval_runtime: 12.2996
eval_samples_per_second: 753.8470
eval_steps_per_second: 47.1560
epoch: 5.0000


In [None]:
# 2. Detailed per-class metrics
print("\n2. Detailed Per-Class Metrics:")

def get_detailed_predictions(model, tokenized_dataset):
    """Get detailed predictions for analysis"""
    all_predictions = []
    all_true_labels = []
    all_tokens = []

    model.eval()
    with torch.no_grad():
        for i in range(len(tokenized_dataset)):
            # Get single example
            example = tokenized_dataset[i]
            input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(device)
            attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(device)
            labels = example['labels']

            # Get predictions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2).squeeze(0).cpu().numpy()

            # Convert token IDs to tokens
            tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).cpu().numpy())

            # Filter out special tokens and -100 labels
            for j, (token, pred, true_label) in enumerate(zip(tokens, predictions, labels)):
                if true_label != -100:  # Only consider non-special tokens
                    all_tokens.append(token)
                    all_predictions.append(id_to_label[pred])
                    all_true_labels.append(id_to_label[true_label])

    return all_tokens, all_predictions, all_true_labels


2. Detailed Per-Class Metrics:


In [None]:
# Get detailed predictions
print("Getting detailed predictions...")
test_tokens, test_preds, test_true = get_detailed_predictions(model, test_tokenized)


Getting detailed predictions...


In [None]:
# Print classification report
print("\n=== DETAILED CLASSIFICATION REPORT ===")
print(classification_report(test_true, test_preds, zero_division=0))



=== DETAILED CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

       B-D&T       0.91      0.94      0.93      1256
     B-EVENT       0.77      0.81      0.79      1038
       B-GPE       0.86      0.86      0.86      1492
       B-LOC       0.80      0.83      0.81      1410
      B-MISC       0.78      0.78      0.78       773
       B-NUM       0.95      0.95      0.95      3475
       B-ORG       0.84      0.86      0.85      2641
       B-PER       0.91      0.94      0.92      8703
       B-T&T       0.80      0.84      0.82       346
      B-UNIT       0.93      0.96      0.94       869
       I-D&T       0.94      0.93      0.93      1154
     I-EVENT       0.82      0.78      0.80       499
       I-GPE       0.93      0.78      0.85        51
       I-LOC       0.76      0.83      0.79       647
      I-MISC       0.73      0.58      0.64       217
       I-NUM       0.93      0.92      0.92       562
       I-ORG       0.88      0.88      0.

In [None]:
# 3. Per-class performance
print("\n3. Per-Class Performance:")
unique_labels = sorted(set(test_true + test_preds))
for label in unique_labels:
    if label != 'O':  # Skip 'O' label for clarity
        true_count = test_true.count(label)
        pred_count = test_preds.count(label)
        correct = sum(1 for t, p in zip(test_true, test_preds) if t == label and p == label)
        precision = correct / pred_count if pred_count > 0 else 0
        recall = correct / true_count if true_count > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        print(f"{label:15}: Precision={precision:.3f}, Recall={recall:.3f}, F1={f1:.3f}, Support={true_count}")


3. Per-Class Performance:
B-D&T          : Precision=0.911, Recall=0.940, F1=0.925, Support=1256
B-EVENT        : Precision=0.774, Recall=0.814, F1=0.793, Support=1038
B-GPE          : Precision=0.856, Recall=0.859, F1=0.858, Support=1492
B-LOC          : Precision=0.799, Recall=0.826, F1=0.812, Support=1410
B-MISC         : Precision=0.777, Recall=0.776, F1=0.777, Support=773
B-NUM          : Precision=0.950, Recall=0.953, F1=0.951, Support=3475
B-ORG          : Precision=0.841, Recall=0.860, F1=0.850, Support=2641
B-PER          : Precision=0.908, Recall=0.939, F1=0.923, Support=8703
B-T&T          : Precision=0.802, Recall=0.844, F1=0.823, Support=346
B-UNIT         : Precision=0.926, Recall=0.960, F1=0.942, Support=869
I-D&T          : Precision=0.937, Recall=0.931, F1=0.934, Support=1154
I-EVENT        : Precision=0.825, Recall=0.782, F1=0.802, Support=499
I-GPE          : Precision=0.930, Recall=0.784, F1=0.851, Support=51
I-LOC          : Precision=0.761, Recall=0.825, F1=0.792

In [None]:
# 4. Error Analysis - Most common errors
print("\n4. Error Analysis - Most Common Misclassifications:")
from collections import Counter

errors = []
for true, pred, token in zip(test_true, test_preds, test_tokens):
    if true != pred:
        errors.append((true, pred, token))

error_counts = Counter(errors)
print("Top 10 most common errors:")
for (true, pred, token), count in error_counts.most_common(10):
    print(f"  '{token}' : {true} -> {pred} (count: {count})")


4. Error Analysis - Most Common Misclassifications:
Top 10 most common errors:
  '[UNK]' : O -> B-PER (count: 180)
  '[UNK]' : B-PER -> O (count: 117)
  '[UNK]' : B-NUM -> O (count: 65)
  '[UNK]' : O -> B-GPE (count: 65)
  '[UNK]' : B-ORG -> O (count: 60)
  '[UNK]' : B-GPE -> O (count: 58)
  '[UNK]' : O -> B-ORG (count: 57)
  '[UNK]' : O -> B-EVENT (count: 42)
  '[UNK]' : O -> B-LOC (count: 40)
  'সাহেব' : I-PER -> B-PER (count: 38)


In [None]:
# 5. Sample predictions from test set
print("\n5. Sample Predictions from Test Set:")
def show_sample_predictions(model, tokenized_dataset, original_dataset, num_samples=3):
    model.eval()
    with torch.no_grad():
        for i in range(min(num_samples, len(tokenized_dataset))):
            print(f"\n--- Sample {i+1} ---")

            # Get original data
            original_tokens = original_dataset[i]['tokens']
            original_tags = original_dataset[i]['ner_tags']

            # Get model prediction
            example = tokenized_dataset[i]
            input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(device)
            attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2).squeeze(0).cpu().numpy()

            # Get word-aligned predictions
            word_ids = tokenized_dataset[i]['word_ids'] if 'word_ids' in tokenized_dataset[i] else None
            if word_ids is None:
                # Recreate word_ids
                word_ids = tokenizer(original_tokens, is_split_into_words=True).word_ids()

            # Map predictions back to words
            word_predictions = {}
            current_word_idx = None
            for j, (word_idx, pred_id) in enumerate(zip(word_ids, predictions)):
                if word_idx is not None and word_idx != current_word_idx:
                    word_predictions[word_idx] = id_to_label[pred_id]
                    current_word_idx = word_idx

            # Display results
            print("Text:", " ".join(original_tokens))
            print("True :", " ".join(f"{token}/{tag}" for token, tag in zip(original_tokens, original_tags)))
            print("Pred :", " ".join(f"{token}/{word_predictions.get(j, 'O')}" for j, token in enumerate(original_tokens)))

show_sample_predictions(model, test_tokenized, test_dataset)


5. Sample Predictions from Test Set:

--- Sample 1 ---
Text: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।
True : সে/O অনুযায়ী/O শিক্ষকদের/B-PER প্রশিক্ষণের/O ব্যবস্থা/O করা/O হচ্ছে/O ।/O
Pred : সে/O অনুযায়ী/O শিক্ষকদের/B-PER প্রশিক্ষণের/O ব্যবস্থা/O করা/O হচ্ছে/O ।/O

--- Sample 2 ---
Text: স্বাধীন বাংলাদেশে মাথাপিছু আয় ছিলো ৮৮ ডলার ।
True : স্বাধীন/O বাংলাদেশে/B-GPE মাথাপিছু/O আয়/O ছিলো/O ৮৮/B-NUM ডলার/B-UNIT ।/O
Pred : স্বাধীন/O বাংলাদেশে/B-GPE মাথাপিছু/O আয়/O ছিলো/O ৮৮/B-NUM ডলার/B-UNIT ।/O

--- Sample 3 ---
Text: তিনি দাবি করেন , পশ্চিমাদের ষড়যন্ত্রের কারণে মস্কো নর্ড স্ট্রিমের মাধ্যমে গ্যাস সরবরাহ বন্ধ করতে বাধ্য হয় ।
True : তিনি/O দাবি/O করেন/O ,/O পশ্চিমাদের/B-PER ষড়যন্ত্রের/O কারণে/O মস্কো/B-GPE নর্ড/O স্ট্রিমের/O মাধ্যমে/O গ্যাস/O সরবরাহ/O বন্ধ/O করতে/O বাধ্য/O হয়/O ।/O
Pred : তিনি/O দাবি/O করেন/O ,/O পশ্চিমাদের/B-PER ষড়যন্ত্রের/O কারণে/O মস্কো/B-GPE নর্ড/B-LOC স্ট্রিমের/I-LOC মাধ্যমে/O গ্যাস/O সরবরাহ/O বন্ধ/O করতে/O বাধ্য/O হয়/O ।/O


In [None]:
# 6. Performance on different entity types
print("\n6. Performance by Entity Type:")

def get_entity_performance(true_labels, pred_labels):
    entity_types = set([label[2:] for label in true_labels + pred_labels if label != 'O'])
    results = {}

    for entity in entity_types:
        b_entity = f"B-{entity}"
        i_entity = f"I-{entity}"

        # Count true positives, false positives, false negatives
        tp = sum(1 for t, p in zip(true_labels, pred_labels) if t in [b_entity, i_entity] and p in [b_entity, i_entity])
        fp = sum(1 for t, p in zip(true_labels, pred_labels) if t not in [b_entity, i_entity] and p in [b_entity, i_entity])
        fn = sum(1 for t, p in zip(true_labels, pred_labels) if t in [b_entity, i_entity] and p not in [b_entity, i_entity])

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        results[entity] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': tp + fn
        }

    return results

entity_results = get_entity_performance(test_true, test_preds)
for entity, metrics in entity_results.items():
    print(f"{entity:15}: Precision={metrics['precision']:.3f}, Recall={metrics['recall']:.3f}, F1={metrics['f1']:.3f}, Support={metrics['support']}")


6. Performance by Entity Type:
LOC            : Precision=0.806, Recall=0.846, F1=0.825, Support=2057
GPE            : Precision=0.859, Recall=0.857, F1=0.858, Support=1543
PER            : Precision=0.937, Recall=0.956, F1=0.946, Support=10349
MISC           : Precision=0.789, Recall=0.752, F1=0.770, Support=990
T&T            : Precision=0.803, Recall=0.840, F1=0.821, Support=749
ORG            : Precision=0.892, Recall=0.906, F1=0.899, Support=4112
D&T            : Precision=0.941, Recall=0.954, F1=0.947, Support=2410
UNIT           : Precision=0.932, Recall=0.967, F1=0.949, Support=912
EVENT          : Precision=0.828, Recall=0.843, F1=0.836, Support=1537
NUM            : Precision=0.960, Recall=0.960, F1=0.960, Support=4037


In [None]:
# 7. Test with custom sentences
print("\n7. Testing with Custom Sentences:")
def predict_custom_sentences(sentences, model, tokenizer):
    model.eval()
    for sentence in sentences:
        print(f"\nInput: {sentence}")

        # Tokenize
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=256)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = torch.argmax(outputs.logits, dim=2)
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        predicted_labels = [id_to_label[pred_id] for pred_id in predictions[0].cpu().numpy()]

        # Display results
        result = []
        for token, label in zip(tokens, predicted_labels):
            if token not in [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]:
                result.append(f"{token}/{label}")

        print("Output:", " ".join(result))

custom_sentences = [
    "ঢাকা বাংলাদেশের রাজধানী শহর",
    "শেখ মুজিবুর রহমান বাংলাদেশের প্রতিষ্ঠাতা",
    "পদ্মা নদী বাংলাদেশের প্রধান নদী",
    "বাংলাদেশ ব্যাংক দেশের কেন্দ্রীয় ব্যাংক"
]

predict_custom_sentences(custom_sentences, model, tokenizer)

print("\n" + "="*60)
print("TESTING COMPLETED SUCCESSFULLY!")
print("="*60)


7. Testing with Custom Sentences:

Input: ঢাকা বাংলাদেশের রাজধানী শহর
Output: ঢাকা/B-GPE বাংলাদেশের/B-GPE রাজধানী/O শহর/O

Input: শেখ মুজিবুর রহমান বাংলাদেশের প্রতিষ্ঠাতা
Output: শেখ/B-PER মুজিবুর/I-PER রহমান/I-PER বাংলাদেশের/B-GPE প্রতিষ্ঠাতা/B-PER

Input: পদ্মা নদী বাংলাদেশের প্রধান নদী
Output: পদ্মা/B-LOC নদী/I-LOC বাংলাদেশের/B-GPE প্রধান/O নদী/O

Input: বাংলাদেশ ব্যাংক দেশের কেন্দ্রীয় ব্যাংক
Output: বাংলাদেশ/B-ORG ব্যাংক/I-ORG দেশের/O কেন্দ্রীয়/B-ORG ব্যাংক/I-ORG

TESTING COMPLETED SUCCESSFULLY!


In [None]:
print("=== ADVERSARIAL ATTACKS ON NER MODEL ===")


=== ADVERSARIAL ATTACKS ON NER MODEL ===


In [None]:
# Install additional packages for adversarial attacks
!pip install --quiet textattack adversarial-robustness-toolbox

# Import libraries for adversarial attacks
import textattack
from textattack import Attack
from textattack.attack_recipes import BAEGarg2019, TextFoolerJin2019, DeepWordBugGao2018
from textattack.datasets import Dataset as TextAttackDataset
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.attack_results import SuccessfulAttackResult
import numpy as np
from typing import List
import random

print(f"Library imported successfully")

Library imported successfully


In [None]:
print("=== ADVERSARIAL ATTACKS ON NER MODEL ===")

# First, let's load our trained model for testing
model_path = "./bannerd-ner-final"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = model.to(device)

print("Model loaded for adversarial testing")

=== ADVERSARIAL ATTACKS ON NER MODEL ===
Model loaded for adversarial testing


In [None]:
# Function to evaluate model performance
def evaluate_model_performance(model, tokenized_dataset, description="Model"):
    """Comprehensive evaluation of model performance"""
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    results = trainer.evaluate(tokenized_dataset)
    print(f"\n=== {description} PERFORMANCE ===")
    for key, value in results.items():
        if isinstance(value, float):
            print(f"{key}: {value:.4f}")

    return results


# Get baseline performance
print("Getting baseline performance...")
baseline_results = evaluate_model_performance(model, test_tokenized, "BASELINE")

Getting baseline performance...



=== BASELINE PERFORMANCE ===
eval_loss: 0.1885
eval_model_preparation_time: 0.0045
eval_precision: 0.8603
eval_recall: 0.8871
eval_f1: 0.8735
eval_accuracy: 0.9558
eval_runtime: 26.6958
eval_samples_per_second: 347.3200
eval_steps_per_second: 21.7260


In [None]:
# 1. CHARACTER-LEVEL ATTACKS (Evasive/Invasive)
print("\n" + "="*60)
print("1. CHARACTER-LEVEL ADVERSARIAL ATTACKS")
print("="*60)

class CharacterLevelAttack:
    """Character-level adversarial attacks on Bengali text"""

    def __init__(self):
        # Bengali character substitutions that look similar
        self.bengali_subs = {
            'া': ['ি', 'ী', 'ু', 'ূ'],  # aa vowel
            'ি': ['া', 'ী', 'ু'],       # i vowel
            'ী': ['ি', 'া'],            # ii vowel
            'ে': ['ৈ', 'ো'],            # e vowel
            'ো': ['ে', 'ৌ'],            # o vowel
            'র': ['ড়', 'ঢ়'],          # ra
            'ন': ['ণ'],                 # na
            'শ': ['ষ', 'স'],            # sha
            'ব': ['ভ'],                 # ba
            'দ': ['ড'],                 # da
            'জ': ['য'],                 # ja
        }

    def random_character_swap(self, text, swap_prob=0.1):
        """Randomly swap similar-looking Bengali characters"""
        chars = list(text)
        for i in range(len(chars)):
            if random.random() < swap_prob and chars[i] in self.bengali_subs:
                substitutes = self.bengali_subs[chars[i]]
                chars[i] = random.choice(substitutes)
        return ''.join(chars)

    def add_diacritics(self, text, add_prob=0.05):
        """Add random diacritics to confuse the model"""
        diacritics = ['়', '্', 'ঁ']  # Various Bengali diacritics
        chars = list(text)
        result = []
        for char in chars:
            result.append(char)
            if random.random() < add_prob and char not in [' ', '.', ',', '!', '?']:
                result.append(random.choice(diacritics))
        return ''.join(result)

    def remove_spaces(self, text, remove_prob=0.1):
        """Remove spaces between words"""
        words = text.split()
        result = []
        for word in words:
            result.append(word)
            if random.random() < remove_prob and len(result) > 1:
                # Merge with previous word
                result[-2] = result[-2] + result[-1]
                result.pop()
        return ' '.join(result)

# Test character-level attacks
char_attack = CharacterLevelAttack()


1. CHARACTER-LEVEL ADVERSARIAL ATTACKS


In [None]:
def apply_character_attack_to_dataset(dataset, attack_type='swap', intensity=0.1):
    """Apply character-level attacks to the entire dataset"""
    attacked_data = []

    for example in dataset:
        tokens = example['tokens']
        attacked_tokens = []

        for token in tokens:
            if attack_type == 'swap':
                attacked_token = char_attack.random_character_swap(token, intensity)
            elif attack_type == 'diacritic':
                attacked_token = char_attack.add_diacritics(token, intensity)
            elif attack_type == 'space':
                # For space removal, we need to process the whole sentence
                sentence = " ".join(tokens)
                attacked_sentence = char_attack.remove_spaces(sentence, intensity)
                attacked_tokens = attacked_sentence.split()
                break  # Break after processing the whole sentence
            else:
                attacked_token = token

            if attack_type != 'space':
                attacked_tokens.append(attacked_token)

        attacked_data.append({
            'tokens': attacked_tokens,
            'ner_tags': example['ner_tags']
        })

    return attacked_data

In [None]:
# Convert test_data to proper format if needed
if isinstance(test_data, Dataset):
    test_data_list = [{'tokens': example['tokens'], 'ner_tags': example['ner_tags']} for example in test_data]
else:
    test_data_list = test_data


In [None]:
# Test different character-level attacks
print("\nTesting Character-Level Attacks...")

character_attacks = [
    ('Random Character Swap', 'swap', 0.15),
    ('Add Diacritics', 'diacritic', 0.1),
    ('Remove Spaces', 'space', 0.2),
]

character_attack_results = {}

for attack_name, attack_type, intensity in character_attacks:
    print(f"\n--- {attack_name} (Intensity: {intensity}) ---")

    # Create attacked dataset
    attacked_test_data = apply_character_attack_to_dataset(test_data_list, attack_type, intensity)

    # Show sample of attacked text
    sample_idx = 0
    original_tokens = test_data_list[sample_idx]['tokens']
    attacked_tokens = attacked_test_data[sample_idx]['tokens']

    print("Original:", " ".join(original_tokens[:8]))
    print("Attacked:", " ".join(attacked_tokens[:8]))

    # Create Hugging Face Dataset from attacked data
    attacked_dataset = Dataset.from_list(attacked_test_data)

    # Tokenize attacked data
    attacked_tokenized = attacked_dataset.map(tokenize_and_align_labels, batched=True)
    attacked_tokenized = attacked_tokenized.remove_columns(['tokens', 'ner_tags'])

    # Evaluate performance
    attack_results = evaluate_model_performance(model, attacked_tokenized, f"{attack_name.upper()} ATTACK")
    character_attack_results[attack_name] = attack_results



Testing Character-Level Attacks...

--- Random Character Swap (Intensity: 0.15) ---
Original: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।
Attacked: সো অণুযূয়ি শাক্ষকদের প্রশিক্ষণেঢ় ব্যবস্থা কড়া হচ্ছে ।


Map:   0%|          | 0/9272 [00:00<?, ? examples/s]


=== RANDOM CHARACTER SWAP ATTACK PERFORMANCE ===
eval_loss: 0.6209
eval_model_preparation_time: 0.0026
eval_precision: 0.7119
eval_recall: 0.6719
eval_f1: 0.6913
eval_accuracy: 0.8800
eval_runtime: 28.8121
eval_samples_per_second: 321.8090
eval_steps_per_second: 20.1300

--- Add Diacritics (Intensity: 0.1) ---
Original: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।
Attacked: সে্ অনুযায়ী় শিক়্ষকদের প্রশিক্ষণের ব্্যবস্থা করা় হচ্ছে ।


Map:   0%|          | 0/9272 [00:00<?, ? examples/s]


=== ADD DIACRITICS ATTACK PERFORMANCE ===
eval_loss: 0.7012
eval_model_preparation_time: 0.0069
eval_precision: 0.7623
eval_recall: 0.6090
eval_f1: 0.6770
eval_accuracy: 0.8755
eval_runtime: 37.0949
eval_samples_per_second: 249.9540
eval_steps_per_second: 15.6360

--- Remove Spaces (Intensity: 0.2) ---
Original: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।
Attacked: সেঅনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে।


Map:   0%|          | 0/9272 [00:00<?, ? examples/s]


=== REMOVE SPACES ATTACK PERFORMANCE ===
eval_loss: 1.8957
eval_model_preparation_time: 0.0044
eval_precision: 0.4356
eval_recall: 0.4070
eval_f1: 0.4208
eval_accuracy: 0.7532
eval_runtime: 29.3855
eval_samples_per_second: 315.5300
eval_steps_per_second: 19.7380


In [None]:
# 2. WORD-LEVEL ATTACKS (Evasive)
print("\n" + "="*60)
print("2. WORD-LEVEL ADVERSARIAL ATTACKS")
print("="*60)


2. WORD-LEVEL ADVERSARIAL ATTACKS


In [None]:
class WordLevelAttack:
    """Word-level adversarial attacks"""

    def __init__(self):
        # Bengali synonyms (simplified - in practice, you'd need a proper thesaurus)
        self.synonyms = {
            'বাংলাদেশ': ['ভারত', 'পাকিস্তান', 'দেশ'],
            'ঢাকা': ['চট্টগ্রাম', 'সিলেট', 'শহর'],
            'মানুষ': ['ব্যক্তি', 'লোক', 'ব্যক্তিত্ব'],
            'বই': ['গ্রন্থ', 'পুস্তক', 'কিতাব'],
            'স্কুল': ['বিদ্যালয়', 'শিক্ষায়তন', 'মাদ্রাসা'],
            'মুজিব': ['রহমান', 'নেতা', 'নায়ক'],
            'রহমান': ['মুজিব', 'নেতা', 'ব্যক্তি'],
        }

    def synonym_replacement(self, tokens, replace_prob=0.2):
        """Replace words with synonyms"""
        attacked_tokens = []
        for token in tokens:
            if token in self.synonyms and random.random() < replace_prob:
                attacked_tokens.append(random.choice(self.synonyms[token]))
            else:
                attacked_tokens.append(token)
        return attacked_tokens

    def insert_irrelevant_words(self, tokens, insert_prob=0.1):
        """Insert irrelevant words to confuse the model"""
        irrelevant_words = ['এবং', 'কিন্তু', 'অথবা', 'যে', 'এই', 'একটি']
        attacked_tokens = []

        for token in tokens:
            attacked_tokens.append(token)
            if random.random() < insert_prob:
                attacked_tokens.append(random.choice(irrelevant_words))

        return attacked_tokens

    def entity_obfuscation(self, tokens, tags, obfuscate_prob=0.3):
        """Specifically target entities for obfuscation"""
        attacked_tokens = tokens.copy()

        for i, (token, tag) in enumerate(zip(tokens, tags)):
            if tag != 'O' and random.random() < obfuscate_prob:
                # Replace entity with similar but different word
                if 'PER' in tag:
                    attacked_tokens[i] = 'ব্যক্তি'
                elif 'LOC' in tag:
                    attacked_tokens[i] = 'স্থান'
                elif 'ORG' in tag:
                    attacked_tokens[i] = 'সংস্থা'

        return attacked_tokens


In [None]:
# Test word-level attacks
word_attack = WordLevelAttack()

def apply_word_attack_to_dataset(dataset, attack_type='synonym', intensity=0.2):
    """Apply word-level attacks to the entire dataset"""
    attacked_data = []

    for example in dataset:
        tokens = example['tokens']
        tags = example['ner_tags']

        if attack_type == 'synonym':
            attacked_tokens = word_attack.synonym_replacement(tokens, intensity)
        elif attack_type == 'insert':
            attacked_tokens = word_attack.insert_irrelevant_words(tokens, intensity)
        elif attack_type == 'obfuscation':
            attacked_tokens = word_attack.entity_obfuscation(tokens, tags, intensity)
        else:
            attacked_tokens = tokens

        attacked_data.append({
            'tokens': attacked_tokens,
            'ner_tags': tags
        })

    return attacked_data


In [None]:
# Test different word-level attacks
print("\nTesting Word-Level Attacks...")

word_attacks = [
    ('Synonym Replacement', 'synonym', 0.25),
    ('Insert Irrelevant Words', 'insert', 0.15),
    ('Entity Obfuscation', 'obfuscation', 0.3),
]

word_attack_results = {}

for attack_name, attack_type, intensity in word_attacks:
    print(f"\n--- {attack_name} (Intensity: {intensity}) ---")

    # Create attacked dataset
    attacked_test_data = apply_word_attack_to_dataset(test_data_list, attack_type, intensity)

    # Show sample
    sample_idx = 0
    original_tokens = test_data_list[sample_idx]['tokens']
    attacked_tokens = attacked_test_data[sample_idx]['tokens']

    print("Original:", " ".join(original_tokens[:8]))
    print("Attacked:", " ".join(attacked_tokens[:8]))

    # Create Hugging Face Dataset and tokenize
    attacked_dataset = Dataset.from_list(attacked_test_data)
    attacked_tokenized = attacked_dataset.map(tokenize_and_align_labels, batched=True)
    attacked_tokenized = attacked_tokenized.remove_columns(['tokens', 'ner_tags'])

    # Evaluate performance
    attack_results = evaluate_model_performance(model, attacked_tokenized, f"{attack_name.upper()} ATTACK")
    word_attack_results[attack_name] = attack_results


Testing Word-Level Attacks...

--- Synonym Replacement (Intensity: 0.25) ---
Original: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।
Attacked: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।


Map:   0%|          | 0/9272 [00:00<?, ? examples/s]


=== SYNONYM REPLACEMENT ATTACK PERFORMANCE ===
eval_loss: 0.1926
eval_model_preparation_time: 0.0065
eval_precision: 0.8595
eval_recall: 0.8853
eval_f1: 0.8722
eval_accuracy: 0.9553
eval_runtime: 30.6125
eval_samples_per_second: 302.8830
eval_steps_per_second: 18.9470

--- Insert Irrelevant Words (Intensity: 0.15) ---
Original: সে অনুযায়ী শিক্ষকদের প্রশিক্ষণের ব্যবস্থা করা হচ্ছে ।
Attacked: সে অনুযায়ী অথবা শিক্ষকদের প্রশিক্ষণের এবং ব্যবস্থা করা


Map:   0%|          | 0/9272 [00:00<?, ? examples/s]

IndexError: list index out of range

In [None]:
# 3. NOISE INJECTION ATTACKS (Evasive)
print("\n" + "="*60)
print("3. NOISE INJECTION ADVERSARIAL ATTACKS")
print("="*60)

class NoiseInjectionAttack:
    """Inject various types of noise to degrade model performance"""

    def __init__(self):
        self.common_words = ['এবং', 'কিন্তু', 'যে', 'এই', 'একটি', 'হয়', 'করে', 'নেই']

    def random_deletion(self, tokens, deletion_prob=0.1):
        """Randomly delete tokens"""
        return [token for token in tokens if random.random() > deletion_prob]

    def random_swap(self, tokens, swap_prob=0.1):
        """Randomly swap adjacent tokens"""
        if len(tokens) < 2:
            return tokens

        attacked_tokens = tokens.copy()
        for i in range(len(attacked_tokens) - 1):
            if random.random() < swap_prob:
                attacked_tokens[i], attacked_tokens[i + 1] = attacked_tokens[i + 1], attacked_tokens[i]
        return attacked_tokens

    def random_insertion(self, tokens, insertion_prob=0.1):
        """Randomly insert common words"""
        attacked_tokens = []
        for token in tokens:
            attacked_tokens.append(token)
            if random.random() < insertion_prob:
                attacked_tokens.append(random.choice(self.common_words))
        return attacked_tokens


In [None]:
# Test noise injection attacks
noise_attack = NoiseInjectionAttack()

def apply_noise_attack_to_dataset(dataset, attack_type='deletion', intensity=0.1):
    """Apply noise injection attacks to the entire dataset"""
    attacked_data = []

    for example in dataset:
        tokens = example['tokens']
        tags = example['ner_tags']

        if attack_type == 'deletion':
            attacked_tokens = noise_attack.random_deletion(tokens, intensity)
            # Adjust tags to match remaining tokens
            attacked_tags = [tag for i, tag in enumerate(tags) if random.random() > intensity]
        elif attack_type == 'swap':
            attacked_tokens = noise_attack.random_swap(tokens, intensity)
            attacked_tags = tags  # Tags remain the same, just reordered
        elif attack_type == 'insertion':
            attacked_tokens = noise_attack.random_insertion(tokens, intensity)
            # For insertion, we need to adjust tags (insert 'O' for new tokens)
            attacked_tags = []
            for i, tag in enumerate(tags):
                attacked_tags.append(tag)
                if random.random() < intensity:
                    attacked_tags.append('O')
        else:
            attacked_tokens = tokens
            attacked_tags = tags

        attacked_data.append({
            'tokens': attacked_tokens,
            'ner_tags': attacked_tags
        })

    return attacked_data

In [None]:
# Test different noise injection attacks
print("\nTesting Noise Injection Attacks...")

noise_attacks = [
    ('Random Deletion', 'deletion', 0.15),
    ('Random Swap', 'swap', 0.2),
    ('Random Insertion', 'insertion', 0.1),
]

noise_attack_results = {}

for attack_name, attack_type, intensity in noise_attacks:
    print(f"\n--- {attack_name} (Intensity: {intensity}) ---")

    # Create attacked dataset
    attacked_test_data = apply_noise_attack_to_dataset(test_data_list, attack_type, intensity)

    # Show sample
    sample_idx = 0
    original_tokens = test_data_list[sample_idx]['tokens']
    attacked_tokens = attacked_test_data[sample_idx]['tokens']

    print("Original:", " ".join(original_tokens[:8]))
    print("Attacked:", " ".join(attacked_tokens[:8]))

    # Create Hugging Face Dataset and tokenize
    attacked_dataset = Dataset.from_list(attacked_test_data)
    attacked_tokenized = attacked_dataset.map(tokenize_and_align_labels, batched=True)
    attacked_tokenized = attacked_tokenized.remove_columns(['tokens', 'ner_tags'])

    # Evaluate performance
    attack_results = evaluate_model_performance(model, attacked_tokenized, f"{attack_name.upper()} ATTACK")
    noise_attack_results[attack_name] = attack_results

In [None]:
# 4. ADVERSARIAL ATTACK ANALYSIS AND COMPARISON
print("\n" + "="*60)
print("4. ADVERSARIAL ATTACK ANALYSIS AND COMPARISON")
print("="*60)


In [None]:
def calculate_performance_degradation(baseline_results, attack_results, metric='eval_f1'):
    """Calculate performance degradation due to attacks"""
    baseline_score = baseline_results.get(metric, 0)
    attack_score = attack_results.get(metric, 0)

    if baseline_score > 0:
        degradation = (baseline_score - attack_score) / baseline_score * 100
        return degradation
    return 0

# Analyze all attacks
print("\n=== PERFORMANCE DEGRADATION ANALYSIS ===")
print(f"Baseline F1 Score: {baseline_results.get('eval_f1', 0):.4f}\n")

all_attack_results = {}
all_attack_results.update(character_attack_results)
all_attack_results.update(word_attack_results)
all_attack_results.update(noise_attack_results)

# Sort attacks by effectiveness (most damaging first)
attack_effectiveness = []
for attack_name, results in all_attack_results.items():
    degradation = calculate_performance_degradation(baseline_results, results, 'eval_f1')
    attack_f1 = results.get('eval_f1', 0)
    attack_effectiveness.append((attack_name, degradation, attack_f1))

# Sort by degradation (highest first)
attack_effectiveness.sort(key=lambda x: x[1], reverse=True)

print("Attack Effectiveness Ranking:")
print("-" * 60)
print(f"{'Attack Name':<25} {'F1 Score':<10} {'Degradation':<12}")
print("-" * 60)
for attack_name, degradation, f1_score in attack_effectiveness:
    print(f"{attack_name:<25} {f1_score:.4f}    {degradation:>6.1f}%")


In [None]:
# Create a simple text-based visualization
def create_attack_impact_chart(attack_effectiveness, baseline_f1):
    """Create a simple text-based chart showing attack impact"""
    print(f"\nAttack Impact on F1 Score (Baseline: {baseline_f1:.4f})")
    print("=" * 70)

    for attack_name, degradation, f1_score in attack_effectiveness:
        bar_length = int((f1_score / baseline_f1) * 50)
        bar = "█" * bar_length + " " * (50 - bar_length)
        print(f"{attack_name:<25} |{bar}| {f1_score:.4f} (-{degradation:.1f}%)")

create_attack_impact_chart(attack_effectiveness, baseline_results.get('eval_f1', 0))