We'll get our data by scraping for sentences with mountain names

In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import evaluate
import random
import spacy
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm
2025-02-28 00:06:47.215369: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740694007.258381   62164 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740694007.267115   62164 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
DATA_DIR = "data/raw-img"

In [3]:
def get_label_names(data_dir):
    data = []
    for label in os.listdir(data_dir):
        data.append(label)
    
    return data

animal_labels = get_label_names(DATA_DIR)
print(animal_labels)

['elephant', 'chicken', 'cat', 'spider', 'dog', 'sheep', 'horse', 'butterfly', 'cow', 'squirrel']


We'll get the mountain list from wikipedia as it is simple and reliable

We'll get 3 sentences for each mountain, that gives us a nice list of around 4500 sentences

In [None]:
def get_wordhippo_sentences(required_word, exclude_words, num_sentences=3):
    url = f"https://www.wordhippo.com/what-is/sentences-with-the-word/{required_word}.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve data for '{required_word}'. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    sentence_tds = soup.find_all('td')

    raw_sentences = [
        td.get_text(strip=False)
        for td in sentence_tds
        if required_word.lower() in td.get_text(strip=False).lower()
    ]

    replaced = [txt.replace('\n', '') for txt in raw_sentences]

    splitted = []
    for txt in replaced:
        sub_sents = txt.split('.')
        sub_sents = [s.strip() for s in sub_sents if s.strip()]
        splitted.extend(sub_sents)
    
    included = [
        s for s in splitted
        if required_word.lower() in s.lower()
    ]

    final_sentences = []
    for s in included:
        if any(exw.lower() in s.lower() for exw in exclude_words):
            continue 
        final_sentences.append(s)

    if len(final_sentences) > num_sentences:
        final_sentences = random.sample(final_sentences, num_sentences)

    return final_sentences

animal_name = "elephant"
exclude_words = ["translate", "sentences"]
sentences = get_wordhippo_sentences(animal_name, exclude_words, num_sentences=500)
print(sentences)
print(len(sentences))

500


In [None]:
animal_names_sentences = []
for animal in animal_labels:
    animal_names_sentences = animal_names_sentences + get_wordhippo_sentences(animal, exclude_words, num_sentences=300)


In [None]:
print(len(animal_names_sentences))
print(animal_names_sentences[0:5])

3000
['No decent soul could be glad that an elephant was shot in a drive-by', 'The use and trade of elephant ivory have become controversial because they have contributed to seriously declining elephant populations in many countries', 'The mammoth or woolly elephant, the hyna, the cave bear, and the reindeer ranged the land', 'The boy possesses the uncanny ability to eat like an elephant and remain as skinny as a stick', 'York would gain absolutely nothing in having an airport, except perhaps a huge white elephant that would cost a large amount of money']


In [None]:
with open('animal_sentences.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for sentence in animal_names_sentences:
        writer.writerow([sentence])

In [4]:
labels = ["O"] + [f"B-{animal.upper()}" for animal in animal_labels]
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}
print(id2label)
print(label2id)
print(label2id)

{0: 'O', 1: 'B-ELEPHANT', 2: 'B-CHICKEN', 3: 'B-CAT', 4: 'B-SPIDER', 5: 'B-DOG', 6: 'B-SHEEP', 7: 'B-HORSE', 8: 'B-BUTTERFLY', 9: 'B-COW', 10: 'B-SQUIRREL'}
{'O': 0, 'B-ELEPHANT': 1, 'B-CHICKEN': 2, 'B-CAT': 3, 'B-SPIDER': 4, 'B-DOG': 5, 'B-SHEEP': 6, 'B-HORSE': 7, 'B-BUTTERFLY': 8, 'B-COW': 9, 'B-SQUIRREL': 10}
{'O': 0, 'B-ELEPHANT': 1, 'B-CHICKEN': 2, 'B-CAT': 3, 'B-SPIDER': 4, 'B-DOG': 5, 'B-SHEEP': 6, 'B-HORSE': 7, 'B-BUTTERFLY': 8, 'B-COW': 9, 'B-SQUIRREL': 10}


In [6]:
nlp = spacy.load("en_core_web_sm")

In [None]:
def annotate_sentence_to_json(sentence, sent_id=0):
    doc = nlp(sentence)
    tokens = []
    ner_tags = []
    
    for token in doc:
        tokens.append(token.text)
        if token.text.lower() in animal_labels:
            label = f"B-{token.text.lower().upper()}"  # e.g. B-DOG, B-CAT
        else:
            label = "O"
        ner_tags.append(label2id.get(label, 0))  # default to 0 if not found
    
    return {
        "id": sent_id,
        "tokens": tokens,
        "ner_tags": ner_tags
    }

def annotate_and_save_json(input_csv, output_file):
    data = []
    with open(input_csv, "r", encoding="utf-8", newline="") as fin:
        reader = csv.reader(fin)
        for i, row in enumerate(reader):
            if not row:  
                continue
            sentence = row[0].strip()
            if not sentence:
                continue

            data.append(annotate_sentence_to_json(sentence, sent_id=i))

    with open(output_file, "w", encoding="utf-8") as fout:
        json.dump(data, fout, indent=2)
    
    with open(output_file, "w", encoding="utf-8") as fout:
        json.dump(data, fout, indent=2)

In [None]:
annotate_and_save_json("animal_sentences.csv", "bert_annotations.json")

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True
    )
    
    # Align labels to subword tokens
    word_ids = tokenized.word_ids()  # subword -> index of original word
    original_labels = example["ner_tags"]

    label_ids = []
    for word_id in word_ids:
        if word_id is None:
            label_ids.append(-100)
        else:
            label_ids.append(original_labels[word_id])
    
    tokenized["labels"] = label_ids
    return tokenized


In [6]:
with open("bert_annotations.json", "r", encoding="utf-8") as f:
    data_list = json.load(f)  # 'data_list' is a list of dicts

hf_dataset = Dataset.from_list(data_list)

In [7]:
dataset = hf_dataset.map(tokenize_and_align_labels)
dataset = dataset.shuffle(seed=42)

train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test["train"]
temp_dataset = train_test["test"]

val_test = temp_dataset.train_test_split(test_size=0.5)
val_dataset = val_test["train"]
test_dataset = val_test["test"]

Map: 100%|██████████| 3000/3000 [00:01<00:00, 1508.05 examples/s]


In [8]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    from_tf=True
)

2025-02-28 00:07:39.217503: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All TF 2.0 model weights were used when initializing BertForTokenClassification.

All the weights of BertForTokenClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.


In [9]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = logits.argmax(-1)

    true_predictions = []
    true_labels      = []
    for pred, lab in zip(predictions, labels):
        tmp_pred = []
        tmp_lab  = []
        for p, l in zip(pred, lab):
            if l != -100:
                tmp_pred.append(p)
                tmp_lab.append(l)
        true_predictions.append(tmp_pred)
        true_labels.append(tmp_lab)

    # Convert IDs to label strings
    true_predictions_str = [
        [id2label[p] for p in pred_seq] for pred_seq in true_predictions
    ]
    true_labels_str = [
        [id2label[l] for l in lab_seq] for lab_seq in true_labels
    ]

    results = metric.compute(predictions=true_predictions_str, references=true_labels_str)
    return {
        "precision": results["overall_precision"],
        "recall":    results["overall_recall"],
        "f1":        results["overall_f1"],
        "accuracy":  results["overall_accuracy"],
    }

In [11]:
training_args = TrainingArguments(
    output_dir="my_bert_ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,     # must have .map()ed data
    eval_dataset=val_dataset,       # optional but recommended
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


KeyboardInterrupt: 