# Dataset creation

I generated data using ChatGPT, resulting in 998 short sentences  
where mountain names are marked/tagged. Let's look at some example  
sentences and the format they are written in.

In [118]:
import json

# Open and load the raw dataset from a JSON file
f = open('raw_dataset.json', 'r', encoding='utf-8')
data = json.load(f)


print("Example of structure: ", data['sentences'][0])
print()

# Print the first 5 sentences from the loaded data
for i, sentence_data in enumerate(data['sentences'][:5]):
    print(f"Sentence {i+1}: {sentence_data['text']}")
    print(f"Labels:", sentence_data['labels']['mountain'])
    print()

Example of structure:  {'text': 'Mount Everest stands as the highest peak in the world at 8,848 meters.', 'labels': {'mountain': 'Mount Everest'}}

Sentence 1: Mount Everest stands as the highest peak in the world at 8,848 meters.
Labels: Mount Everest

Sentence 2: Climbers spent three weeks ascending the dangerous slopes of K2.
Labels: K2

Sentence 3: The Rocky Mountains stretch from British Columbia to New Mexico.
Labels: Rocky Mountains

Sentence 4: Tourists often visit Mont Blanc to enjoy skiing in the winter.
Labels: Mont Blanc

Sentence 5: The Andes Mountains form the longest mountain range in South America.
Labels: Andes Mountains



I decided to use a lightweight version of the BERT model — DistilBERT.  
Since my dataset is small and the task is relatively simple. To train  
the model, we need to convert the data into an appropriate format.

In [119]:
import re
import json

# Load the raw dataset from JSON file

f = open('raw_dataset.json', 'r', encoding='utf-8')
data = json.load(f)

def convert_data_to_bert_format(data):
    """
    Convert raw NER data into BERT-compatible BIO format.
    
    BIO Format:
    - B-TAG: Beginning of an entity
    - I-TAG: Inside/continuation of an entity
    - O: Outside any entity
    
    Args:
        data (dict): Raw data containing sentences and their entity labels
        
    Returns:
        list: List of dictionaries containing tokenized sentences and their BIO labels
    """
    converted_data = []
    
    # Process each sentence in the dataset
    for sentence_data in data['sentences']:
        # Tokenize the sentence into words and punctuation
        words = re.findall(r'\w+|[^\w\s]', sentence_data['text'])
        
        # Initialize all words with 'O' (Outside) label
        word_labels = ["O"] * len(words)
        
        # Process each entity and its corresponding label in the sentence
        for entity, entity_name in sentence_data['labels'].items():
            # Handle both string and list entity names
            # Some datasets might have entity names as strings, others as lists
            if isinstance(entity_name, str):
                entity_words = entity_name.split()
            else:
                entity_words = entity_name
                
            # Find and label all occurrences of the entity in the sentence
            for i in range(len(words) - len(entity_words) + 1):
                # Check if we found the entity at current position
                if words[i:i + len(entity_words)] == entity_words:
                    # Label the first word of entity with B- (Beginning)
                    word_labels[i] = f"B-{entity.upper()}"
                    
                    # Label subsequent words with I- (Inside)
                    for j in range(1, len(entity_words)):
                        word_labels[i + j] = f"I-{entity.upper()}"
        
        # Store the processed sentence and its labels
        converted_data.append({
            "sentence": words,          # Tokenized words
            "labels": word_labels       # Corresponding BIO labels
        })
    
    return converted_data

# Convert the data to BERT format and save it
converted_data = convert_data_to_bert_format(data)

# Save the processed data to a new JSON file
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(converted_data, f, ensure_ascii=False, indent=4)

# Let's create and train the DistilBert model.

We will train the model for 5 epochs, which is generally  
sufficient for such a small model.

In [None]:
import json
import torch
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
from datasets import Dataset

# 1. Data Loading and Preparation
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# JSON file with training data
file_path = "data.json"
data = load_data(file_path)

def convert_to_dataset(data):
    """
    Convert raw JSON data into format suitable for training.
    
    Args:
        data (list): List of dictionaries containing sentences and their NER labels
        
    Returns:
        tuple: (tokenized_data, labels) where:
            - tokenized_data: List of sentences
            - labels: List of corresponding NER tags
    """
    tokenized_data = []
    labels = []
    for item in data:
        tokenized_data.append(item["sentence"])
        labels.append(item["labels"])
    return tokenized_data, labels

# Convert data into required format
sentences, ner_tags = convert_to_dataset(data)

# Create label mapping for converting string labels to integers
label_list = sorted(set(tag for tags in ner_tags for tag in tags))
label_map = {label: i for i, label in enumerate(label_list)}

def align_labels_with_tokens(tokenizer, sentence, labels):
    """
    Align NER labels with tokenized input, handling subword tokenization.
    
    Args:
        tokenizer: Hugging Face tokenizer
        sentence (list): List of input sentences
        labels (list): List of NER labels for each sentence
        
    Returns:
        dict: Tokenized inputs with aligned labels
    """
    tokenized_inputs = tokenizer(sentences, padding=True, truncation=True, 
                               is_split_into_words=True, return_tensors="pt")
    labels_enc = []
    
    # Process each sentence and its labels
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        
        # Handle subword tokens and special tokens
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens get label -100 (ignored in loss calculation)
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of word gets the NER label
                label_ids.append(label_map[label[word_idx]])
            else:
                # Subsequent subword tokens: keep I- labels, ignore B- labels
                label_ids.append(label_map[label[word_idx]] 
                               if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx
            
        labels_enc.append(label_ids)
    
    tokenized_inputs["labels"] = torch.tensor(labels_enc)
    return tokenized_inputs

# 2. Model and Tokenizer Initialization
# Load pre-trained DistilBERT tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-cased", 
                                                        num_labels=len(label_map))

# Prepare training data
train_data = align_labels_with_tokens(tokenizer, sentences, ner_tags)

# Convert to Hugging Face Dataset format and split into train/test
dataset = Dataset.from_dict(train_data)
train_test_split = dataset.train_test_split(test_size=0.2)

# 3. Training Configuration
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Directory for storing training outputs
    eval_strategy="epoch",           # Evaluation strategy
    per_device_train_batch_size=32,  # Training batch size
    per_device_eval_batch_size=32,   # Evaluation batch size
    num_train_epochs=10,             # Number of training epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Initialize data collator for handling variable length sequences
data_collator = DataCollatorForTokenClassification(tokenizer)

# 4. Trainer Setup
# Initialize Hugging Face Trainer
trainer = Trainer(
    model=model,                           # The model to train
    args=training_args,                    # Training arguments
    train_dataset=train_test_split["train"], # Training data
    eval_dataset=train_test_split["test"],   # Evaluation data
    data_collator=data_collator,           # Data collator
    processing_class=tokenizer,            # Tokenizer for processing inputs
)

# 5. Model Training and Saving
# Start the training process
trainer.train()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|██        | 25/125 [00:04<00:18,  5.39it/s]
 21%|██        | 26/125 [00:05<00:30,  3.27it/s]

{'eval_loss': 0.07221564650535583, 'eval_runtime': 0.4126, 'eval_samples_per_second': 484.71, 'eval_steps_per_second': 16.965, 'epoch': 1.0}


 40%|████      | 50/125 [00:09<00:13,  5.41it/s]
 41%|████      | 51/125 [00:10<00:22,  3.35it/s]

{'eval_loss': 0.029547180980443954, 'eval_runtime': 0.4025, 'eval_samples_per_second': 496.839, 'eval_steps_per_second': 17.389, 'epoch': 2.0}


 60%|██████    | 75/125 [00:14<00:09,  5.39it/s]
 61%|██████    | 76/125 [00:15<00:14,  3.33it/s]

{'eval_loss': 0.022712137550115585, 'eval_runtime': 0.4029, 'eval_samples_per_second': 496.378, 'eval_steps_per_second': 17.373, 'epoch': 3.0}


 80%|████████  | 100/125 [00:19<00:04,  5.39it/s]
 81%|████████  | 101/125 [00:20<00:07,  3.34it/s]

{'eval_loss': 0.020518431439995766, 'eval_runtime': 0.4015, 'eval_samples_per_second': 498.139, 'eval_steps_per_second': 17.435, 'epoch': 4.0}


100%|██████████| 125/125 [00:25<00:00,  5.41it/s]
100%|██████████| 125/125 [00:27<00:00,  4.55it/s]

{'eval_loss': 0.023718249052762985, 'eval_runtime': 0.3686, 'eval_samples_per_second': 542.623, 'eval_steps_per_second': 18.992, 'epoch': 5.0}
{'train_runtime': 27.4676, 'train_samples_per_second': 145.262, 'train_steps_per_second': 4.551, 'train_loss': 0.07670320892333984, 'epoch': 5.0}





TrainOutput(global_step=125, training_loss=0.07670320892333984, metrics={'train_runtime': 27.4676, 'train_samples_per_second': 145.262, 'train_steps_per_second': 4.551, 'total_flos': 22400266581000.0, 'train_loss': 0.07670320892333984, 'epoch': 5.0})

We can see that the model's loss is quite low, which indicates  
successful training so far. This suggests that the model has effectively  
learned from the data, and now we will proceed to train it on the entire  
available dataset to further refine its performance

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
)

trainer.train()

# Save the trained model and tokenizer for later use
model.save_pretrained("./trained_distilbert_ner")
tokenizer.save_pretrained("./trained_distilbert_ner")

 10%|▉         | 31/320 [00:05<00:55,  5.22it/s]
 10%|█         | 33/320 [00:08<02:50,  1.68it/s]

{'eval_loss': 2.3928210794110782e-05, 'eval_runtime': 1.8872, 'eval_samples_per_second': 528.83, 'eval_steps_per_second': 16.956, 'epoch': 1.0}


 20%|█▉        | 63/320 [00:13<00:48,  5.32it/s]
 20%|██        | 65/320 [00:15<02:29,  1.71it/s]

{'eval_loss': 0.0011436919448897243, 'eval_runtime': 1.8598, 'eval_samples_per_second': 536.621, 'eval_steps_per_second': 17.206, 'epoch': 2.0}


 30%|██▉       | 95/320 [00:21<00:42,  5.27it/s]
 30%|███       | 97/320 [00:23<02:10,  1.71it/s]

{'eval_loss': 3.817624383373186e-05, 'eval_runtime': 1.852, 'eval_samples_per_second': 538.877, 'eval_steps_per_second': 17.279, 'epoch': 3.0}


 40%|███▉      | 127/320 [00:29<00:36,  5.25it/s]
 40%|████      | 129/320 [00:31<01:52,  1.70it/s]

{'eval_loss': 1.1501042536110617e-05, 'eval_runtime': 1.8577, 'eval_samples_per_second': 537.213, 'eval_steps_per_second': 17.225, 'epoch': 4.0}


 50%|████▉     | 159/320 [00:37<00:30,  5.22it/s]
 50%|█████     | 161/320 [00:39<01:33,  1.70it/s]

{'eval_loss': 9.148269782599527e-06, 'eval_runtime': 1.8638, 'eval_samples_per_second': 535.46, 'eval_steps_per_second': 17.169, 'epoch': 5.0}


 60%|█████▉    | 191/320 [00:45<00:24,  5.25it/s]
 60%|██████    | 193/320 [00:47<01:15,  1.69it/s]

{'eval_loss': 8.085515219136141e-06, 'eval_runtime': 1.8735, 'eval_samples_per_second': 532.705, 'eval_steps_per_second': 17.081, 'epoch': 6.0}


 70%|██████▉   | 223/320 [00:52<00:18,  5.24it/s]
 70%|███████   | 225/320 [00:54<00:55,  1.70it/s]

{'eval_loss': 7.510131126764463e-06, 'eval_runtime': 1.8597, 'eval_samples_per_second': 536.639, 'eval_steps_per_second': 17.207, 'epoch': 7.0}


 80%|███████▉  | 255/320 [01:00<00:12,  5.23it/s]
 80%|████████  | 257/320 [01:02<00:37,  1.70it/s]

{'eval_loss': 7.104204996721819e-06, 'eval_runtime': 1.8588, 'eval_samples_per_second': 536.909, 'eval_steps_per_second': 17.216, 'epoch': 8.0}


 90%|████████▉ | 287/320 [01:08<00:06,  5.20it/s]
 90%|█████████ | 289/320 [01:10<00:18,  1.69it/s]

{'eval_loss': 6.845733423688216e-06, 'eval_runtime': 1.8671, 'eval_samples_per_second': 534.521, 'eval_steps_per_second': 17.139, 'epoch': 9.0}


100%|█████████▉| 319/320 [01:16<00:00,  5.23it/s]
100%|██████████| 320/320 [01:20<00:00,  3.98it/s]


{'eval_loss': 6.767094419046771e-06, 'eval_runtime': 1.8329, 'eval_samples_per_second': 544.503, 'eval_steps_per_second': 17.459, 'epoch': 10.0}
{'train_runtime': 80.4996, 'train_samples_per_second': 123.976, 'train_steps_per_second': 3.975, 'train_loss': 0.0009172443300485611, 'epoch': 10.0}


('./trained_distilbert_ner/tokenizer_config.json',
 './trained_distilbert_ner/special_tokens_map.json',
 './trained_distilbert_ner/vocab.txt',
 './trained_distilbert_ner/added_tokens.json',
 './trained_distilbert_ner/tokenizer.json')

# 3. Let's demonstrate the model's results.

To demonstrate that the model has learned but still has clear  
shortcomings, let's examine two sentences.

In [131]:
# Example sentences for testing NER model
bad_sentence = "Mount Fuji is Japan's iconic and highest mountain, known for its symmetrical beauty and cultural significance."
good_sentence = "Softly, softly crawl, snail on the slope of Mount Fuji, up to the heights."
sentences = [bad_sentence.split(), good_sentence.split()]

# Load the trained model and tokenizer from saved path
model_path = "./trained_distilbert_ner"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForTokenClassification.from_pretrained(model_path)

# Tokenize input sentences
# is_split_into_words=True because we're passing pre-tokenized sentences
inputs = tokenizer(sentences, 
                  padding=True,        # Add padding to make all sequences same length
                  truncation=True,     # Truncate sequences that are too long
                  is_split_into_words=True, 
                  return_tensors="pt")  # Return PyTorch tensors

# Get model predictions
# torch.no_grad() disables gradient calculation for inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the most likely label for each token
predictions = torch.argmax(outputs.logits, dim=2)

# Get the label mapping dictionary from model config
# This maps numerical label IDs back to string labels
label_map = model.config.id2label

# Convert predicted label indices back to actual labels
predicted_labels = []
for i, sentence in enumerate(sentences):
    # Get word IDs to handle subword tokenization
    word_ids = inputs.word_ids(batch_index=i)
    predicted_labels_sentence = []
    
    # Process predictions for each token
    for word_id, label_id in zip(word_ids, predictions[i]):
        if word_id is not None:  # Skip special tokens (CLS, SEP, PAD)
            label = label_map[label_id.item()]
            # Only keep the first subword token's prediction for each word
            if len(predicted_labels_sentence) <= word_id:
                predicted_labels_sentence.append(label)
    
    predicted_labels.append(predicted_labels_sentence)

# Define mapping from model labels to entity types
label_to_entity = {
    'LABEL_0': "Mountain",  # First label type
    'LABEL_1': "Mountain",  # Second label type
    'LABEL_2': ""           # No entity
}

# Display results
for i, (sentence, pred_labels) in enumerate(zip(sentences, predicted_labels)):
    print(f"Sentence {i+1}:")
    for word, pred_label in zip(sentence, pred_labels):
        print(f"{word} -- {label_to_entity[pred_label]}")
    print("\n")

Sentence 1:
Mount -- Mountain
Fuji -- Mountain
is -- 
Japan's -- 
iconic -- 
and -- 
highest -- 
mountain, -- 
known -- 
for -- 
its -- 
symmetrical -- 
beauty -- 
and -- 
cultural -- 
significance. -- 


Sentence 2:
Softly, -- 
softly -- 
crawl, -- 
snail -- 
on -- 
the -- 
slope -- 
of -- 
Mount -- 
Fuji, -- 
up -- 
to -- 
the -- 
heights. -- 




We can see that the model performed well on the sentence with  
a clearer context, but struggled with the second one. These  
results will be analyzed in the report file.

In [132]:
pip freeze > requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
