In [17]:
import spacy
import json
import pandas as pd
import os
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset


**Function to load CoNLL data**

In [2]:
def load_conll_data(file_path):
    sentences = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        sentence = []
        label = []
        for line in f:
            line = line.strip()  # Remove extra spaces and newlines
            if line == "":  # Blank lines separate sentences
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
            else:
                # Split the line only if it contains both a token and a label
                parts = line.split()
                if len(parts) == 2:  # Ensure we have exactly 2 parts (token and label)
                    token, entity = parts
                    sentence.append(token)
                    label.append(entity)
                else:
                    print(f"Skipping malformed line: {line}")
                    
    return sentences, labels

# Load the CoNLL data
data_path = '../data/labeled/automated_labeled_data.conll'
sentences, labels = load_conll_data(data_path)

print("Example sentence:", sentences[0])
print("Example labels:", labels[0])


Skipping malformed line: O
Skipping malformed line: O
Skipping malformed line: O
Example sentence: ['mama', 'bag', 'ኦሪጅናል', 'ማቴሪያል', 'በሳይዙ', 'ትልቅ', '1600', 'ብር', 'Free', 'delivery', '0909003864', '0905707448', 'ሊንኩን', 'በመጫን', 'ቴሌግራማችንን', 'ይቀላቀሉhttpstmesinayelj', 'እቃ', 'ለማዘዝ', 'ከስር', 'ያለውን', 'ሊንኮች', 'በመጫን', 'ማዘዝ', 'ትችላላቹ', 'sinasinaye', 'httpstmesinayelj2', 'አድራሻ', '1ቁጥር1', 'ገርጂ', 'ኢምፔሪያል', 'ከሳሚ', 'ህንፃ', 'ጎን', 'አልፎዝ', 'ፕላዛ', 'ግራውንድ', 'ላይ', 'እንደገቡ', 'ያገኙናል', '2ቁጥር2', '4ኪሎ', 'ቅድስት', 'ስላሴ', 'ህንፃ', 'ማለትም', 'ከብልፅግና', 'ዋናፅፈት', 'ቤት', 'ህንፃ', 'በስተ', 'ቀኝ', 'ባለው', 'አስፓልት', '20ሜትር', 'ዝቅ', 'እንዳሉ', 'ሀበሻ', 'ኮፊ', 'የሚገኝበት', 'ቀይ', 'ሸክላ', 'ህንፃ', '2ተኛ', 'ፎቅ', 'ላይ', 'ያገኙናል', '3ቁጥር3', 'ብስራተ', 'ገብርኤል', 'ላፍቶ', 'ሞል', 'መግቢያው', 'ፊት', 'ለፊት', 'የሚገኘው', 'የብስራተ', 'ገብርኤል', 'ቤተ', 'ክርስቲያን', 'ህንፃ', 'አንደኛ', 'ፎቅ', 'ላይ', 'ደረጃ', 'እንደወጣቹ', 'በስተግራ', 'በኩል', 'ሱቅ', 'ቁጥር', 'FF09', 'ክቡራን', 'ደምበኞቻችን', 'ገርጂ', 'አልፎዝ', 'ፕላዛ', 'ላይ', 'አራት', 'ኪሎ', 'ቅድስት', 'ስላሴ', 'እንዲሁም', 'ብስራተ', 'ገብሬል', 'ያሉት', 'ሱቆቻችን', 'ሲመጡ', 'አስተማማኝ', 'ሰፊ', 'ፓርኪንግ', 'ያላቸው

**Tokenize the Date**

In [3]:
from transformers import BertTokenizerFast

# Load the tokenizer for the model you will fine-tune
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Tokenize the sentences
tokenized_inputs = tokenizer(sentences, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")

# Align the labels with the tokens
def align_labels_with_tokens(labels, tokenized_inputs):
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_label = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        aligned_labels.append(aligned_label)
    return aligned_labels

aligned_labels = align_labels_with_tokens(labels, tokenized_inputs)

In [4]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch"
)

print("TrainingArguments setup complete")


TrainingArguments setup complete




**Fine-tune the model**

In [8]:
import torch  # Add this line
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Create a label mapping (label2id and id2label)
unique_labels = set(label for label_list in labels for label in label_list)  # Collect all unique labels
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

print("Label to ID mapping:", label2id)

# Convert string labels to numeric labels, skipping -100 values
def convert_labels_to_ids(labels, label2id):
    numeric_labels = []
    for label_list in labels:
        numeric_label_list = []
        for label in label_list:
            if label == -100:
                numeric_label_list.append(-100)  # Keep the -100 label as is
            else:
                numeric_label_list.append(label2id[label])  # Convert valid labels to IDs
        numeric_labels.append(numeric_label_list)
    return numeric_labels

# Convert the aligned labels to IDs
numeric_labels = convert_labels_to_ids(aligned_labels, label2id)


# Ensure labels have the same length as input_ids (necessary padding if needed)
def align_numeric_labels_with_tokens(labels, tokenized_inputs):
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_label = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        aligned_labels.append(aligned_label)
    return aligned_labels

aligned_numeric_labels = align_numeric_labels_with_tokens(numeric_labels, tokenized_inputs)

# Now we can use the aligned numeric labels in the Dataset class
class NERDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.labels = torch.tensor(labels, dtype=torch.long)  # Make sure labels are in long type

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Create the dataset with numeric labels
dataset = NERDataset(tokenized_inputs, aligned_numeric_labels)

# Load the pre-trained model for token classification
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label2id))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()



Label to ID mapping: {'O': 0, 'B-Product': 1, 'B-LOC': 2}


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                             
 33%|███▎      | 2/6 [01:32<02:25, 36.26s/it]

{'eval_loss': 0.22323617339134216, 'eval_runtime': 13.0771, 'eval_samples_per_second': 1.988, 'eval_steps_per_second': 0.153, 'epoch': 1.0}


                                             
 67%|██████▋   | 4/6 [02:28<00:58, 29.40s/it]

{'eval_loss': 0.13700102269649506, 'eval_runtime': 12.016, 'eval_samples_per_second': 2.164, 'eval_steps_per_second': 0.166, 'epoch': 2.0}


                                             
100%|██████████| 6/6 [03:44<00:00, 37.38s/it]

{'eval_loss': 0.12178146094083786, 'eval_runtime': 13.0792, 'eval_samples_per_second': 1.988, 'eval_steps_per_second': 0.153, 'epoch': 3.0}
{'train_runtime': 224.3211, 'train_samples_per_second': 0.348, 'train_steps_per_second': 0.027, 'train_loss': 0.39222343762715656, 'epoch': 3.0}





TrainOutput(global_step=6, training_loss=0.39222343762715656, metrics={'train_runtime': 224.3211, 'train_samples_per_second': 0.348, 'train_steps_per_second': 0.027, 'total_flos': 8518759562664.0, 'train_loss': 0.39222343762715656, 'epoch': 3.0})

In [21]:

# Split data into train and test sets and save them as separate files

def split_data(file_path, train_output_path, test_output_path, test_size=0.2):
    # Load the data
    sentences, labels = load_conll_data(file_path)
    
    # Calculate the split index
    split_idx = int(len(sentences) * (1 - test_size))
    
    # Split the data into train and test sets
    train_sentences, test_sentences = sentences[:split_idx], sentences[split_idx:]
    train_labels, test_labels = labels[:split_idx], labels[split_idx:]
    
    # Save the train and test data in CoNLL format
    save_conll_data(train_output_path, train_sentences, train_labels)
    save_conll_data(test_output_path, test_sentences, test_labels)

def save_conll_data(output_path, sentences, labels):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)  # Ensure the directory exists
    with open(output_path, 'w', encoding='utf-8') as f:
        for sentence, label in zip(sentences, labels):
            for token, entity in zip(sentence, label):
                f.write(f"{token} {entity}\n")
            f.write("\n")  # Blank line to separate sentences

# Path to your labeled data from Task 3
labeled_data_path = '../data/labeled/automated_labeled_data.conll'
train_data_path = '../data/labeled/train_data.conll'
test_data_path = '../data/labeled/test_data.conll'

# Split the data and save
split_data(labeled_data_path, train_data_path, test_data_path, test_size=0.2)



Skipping malformed line: O
Skipping malformed line: O
Skipping malformed line: O


**Save the Fine-tuned Model**

In [20]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./models/fine_tuned_model")
tokenizer.save_pretrained("./models/fine_tuned_model")


SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })