In [2]:
!pip install transformers datasets


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-an

In [1]:
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification
from transformers import BertTokenizerFast, BertForTokenClassification
from datasets import load_dataset

# Function to load the dataset
def load_data():
    """Load the NER dataset used for fine-tuning and evaluation.

    Returns:
        The object returned by ``load_dataset`` for the ``"conll2003"``
        builder. The rest of this notebook indexes it by the ``'train'`` and
        ``'test'`` splits and reads label names from the ``'ner_tags'``
        feature.
    """
    # NOTE(review): the recorded output of this notebook shows split sizes of
    # 14041/3250/3453 and PER/ORG/LOC/MISC labels, i.e. what looks like the
    # public CoNLL-2003 corpus rather than the contents of `data_files`.
    # Confirm whether the custom labeled file is actually being used.
    return load_dataset(
        "conll2003",
        data_files="/content/drive/MyDrive/EthioMart-/scripts/labeled_data.conll",
        # The conll2003 repository ships a loading script; without this flag
        # `datasets` stops and asks for confirmation on stdin, which blocks an
        # unattended "Restart & Run All".
        trust_remote_code=True,
    )

# Function to tokenize and align labels
def tokenize_and_align_labels(examples, tokenizer):
    """Tokenize a batch of pre-split sentences and attach per-token label ids.

    Args:
        examples: Batch mapping with a ``'tokens'`` entry (one list of words
            per sentence) and an ``'ner_tags'`` entry (one list of word-level
            label ids per sentence).
        tokenizer: Callable applied to ``examples['tokens']``; its result must
            expose ``word_ids(batch_index=...)`` and an ``"input_ids"`` entry,
            and accept item assignment.

    Returns:
        The tokenizer result with an added ``"labels"`` entry. The first
        sub-token of every word carries that word's label; special tokens,
        padding and continuation sub-tokens carry ``-100``.
    """
    ignore_id = -100

    encodings = tokenizer(
        examples['tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    def _labels_for_row(row, word_labels):
        """Build the label-id list for sentence number ``row`` of the batch."""
        word_ids = list(encodings.word_ids(batch_index=row))
        # Pair each position with the word id seen just before it, so the
        # start of a new word can be told apart from a continuation piece.
        preceding = [None] + word_ids[:-1]
        aligned = [
            ignore_id if current is None or current == before else word_labels[current]
            for current, before in zip(word_ids, preceding)
        ]

        # Reconcile the label length with the number of input ids for the row.
        target_len = len(encodings["input_ids"][row])
        missing = target_len - len(aligned)
        if missing > 0:
            aligned.extend([ignore_id] * missing)
        return aligned[:target_len]

    encodings["labels"] = [
        _labels_for_row(row, word_labels)
        for row, word_labels in enumerate(examples['ner_tags'])
    ]
    return encodings

# Function to train and evaluate a model
def train_and_evaluate_model(model_name, tokenizer, model, train_dataset, eval_dataset):
    """Fine-tune ``model`` on ``train_dataset`` and evaluate it on ``eval_dataset``.

    Args:
        model_name: Label used in the output/log directory names and in the
            printed summary line.
        tokenizer: Tokenizer forwarded to ``tokenize_and_align_labels``.
        model: Token-classification model handed to ``Trainer``.
        train_dataset: Dataset with ``'tokens'`` and ``'ner_tags'`` columns
            used for training.
        eval_dataset: Dataset with the same columns used for per-epoch and
            final evaluation.

    Returns:
        The metrics dictionary returned by ``trainer.evaluate()``.
    """
    # Imported locally so this function stays self-contained.
    import torch

    def _tokenize(batch):
        """Tokenize one batch with the tokenizer of the model being trained."""
        return tokenize_and_align_labels(batch, tokenizer)

    # Tokenize the datasets
    tokenized_train_dataset = train_dataset.map(_tokenize, batched=True)
    tokenized_eval_dataset = eval_dataset.map(_tokenize, batched=True)

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        evaluation_strategy="epoch",  # Evaluate once per epoch
        save_strategy="epoch",  # Save strategy set to match evaluation strategy
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=2,  # Kept low to limit training time
        weight_decay=0.01,
        logging_dir=f'./logs/{model_name}',
        save_total_limit=2,
        # Mixed precision needs a CUDA device; fall back to full precision on
        # CPU-only runtimes instead of failing while building the arguments.
        fp16=torch.cuda.is_available(),
        dataloader_num_workers=0,  # Load batches in the main process
        disable_tqdm=False,
        load_best_model_at_end=True,
        report_to="none"  # No external experiment tracker
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_eval_dataset,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    evaluation_results = trainer.evaluate()
    print(f"{model_name} Evaluation results: {evaluation_results}")
    return evaluation_results

# Load your dataset
dataset = load_data()

# Get label list from dataset
label_list = dataset['train'].features['ner_tags'].feature.names
num_labels = len(label_list)
print(f"Number of labels: {num_labels}, Labels: {label_list}")

# Prepare datasets: small fixed-seed subsets keep the three-model comparison quick.
# The requested sizes are clamped to the split length so that a split with
# fewer rows than requested is used in full instead of failing on selection.
TRAIN_SUBSET_SIZE = 500
EVAL_SUBSET_SIZE = 100
train_dataset = dataset['train'].shuffle(seed=42).select(
    range(min(TRAIN_SUBSET_SIZE, len(dataset['train'])))
)
eval_dataset = dataset['test'].shuffle(seed=42).select(
    range(min(EVAL_SUBSET_SIZE, len(dataset['test'])))
)

# Fine-tune and evaluate DistilBERT
distilbert_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")
distilbert_model = DistilBertForTokenClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=num_labels)
distilbert_results = train_and_evaluate_model("DistilBERT", distilbert_tokenizer, distilbert_model, train_dataset, eval_dataset)

# Fine-tune and evaluate XLM-Roberta
xlm_roberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
xlm_roberta_model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)
xlm_roberta_results = train_and_evaluate_model("XLM-Roberta", xlm_roberta_tokenizer, xlm_roberta_model, train_dataset, eval_dataset)

# Fine-tune and evaluate mBERT
# NOTE: despite the `mbart_` prefix, these variables hold multilingual BERT
# (bert-base-multilingual-cased); the names are kept so later cells that
# reference them keep working.
mbart_tokenizer = BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")
mbart_model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_labels)
mbart_results = train_and_evaluate_model("mBERT", mbart_tokenizer, mbart_model, train_dataset, eval_dataset)

# Compare the results (lower evaluation loss is treated as better)
results_comparison = {
    "DistilBERT": distilbert_results['eval_loss'],
    "XLM-Roberta": xlm_roberta_results['eval_loss'],
    "mBERT": mbart_results['eval_loss']
}

# Find the best-performing model
best_model = min(results_comparison, key=results_comparison.get)
print(f"Best performing model is: {best_model} with evaluation loss: {results_comparison[best_model]}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Number of labels: 9, Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,No log,0.265777
2,No log,0.181446


DistilBERT Evaluation results: {'eval_loss': 0.18144619464874268, 'eval_runtime': 0.2961, 'eval_samples_per_second': 337.746, 'eval_steps_per_second': 84.437, 'epoch': 2.0}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,No log,0.433605
2,No log,0.295752


XLM-Roberta Evaluation results: {'eval_loss': 0.29575154185295105, 'eval_runtime': 0.4991, 'eval_samples_per_second': 200.366, 'eval_steps_per_second': 50.092, 'epoch': 2.0}


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,No log,0.18246
2,No log,0.117299


mBERT Evaluation results: {'eval_loss': 0.1172986626625061, 'eval_runtime': 0.4395, 'eval_samples_per_second': 227.546, 'eval_steps_per_second': 56.887, 'epoch': 2.0}
Best performing model is: mBERT with evaluation loss: 0.1172986626625061
