In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
sys.path.append('.')
from utils import preprocess_article, compute_metrics, calculate_length, tokenize_function, evaluate_model, generate_predictions

In [3]:
import gdown

In [6]:
import gdown

# Replace with the direct download link
url = 'https://drive.google.com/uc?export=download&id=1BoVDdHgPc0HYyG85NmVw8qESSQzdLaNV'
output = '/home/michael_george/notebooks/Amharic.csv'

gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1BoVDdHgPc0HYyG85NmVw8qESSQzdLaNV
From (redirected): https://drive.google.com/uc?export=download&id=1BoVDdHgPc0HYyG85NmVw8qESSQzdLaNV&confirm=t&uuid=4243bd09-7569-4288-967b-2664098140e4
To: /home/michael_george/notebooks/Amharic.csv
100%|██████████| 260M/260M [00:06<00:00, 42.3MB/s] 


'/home/michael_george/notebooks/Amharic.csv'

In [4]:
# Load dataset
data = load_dataset("csv", data_files="Amharic.csv")
data

DatasetDict({
    train: Dataset({
        features: ['article', 'category'],
        num_rows: 61915
    })
})

In [5]:

# Preprocess and calculate length
data['train'] = data['train'].map(calculate_length, batched=False)
data['train'] = data['train'].map(preprocess_article, batched=False)

In [6]:
# Split dataset
raw_datasets = data['train'].train_test_split(train_size=0.8, seed=42)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['article', 'category', 'word_count'],
        num_rows: 49532
    })
    test: Dataset({
        features: ['article', 'category', 'word_count'],
        num_rows: 12383
    })
})

In [None]:
# Tokenize dataset
categories = list(set(data['train']['category']))
category_to_id = {cat: idx for idx, cat in enumerate(categories)}
id_to_category = {idx: cat for cat, idx in category_to_id.items()}

model_name = "rasyosef/bert-small-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_datasets = raw_datasets.map(lambda x: tokenize_function(x, tokenizer, category_to_id))

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

# Set format for datasets
tokenized_datasets.set_format("torch")

Map:   0%|          | 56/49532 [00:00<01:29, 552.74 examples/s]

Map: 100%|██████████| 49532/49532 [01:32<00:00, 536.09 examples/s]
Map: 100%|██████████| 12383/12383 [00:23<00:00, 527.28 examples/s]


In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(categories),
    id2label = {i: lbl for i, lbl in enumerate(categories)},
    label2id = {lbl: i for i, lbl in enumerate(categories)},
    
   )


embedding_layer = model.base_model.embeddings
print(f"Embedding layer: {embedding_layer}")
print(f"Embedding details: {embedding_layer.word_embeddings.weight.shape}")

print(f"Model configuration: {model.config}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at rasyosef/bert-small-amharic and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding layer: BertEmbeddings(
  (word_embeddings): Embedding(24576, 512, padding_idx=0)
  (position_embeddings): Embedding(512, 512)
  (token_type_embeddings): Embedding(2, 512)
  (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)
Embedding details: torch.Size([24576, 512])
Model configuration: BertConfig {
  "_name_or_path": "rasyosef/bert-small-amharic",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "id2label": {
    "0": "Others",
    "1": "Sports",
    "2": "Business",
    "3": "International News",
    "4": "Entertainment",
    "5": "Local News",
    "6": "Politics"
  },
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "label2id": {
    "Business": 2,
    "Entertainment": 4,
    "International News": 3,
    "Local News": 5,
    "Others": 0,
    "Politics": 6,
 

Model evaluation before finetuning

In [None]:
before_finetuning_predictions = generate_predictions(model, tokenized_datasets, device='cpu', id_to_category=id_to_category,num_samples=5)
before_finetuning_predictions 

NameError: name 'generate_predictions' is not defined

In [None]:
# # Evaluate the model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# test_before_metrics = evaluate_model(model, tokenized_datasets, data_collator, device,'test')
# print(test_before_metrics)


In [None]:
training_args = TrainingArguments(
    output_dir=model_name + "-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=True,
    seed=42,
)

NameError: name 'TrainingArguments' is not defined

In [None]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6069,0.419877,0.843495,0.844957,0.843495,0.842833


Model evaluation after finetuning

In [None]:
after_finetuning_predictions = generate_predictions(model, tokenized_datasets, device="cpu", id_to_category=id_to_category,num_samples=5)
after_finetuning_predictions

NameError: name 'generate_predictions' is not defined