SET UP A VIRTUAL ENVIRONMENT -your_env_name- WITH PYTHON 3.10 (conda create -n your_env_name python=3.10)


In [None]:
#1 INSTALL REQUIRED LIBRARIES

#!pip install keras==2.15.0
#!pip install tensorboard==2.15.0
#!pip install ml-dtypes==0.2.0
#!pip install transformers datasets torch sentencepiece
#!pip install tensorflow
#!pip install "accelerate>=0.26.0"  # quoted so the shell does not treat >= as an output redirect
#!pip list

In [None]:
# ENVIRONMENT VARIABLES SET AFTERWARDS TO AVOID ERRORS WHEN CONFIGURING THE TRAINING ARGUMENTS (STEP 4-2)
# (The error occurs because transformers currently has compatibility issues
# with Keras 3, which is bundled with recent versions of TensorFlow.)

#!pip uninstall tensorflow tensorflow-macos keras -y

import os

# This cell runs before any cell that imports `transformers`, so the flag is
# already in the environment when the library is loaded.
# NOTE(review): confirm that the installed transformers release actually reads
# TRANSFORMERS_NO_TF -- TODO verify against the library's documentation.
os.environ["TRANSFORMERS_NO_TF"] = "1"
#os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" #FOR EXTENDING MEMORY ALLOCATION, MIGHT HANG YOUR COMPUTER


# OPTIONAL TENSORFLOW VERSION CHECK (DISABLED) AND NOTE OF THE CURRENT DIRECTORY
#import tensorflow as tf
#print(tf.__version__)

# Relative paths used later (the corpus file, ./swahili_model) resolve against
# this directory. os.getcwd() is used instead of the `!pwd` shell escape so the
# cell also works where no `pwd` command exists (e.g. Windows).
print(os.getcwd())

In [None]:
#2 LOAD THE DATA

from datasets import load_dataset

# Path to the plain-text corpus; point this at your own file.
# (The author's corpus has 34M words, 1.64M lines, 197M chars.)
DATA_FILE = 'swahili2.txt'
# Number of rows to show as a sanity check after loading.
PREVIEW_ROWS = 5

dataset = load_dataset('text', data_files=DATA_FILE)

# Never request more rows than the corpus holds: selecting or indexing past the
# end raises an error on small test files.
train_split = dataset['train']
preview_count = min(PREVIEW_ROWS, len(train_split))

# Printing the selected subset shows its summary (columns and row count).
print(train_split.select(range(preview_count)))

# Print the raw text of each previewed row.
for i in range(preview_count):
    print(train_split[i]['text'])

In [None]:
# 3-1 USE A PRE-EXISTING TOKENIZER

from transformers import AutoTokenizer

# Reuse the tokenizer published with the bert-base-uncased checkpoint rather
# than training a new vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)
print("DONE")

In [None]:
# 3-2 TOKENIZE THE DATASET

def tokenize_function(examples):
    """Tokenize one batch of raw text for masked-language-model training.

    Args:
        examples: Batch mapping supplied by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The result of calling the module-level ``tokenizer`` on the batch.
    """
    # Truncate to at most 512 tokens, but do not pad here. Padding every line
    # to 512 tokens would store (and later train on) mostly padding for short
    # lines; the DataCollatorForLanguageModeling used for training pads each
    # batch to its own longest sequence instead.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Apply tokenize_function to the loaded dataset, one batch of rows at a time.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)
print("DONE")

In [None]:
# 4-1 Prepare for Training - CREATE DATA COLLATOR

from transformers import DataCollatorForLanguageModeling

# mlm=True selects masked-language-model batches, the objective BERT-style
# models are trained with.
collator_options = {"tokenizer": tokenizer, "mlm": True}
data_collator = DataCollatorForLanguageModeling(**collator_options)
print("DONE")

In [None]:
# 4-2  Prepare for Training - CONFIGURE TRAINING ARGUMENTS

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./swahili_model",
#    eval_strategy="epoch",
    eval_strategy="no",  # no evaluation dataset is passed to the Trainer
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    # Checkpoints are written periodically during training. On a corpus of
    # this size that is hundreds of checkpoints per epoch, so keep only the
    # two most recent ones to avoid filling the disk.
    save_total_limit=2,
)
print("DONE")

In [None]:
# 4-3  Prepare for Training - LOAD MODEL

# Earlier attempt, kept for reference only (it raised an error message):
#
#   from transformers import BertForMaskedLM
#   model = BertForMaskedLM.from_pretrained("bert-base-uncased")

from transformers import AutoModelForMaskedLM

# Start from the pretrained bert-base-uncased weights with a masked-LM head.
model = AutoModelForMaskedLM.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)
print("Model loaded successfully.")


In [None]:
#5 TRAIN THE MODEL

from transformers import Trainer

# Gather everything the Trainer needs in one place, then build it.
trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_inputs)

# Runs the full training loop configured by training_args.
trainer.train()
print("DONE")

In [None]:
# 6 SAVE THE MODEL

# Write the weights and the tokenizer files to the same directory so both can
# be reloaded from a single path.
save_dir = "./swahili_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

USING THE MODEL WEIGHTS

Step 1: Load the Model and Tokenizer

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Directory the fine-tuned model and tokenizer were saved to.
saved_dir = "./swahili_model"

# Reload both pieces from disk, replacing the in-memory training objects.
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
model = AutoModelForMaskedLM.from_pretrained(saved_dir)
print("Model loaded")

Step 2: Generate Text with Masked Language Modeling
Since BERT-based models are designed for masked token prediction, you can use the model to fill in blanks within a sentence. Here’s how:

1.	Define a Prompt with Masked Tokens:
Create a Swahili sentence with a [MASK] token where you want the model to predict words.

In [None]:
# Prompt for the fill-mask step below; [MASK] marks the position whose word
# the model is asked to predict.
input_text: str = "Wakati [MASK] leo."  # Example prompt in Swahili

In [None]:
# 2 Tokenize input
# Encode the prompt as PyTorch tensors ("pt").
inputs = tokenizer(text=input_text, return_tensors="pt")

In [None]:
#3 Predict masked token
from transformers import pipeline

# Wrap the reloaded model and tokenizer in a fill-mask pipeline, which takes
# the raw prompt string directly.
fill_mask = pipeline(task="fill-mask", model=model, tokenizer=tokenizer)

# Run the prompt and print each candidate sentence with the mask filled in.
result = fill_mask(input_text)
for candidate in result:
    print(candidate["sequence"])