In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('../PalmLM-70000-tokenizer')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [2]:
# Load model directly
from transformers import AutoModelForCausalLM
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_int8_training
import torch

# tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-MT/polylm-1.7b")
model = AutoModelForCausalLM.from_pretrained("DAMO-NLP-MT/polylm-1.7b", device_map="auto", load_in_8bit=True,torch_dtype=torch.float16,)

config = LoraConfig(
    r=4, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)

lora_model = get_peft_model(model, config)

In [3]:
from datasets import load_dataset, interleave_datasets

# lang = ['', '', '', '', '', '', '', '', '', '']

train_afaa = load_dataset("castorini/afriberta-corpus", "afaanoromoo", split="train", streaming=True)
test_afaa = load_dataset("castorini/afriberta-corpus", "afaanoromoo", split="test", streaming=True)
train_amh = load_dataset("castorini/afriberta-corpus", "amharic", split="train", streaming=True)
test_amh = load_dataset("castorini/afriberta-corpus", "amharic", split="test", streaming=True)
train_gah = load_dataset("castorini/afriberta-corpus", "gahuza", split="train", streaming=True)
test_gah = load_dataset("castorini/afriberta-corpus", "gahuza", split="test", streaming=True)
train_hau = load_dataset("castorini/afriberta-corpus", "hausa", split="train", streaming=True)
test_hau = load_dataset("castorini/afriberta-corpus", "hausa", split="test", streaming=True)
train_igb = load_dataset("castorini/afriberta-corpus", "igbo", split="train", streaming=True)
test_igb = load_dataset("castorini/afriberta-corpus", "igbo", split="test", streaming=True)
train_som = load_dataset("castorini/afriberta-corpus", "somali", split="train", streaming=True)
test_som = load_dataset("castorini/afriberta-corpus", "somali", split="test", streaming=True)
train_swa = load_dataset("castorini/afriberta-corpus", "swahili", split="train", streaming=True)
test_swa = load_dataset("castorini/afriberta-corpus", "swahili", split="test", streaming=True)
train_tig = load_dataset("castorini/afriberta-corpus", "tigrinya", split="train", streaming=True)
test_tig = load_dataset("castorini/afriberta-corpus", "tigrinya", split="test", streaming=True)
train_yor = load_dataset("castorini/afriberta-corpus", "yoruba", split="train", streaming=True)
test_yor = load_dataset("castorini/afriberta-corpus", "yoruba", split="test", streaming=True)


multilingual_train = interleave_datasets([train_afaa, train_amh, train_gah, train_hau, train_igb, train_som, train_swa, train_tig, train_yor])
multilingual_test = interleave_datasets([test_afaa, test_amh, test_gah, test_hau, test_igb, test_som, test_swa, test_tig, test_yor])


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|██████████| 3.67k/3.67k [00:00<00:00, 16.6MB/s]
Downloading metadata: 100%|██████████| 18.7k/18.7k [00:00<00:00, 14.5MB/s]
Downloading readme: 100%|██████████| 3.42k/3.42k [00:00<00:00, 16.3MB/s]


In [4]:
# from datasets import DatasetDict

# raw_datasets = DatasetDict(
#     {
#         "train": multilingual_train, #.shuffle().select(range(10000)),
#         "valid": multilingual_test, #.shuffle().select(range(100))
#     }
# )

# raw_datasets

In [5]:
# for key in raw_datasets["train"][0]:
#     print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

In [6]:



# outputs = tokenizer(
#     raw_datasets["train"][:2]["text"],
#     truncation=True,
#     max_length=context_length,
#     return_overflowing_tokens=True,
#     return_length=True,
# )

# print(f"Input IDs length: {len(outputs['input_ids'])}")
# print(f"Input chunk lengths: {(outputs['length'])}")
# print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

In [7]:
context_length = 128

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
        padding='max_length',
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_train = multilingual_train.map(
    tokenize, batched=True, remove_columns=multilingual_train.column_names
)
tokenized_test = multilingual_test.map(
    tokenize, batched=True, remove_columns=multilingual_train.column_names
)

In [8]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
# out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
# for key in out:
#     print(f"{key} shape: {out[key].shape}")

In [10]:
# from peft import LoraConfig
# Lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none"
#     )

# # trainer = transformers.Trainer(
# # model=model,
# # train_dataset=train_data_transformed,
# # args=training_args,
# # data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
# # peft_config=Lora_config

# # )

In [14]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="../AfriPalmLM",
    per_device_train_batch_size=8, # Reduced from 32 due to small GPU memeory
    per_device_eval_batch_size=8, # Reduced from 32 due to small GPU memeory
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=200,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=100,
    fp16=True,
    push_to_hub=True,
    max_steps=460000 # Because we are streaming, if not it gives errors
)

trainer = Trainer(
    model=lora_model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [15]:
##this empties the GPU
torch.cuda.empty_cache()

In [16]:
trainer.train()



Step,Training Loss,Validation Loss
200,8.5158,7.785079
400,7.7921,7.778021
600,7.7713,7.760106
800,7.7557,7.755634
1000,7.4333,7.261116




ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), '(Request ID: f86822a0-3b04-437d-9a36-12a11662a87c)')

In [17]:
# Resuming Training
trainer.train(resume_from_checkpoint = True)



Step,Training Loss,Validation Loss
1000,7.3371,7.261116
1200,7.2775,7.233973




In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()