In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install --upgrade accelerate evaluate transformers datasets numpy tiktoken torch rouge_score trl peft wandb

In [None]:
# Shell out to report the visible NVIDIA GPUs/driver and the installed CUDA compiler version.
!nvidia-smi
!nvcc --version

In [None]:
import evaluate
import numpy as np
import torch
import os
import math
# from trl import SFTTrainer
# from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
# from peft import get_peft_model, prepare_model_for_int8_training
# from peft import prepare_model_for_int8_training
# from peft import LoraConfig, get_peft_model
import transformers
from transformers import AutoModel, Trainer, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from io import open

# Pick the compute device: Apple MPS wins when present, then CUDA, else the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Print the installed transformers version for reference.
print(transformers.__version__)

In [None]:
# Hub identifiers for the dataset and the checkpoint to fine-tune.
dataset_name = 'vblagoje/lfqa_support_docs'
model_checkpoint = 'mistralai/Mistral-7B-v0.1'
# Other checkpoints tried previously: 'gpt2', 'distilgpt2', 'Salesforce/xgen-7b-8k-base'

# Local cache/output directories, both anchored at the project root.
project_root = '..'
data_dir, model_dir = (
    os.path.join(project_root, subdir) for subdir in ('data/lfqa/', 'models/')
)

In [None]:
# Load the checkpoint together with its language-modelling head. The bare AutoModel
# backbone returns hidden states only and never a loss, so Trainer could not
# optimise it; AutoModelForCausalLM computes the LM loss when labels are supplied.
model = transformers.AutoModelForCausalLM.from_pretrained(model_checkpoint, cache_dir=model_dir).to(device)
# NOTE(review): trust_remote_code=True allows the checkpoint repo to run its own
# tokenizer code; keep it only for checkpoints that actually need it.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir=model_dir, trust_remote_code=True)

In [None]:
# Bare expression: the notebook renders the loaded model's repr as the cell output.
model

In [None]:
# Only introduce '<|endoftext|>' when the checkpoint ships without an EOS token.
# Overwriting an existing EOS would register a brand-new, untrained token and leave
# the tokenizer's EOS out of sync with the EOS id stored in the model config.
special_tokens_dict = {'eos_token': '<|endoftext|>'}
num_added_toks = 0
if tokenizer.eos_token is None:
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Reuse EOS as the padding token so fixed-length padding works.
tokenizer.pad_token = tokenizer.eos_token
# Keep the embedding matrix the same size as the (possibly extended) vocabulary.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Load every split of the dataset, pointing the datasets cache at data_dir.
data = load_dataset(path=dataset_name, cache_dir=data_dir)

In [None]:
# Bare expression: show the loaded dataset object as the cell output.
data

In [None]:
# Pull out the two splits used below; no test split is taken here.
train_data, val_data = data['train'], data['validation']

In [None]:
# Bare expression: show the training split as the cell output.
train_data

In [None]:
# Keep the first of 24 shards of the training split to shrink the run.
# Sharding from data['train'] (not from train_data itself) keeps this cell idempotent:
# re-running it no longer shards the already-sharded subset again.
train_data = data['train'].shard(num_shards=24, index=0)
train_data

In [None]:
def transform(example, max_length=1024):
    """Turn one dataset row into fixed-length causal-LM training features.

    The row's question (``example['input']``) and first answer
    (``example['output'][0]['answer']``) are formatted into one prompt string,
    which is tokenized, truncated and padded to ``max_length``.

    Args:
        example: A dataset row with ``'input'`` and ``'output'`` fields.
        max_length: Sequence length to truncate/pad to (default 1024).

    Returns:
        A dict of plain Python lists, each ``max_length`` long:
        ``input_ids``, ``attention_mask`` and ``labels``. ``labels`` copies
        ``input_ids`` but holds -100 at padded positions so they are ignored
        by the language-modelling loss.
    """
    qa = f"Question: {example['input']} \n Answer: {example['output'][0]['answer']}"
    # Call the tokenizer directly (instead of .encode) to also get the attention mask.
    # The result stays as Python lists: the previous code called .to(device) on a
    # NumPy array, which has no such method, and datasets stores lists anyway.
    encoded = tokenizer(qa, truncation=True, padding="max_length", max_length=max_length)
    input_ids = list(encoded["input_ids"])
    attention_mask = list(encoded["attention_mask"])
    # Without labels the model returns no loss; mask padding out of the loss with -100.
    labels = [
        token_id if keep else -100
        for token_id, keep in zip(input_ids, attention_mask)
    ]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [None]:
# Raw columns dropped after tokenization so only the features returned by transform remain.
raw_columns = ["input", "output", "meta", "id"]
train_data_tokz = train_data.map(transform, num_proc=4, remove_columns=raw_columns)
# Tokenize the validation split itself; mapping train_data here would make the
# evaluation run on training examples instead of held-out data.
val_data_tokz = val_data.map(transform, num_proc=4, remove_columns=raw_columns)

In [None]:
# Bare expression: show the tokenized training set as the cell output.
train_data_tokz

In [None]:
# Spot-check one tokenized row: take its input_ids and display how many ids it holds.
example = train_data_tokz[5]['input_ids']
len(example)

In [None]:
# Re-sync the embedding matrix with the tokenizer size (the same call is already
# made in the special-token cell earlier in the notebook).
model.resize_token_embeddings(len(tokenizer))
# Disabled PEFT/LoRA path, kept for reference; the matching peft imports are
# commented out at the top of the notebook as well.
# model = prepare_model_for_int8_training(model)
# model = get_peft_model(model, lora_peft_config)

In [None]:
# Short names (the part after the Hub namespace) used to build the run directory.
model_name = model_checkpoint.split("/")[-1]
dataset_short_name = dataset_name.split("/")[-1]
# model_dir already starts at the project root, so join onto it directly. The old
# f"../{model_dir}/..." path climbed one directory too far, and the '/' inside
# dataset_name silently created an extra nested folder.
output_dir = os.path.join(model_dir, f"{model_name}-finetuned-{dataset_short_name}")
training_args = TrainingArguments(
    output_dir,
    per_device_train_batch_size=32,
    optim="adamw_torch",
    logging_steps=100,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version.
    evaluation_strategy="epoch",
    num_train_epochs=1,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    # weight_decay=0.01,
    # NOTE(review): with prediction_loss_only=True evaluation reports only the loss,
    # so a compute_metrics callback passed to Trainer is presumably never invoked.
    prediction_loss_only=True,
    save_strategy="epoch",
)

In [None]:
# Load the ROUGE metric through the evaluate library; compute_metrics below uses this object.
metric = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Score teacher-forced predictions against the reference text with ROUGE.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as
            (batch, position, vocab) and may be wrapped in a tuple whose first
            element is the logits; ``labels`` is indexed as (batch, position)
            and may contain -100 at positions to ignore.

    Returns:
        Whatever ``metric.compute`` returns for the decoded predictions and
        references.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    # A causal LM's output at position t predicts token t + 1, so drop the last
    # prediction and the first label to line the two sequences up.
    predicted_ids = np.argmax(logits, axis=-1)[:, :-1]
    target_ids = np.asarray(labels)[:, 1:]
    # Positions labelled -100 are ignored: blank them with the pad id on both
    # sides so they decode to nothing instead of polluting the score.
    keep = target_ids != -100
    pad_id = tokenizer.pad_token_id
    predicted_ids = np.where(keep, predicted_ids, pad_id)
    target_ids = np.where(keep, target_ids, pad_id)
    # ROUGE compares strings, not token ids, so decode before scoring.
    decoded_predictions = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
    decoded_references = tokenizer.batch_decode(target_ids, skip_special_tokens=True)
    return metric.compute(predictions=decoded_predictions, references=decoded_references)

In [None]:
# Wire the model, training arguments, tokenized splits and metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_data_tokz,
    train_dataset=train_data_tokz,
)

In [None]:
# Ask each available accelerator backend to release its cached memory before training.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop configured by training_args.
trainer.train()
# Persist the final model into model_dir.
# NOTE(review): model_dir is also the cache_dir passed to from_pretrained, so the
# fine-tuned weights land next to the download cache; confirm that is intended.
trainer.save_model(model_dir)