In [None]:
!pip install --upgrade accelerate evaluate transformers datasets numpy tiktoken torch rouge_score trl peft wandb

In [None]:
# !nvidia-smi
# !nvcc --version

In [None]:
import evaluate
import numpy as np
import torch
import os
import math
import transformers
# from trl import SFTTrainer
# from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from transformers import AutoModelForCausalLM, Trainer, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from io import open

# Choose the compute backend: MPS wins when it reports available, then CUDA,
# and the CPU is the fallback when neither accelerator is present.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Report the selected backend and the installed transformers version,
# one value per line.
print(device, transformers.__version__, sep="\n")

In [None]:
# --- Experiment configuration: dataset, paths and base checkpoint ---
project_root = '..'
dataset_name = 'vblagoje/lfqa_support_docs'
# Configuration name handed to load_dataset as its second argument.
# None means "no explicit configuration", so the name is always defined
# even for datasets that do not need one.
dataset_variant = None
dir_name = 'lfqa'
# Alternative corpus (switch these three together):
# dataset_name = 'wikitext'
# dataset_variant = 'wikitext-2-raw-v1'
# dir_name = 'wikitext'
data_dir = os.path.join(project_root, 'data', dir_name)
model_dir = os.path.join(project_root, 'models')
# Base checkpoint to fine-tune; alternatives are kept for quick switching.
# model_checkpoint = 'gpt2'
model_checkpoint = 'distilgpt2'
# model_checkpoint = 'mistralai/Mistral-7B-v0.1'
# model_checkpoint = 'Salesforce/xgen-7b-8k-base'

In [None]:
# Load the tokenizer and the causal-LM for the configured checkpoint;
# both calls use model_dir as their cache directory.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
    cache_dir=model_dir,
)
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint,
    cache_dir=model_dir,
)

In [None]:
# Show the loaded model object as the cell output.
model

In [None]:
# Register '<|endoftext|>' as the end-of-sequence token and reuse that same
# token for padding.
special_tokens_dict = {'eos_token': '<|endoftext|>'}
# The count returned here is stored but not read again in this section.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.pad_token = tokenizer.eos_token
# Resize the model's token embeddings to the tokenizer's current length so the
# two stay in sync if the call above added a token.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# if not os.path.exists(data_dir):
#     os.mkdir(data_dir)
# Load the dataset, passing a configuration name only when one is defined.
# Only the "dataset_variant is not defined" case is tolerated: download or
# configuration errors raised by load_dataset (and keyboard interrupts) now
# propagate instead of being hidden by a catch-all handler.
try:
    _variant = dataset_variant
except NameError:
    _variant = None
datasets = load_dataset(dataset_name, _variant, cache_dir=data_dir)
    

In [None]:
# Show the loaded dataset object as the cell output.
datasets

In [None]:
def tokenize(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` entry of a batch.

    ``examples`` is indexed with the key ``"text"``; whatever the tokenizer
    returns for that value is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
def transform(examples):
    """Render one record as a single question/answer prompt string.

    Reads ``examples['input']`` as the question and the ``'answer'`` field of
    the first element of ``examples['output']`` as the answer, and returns
    them combined under the key ``"text"``.
    """
    question = examples['input']
    first_answer = examples['output'][0]['answer']
    return {"text": f"Question: {question} \n Answer: {first_answer}"}

In [None]:
# Build the "text" column for every record with 4 worker processes and drop
# the raw columns it was derived from.
datasets = datasets.map(
    transform,
    remove_columns=["input", "output", "meta", "id"],
    num_proc=4,
)

In [None]:
# Tokenize the "text" column in batches with 4 worker processes; the raw text
# column is removed from the result.
tokenized_datasets = datasets.map(
    tokenize,
    remove_columns=["text"],
    batched=True,
    num_proc=4,
)

In [None]:
# Number of tokens per training block. The tokenizer's model_max_length is
# kept below as a commented-out alternative to the fixed value.
# block_size = tokenizer.model_max_length
block_size = 128
# Echo the chosen value as the cell output.
block_size

In [None]:
def group_inputs(examples):
    """Concatenate a tokenized batch and cut it into fixed-length blocks.

    Every column of ``examples`` (a mapping from column name to a list of
    per-example lists) is flattened into one long list and then sliced into
    consecutive chunks of ``block_size`` items, where ``block_size`` is read
    from the notebook's global scope. The trailing remainder that does not
    fill a whole block is dropped.

    Returns:
        A dict with the same keys as ``examples`` holding the list of blocks
        for each column, plus a ``"labels"`` entry that is a shallow copy of
        the ``"input_ids"`` blocks.
    """
    # Flatten each column in a single pass. ``sum(lists, [])`` rebuilds the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # The first column sets the usable length; it is rounded down to a whole
    # number of blocks so every emitted chunk has exactly block_size items.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized rows with group_inputs, handing it 1000 rows per call
# and spreading the work over 4 worker processes.
grouped_data = tokenized_datasets.map(group_inputs, batched=True, batch_size=1000, num_proc=4)

In [None]:
# Show the regrouped dataset object as the cell output.
grouped_data

In [None]:
# Sanity check: decode the input_ids of the second training row (index 1)
# back to text and display it.
tokenizer.decode(grouped_data["train"][1]["input_ids"])

In [None]:
# Short model name: the last path segment of the checkpoint id.
model_name = model_checkpoint.split("/")[-1]
# Hub dataset ids have the form "owner/name"; keep only the final segment so
# the run directory is one folder instead of a nested "owner/name" path.
dataset_tag = dataset_name.split("/")[-1]
# Newer transformers releases call this option `eval_strategy`, older ones
# only know `evaluation_strategy`. Use whichever the installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    # model_dir already starts at project_root, so it is joined as-is rather
    # than being prefixed with another "../".
    os.path.join(model_dir, f"{model_name}-finetuned-{dataset_tag}"),
    # optim="adamw_torch",
    # logging_steps=100,
    num_train_epochs=1,
    learning_rate=2e-4,
    # lr_scheduler_type="linear",
    # warmup_ratio=0.1,
    weight_decay=0.01,
    # prediction_loss_only=True,
    # save_strategy="epoch",
    **{eval_strategy_key: "epoch"},  # run evaluation once per epoch
)

In [None]:
# metric = evaluate.load("rouge")

In [None]:
# def compute_metrics(eval_pred):
#     logits, past_keys = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=past_keys) 

In [None]:
# Build the Trainer from the model, the training arguments and the 'train' /
# 'validation' splits of the grouped data. No compute_metrics callback is
# passed (the option is left commented out).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=grouped_data['train'],
    eval_dataset=grouped_data['validation'],
    # compute_metrics=compute_metrics,
)

In [None]:
# Ask whichever accelerator backend is available to empty its cache before
# training starts; nothing happens when neither CUDA nor MPS is available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [None]:
# Run training, then save the model via the trainer.
trainer.train()
# NOTE(review): this saves directly into model_dir, which is also the
# cache_dir used when loading the base checkpoint — confirm that writing the
# fine-tuned files at that level is intended.
trainer.save_model(model_dir)

In [None]:
# Per-model folder under model_dir, named after the checkpoint's short name.
model_path = os.path.join(model_dir, model_name)
# Echo the path as the cell output.
model_path

In [None]:
# Save the model into its per-model folder.
trainer.save_model(model_path)

In [None]:
!pip install huggingface_hub

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the import lives in
# this cell because huggingface_hub is installed by the cell just before it.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Derive the Hub repository name from the configured checkpoint and dataset
# directory ('distilgpt2-lfqa' with the current settings) so the upload is
# labelled after whatever was actually fine-tuned.
model.push_to_hub(f"{model_name}-{dir_name}")


In [None]:
# Derive the Hub repository name from the configured checkpoint and dataset
# directory ('distilgpt2-lfqa' with the current settings) so the tokenizer is
# uploaded under a name that reflects what was actually fine-tuned.
tokenizer.push_to_hub(f"{model_name}-{dir_name}")