In [None]:
!pip install -q accelerate==0.18.0
!pip install -q datasets
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q gdown
!pip install -q wandb

In [None]:
import json
import os
from typing import Union,List
import sys

import torch
from transformers import XGLMTokenizer, XGLMForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset


In [None]:
# Expose GPUs 0 through 7 to this process (value: "0,1,2,3,4,5,6,7").
# NOTE(review): this only has an effect if it runs before the first CUDA call
# made by the process.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in range(8))

In [None]:
import gc

# Collect unreachable Python objects first: tensors that are only kept alive by
# reference cycles are released here, which marks their CUDA blocks as unused.
gc.collect()
# Only now hand the unused cached blocks back to the driver. Calling this before
# the collection (as was done previously) would leave those blocks cached.
torch.cuda.empty_cache()

In [None]:
# Per-GPU memory budget, later passed to `from_pretrained(..., max_memory=...)`.
#
# `torch.cuda.mem_get_info(device)` returns a single (free_bytes, total_bytes)
# pair for ONE device. Enumerating that pair directly would produce
# {0: <free>, 1: <total>} for the current device instead of one entry per GPU,
# so the function is queried once per visible device and only the free amount
# (index 0) is kept, divided by 1024**3 and rounded down to a whole number.
max_memory = {
    gpu_index: f"{int(torch.cuda.mem_get_info(gpu_index)[0]/1024**3)}GB"
    for gpu_index in range(torch.cuda.device_count())
}
print(max_memory)

In [None]:
def load_model(base_model:str="facebook/xglm-564M"):
    """Load an XGLM checkpoint together with its tokenizer and add the chat markers.

    Args:
        base_model: Name or path of the pretrained checkpoint to load.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer has the ``<human>:`` and
        ``<bot>:`` tokens added, and the model's embedding matrix is resized to
        the new tokenizer length.
    """
    tokenizer = XGLMTokenizer.from_pretrained(base_model)
    # The model is dispatched with device_map='auto', bounded by the
    # module-level `max_memory` budget.
    model = XGLMForCausalLM.from_pretrained(
        base_model,
        device_map='auto',
        max_memory=max_memory,
    )

    # Register the two role markers used in the prompt template, then grow the
    # embedding table so the new token ids have rows.
    tokenizer.add_tokens(['<human>:', '<bot>:'])
    model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer


model, tokenizer = load_model()

In [None]:
# model.hf_device_map

In [None]:
import gdown

# Google Drive download link for the training data and the local file name it
# is stored under (read back later with the 'json' dataset loader).
url = 'https://drive.google.com/uc?export=download&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU&confirm=t&uuid=d79e2e78-51de-466f-9ceb-3944606141a2&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145'
output = 'output.jsonl'

# Skip the transfer when the file is already on disk so that re-running the
# notebook does not download the dataset again. Delete the file to force a
# fresh download.
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

In [None]:
def format_prompt(prompt):
    """Render one raw record as a single training string and count its tokens.

    Args:
        prompt: Mapping with the keys ``'Background:'``, ``'<human>:'`` and
            ``'<bot>:'``.

    Returns:
        A dict with ``'prompt'`` (the three fields joined as
        ``"<background> <human>: <human text> <bot>: <bot text>"``) and
        ``'token_prompt'`` (the number of tokens the module-level ``tokenizer``
        produces for that string via ``tokenize``).
    """
    background = prompt['Background:']
    human_turn = prompt['<human>:']
    bot_turn = prompt['<bot>:']
    joined = f"{background} <human>: {human_turn} <bot>: {bot_turn}"
    return {
        'prompt': joined,
        'token_prompt': len(tokenizer.tokenize(joined)),
    }
def find(prompt):
    """Filter predicate: keep records whose prompt is shorter than 254 tokens.

    Args:
        prompt: Mapping that contains the ``'token_prompt'`` count.

    Returns:
        ``True`` when ``prompt['token_prompt']`` is strictly below 254.
    """
    # 254 rather than 256: per the note at the filter call site, two positions
    # are reserved for `</s>` tokens.
    token_count = prompt['token_prompt']
    return token_count < 254
def preprocess(prompt, max_length=256):
    """Tokenize a formatted prompt and build causal-LM training features.

    Args:
        prompt: Mapping with a ``'prompt'`` string.
        max_length: Upper bound on the length of the returned sequences,
            including the EOS token appended at the end.

    Returns:
        The tokenizer output with an EOS token appended to ``input_ids``, a
        matching ``1`` appended to ``attention_mask``, and ``labels`` set to an
        independent copy of ``input_ids``.
    """
    inputs = tokenizer(
        prompt['prompt'],
        truncation=True,
        # Reserve one position for the EOS token appended below; truncating to
        # the full max_length and then appending would yield max_length + 1.
        max_length=max_length - 1,
        padding=False,
        return_tensors=None,
    )
    inputs['input_ids'].append(tokenizer.eos_token_id)
    inputs['attention_mask'].append(1)
    # Copy so later changes to input_ids do not alias the labels.
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

In [None]:
from datasets import load_dataset

# Read the downloaded JSON Lines file with the generic 'json' loader.
datasets = load_dataset(path='json', data_files='output.jsonl')
# Last expression of the cell: display a summary of the loaded object.
datasets

In [None]:
import multiprocessing
# CPU count reported by the system; used as `num_proc` for the dataset
# map/filter calls so preprocessing runs in that many worker processes.
cpu_cores = multiprocessing.cpu_count()


In [None]:
# For a quick smoke test a small sample can be used instead of the full split,
# e.g. `Dataset.from_dict(datasets['train'][:1_000])` (needs `from datasets import Dataset`).
datasets = (
    datasets['train']
    # 1) Build the prompt string and its token count; drop the raw columns.
    .map(
        format_prompt,
        remove_columns=['Background:', '<human>:', '<bot>:'],
        num_proc=cpu_cores,
    )
    # 2) Drop prompts that are too long (the cutoff leaves two positions for `</s>` tokens).
    .filter(find, num_proc=cpu_cores)
    # 3) Tokenize into input_ids / attention_mask / labels; drop the helper columns.
    .map(
        preprocess,
        remove_columns=['prompt', 'token_prompt'],
        num_proc=cpu_cores,
    )
)
print(datasets)

In [None]:
# --- Schedule ---
num_epochs = 3
learning_rate = 3e-7

# --- Batching ---
# `micro_batch_size` samples are processed per forward/backward pass; gradients
# are accumulated until `batch_size` samples have contributed to one update.
micro_batch_size = 4
batch_size = 128
gradient_accumulation_steps = batch_size // micro_batch_size

# --- Output ---
output_dir = 'checkpoint-xglm'

In [None]:
# Environment variables read by the Weights & Biases integration:
# the project runs are logged under, and the model-logging switch.
os.environ.update({
    "WANDB_PROJECT": 'wandb_project',
    "WANDB_LOG_MODEL": 'true',
})

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient
# Fetch the Weights & Biases API key from the Kaggle secret named "wandb_api"
# instead of hard-coding it in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api")

# Authenticate this session with the retrieved key.
wandb.login(key=wandb_api)

In [None]:
train_args = TrainingArguments(
    # --- Checkpointing ---
    output_dir=output_dir,  # directory checkpoints are written to
    save_strategy="steps",  # checkpoint on a step interval rather than per epoch
    save_steps=1_000,  # checkpoint every 1,000 optimizer steps
    save_total_limit=3,  # keep at most three checkpoints on disk
    # --- Batching / memory ---
    per_device_train_batch_size=micro_batch_size,  # samples per forward/backward pass
    gradient_accumulation_steps=gradient_accumulation_steps,  # passes accumulated before one weight update
    gradient_checkpointing=True,
    # --- Optimisation ---
    optim="adamw_torch",  # optimizer implementation
    learning_rate=learning_rate,
    warmup_steps=100,
    num_train_epochs=num_epochs,
    fp16=True,  # mixed-precision training
    # --- Logging ---
    logging_steps=gradient_accumulation_steps,  # log interval, in steps
    report_to="wandb",  # send metrics to Weights & Biases
    run_name='finetune-xglm',  # run name shown in the tracker
)
def _free_gpu_memory_gb():
    """Return ``{device_index: "<free memory>GB"}`` for every visible CUDA device.

    ``torch.cuda.mem_get_info(device)`` yields one ``(free_bytes, total_bytes)``
    pair for a single device, so it is called once per device and only the free
    amount is reported (bytes divided by 1024**3, two decimals).
    """
    return {
        gpu_index: f"{torch.cuda.mem_get_info(gpu_index)[0]/1024**3:.2f}GB"
        for gpu_index in range(torch.cuda.device_count())
    }


# Free memory before the Trainer is built.
print(_free_gpu_memory_gb())

trainer = Trainer(
    model=model,  # model to fine-tune
    train_dataset=datasets,  # tokenized training set
    # eval_dataset=val_data,
    args=train_args,
    # Pads input_ids / attention_mask / labels of each batch to a common length
    # (rounded up to a multiple of 8) and returns PyTorch tensors.
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
# Free memory after the Trainer is built.
print(_free_gpu_memory_gb())

# Disable the generation key/value cache for training (gradient checkpointing is on).
model.config.use_cache = False

# NOTE: the former `model = torch.compile(model)` step was removed. It rebound
# the name `model` only after the Trainer had already captured the original
# module, so training never used the compiled wrapper. If compilation is wanted,
# it has to be applied before the Trainer is constructed.

# Free memory right before training starts.
print(_free_gpu_memory_gb())
trainer.train()

# Save the fine-tuned weights together with the tokenizer: the tokenizer was
# extended with new tokens and the embeddings resized, so the checkpoint is only
# usable with this tokenizer, not the base one.
model.save_pretrained('checkpoint')
tokenizer.save_pretrained('checkpoint')