In [None]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install gdown
!pip install wandb

In [None]:
import json
import os
from typing import Union,List
import sys

import torch
from transformers import XGLMTokenizer, XGLMForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset


In [None]:
def load_model(base_model: str = "facebook/xglm-564M"):
    """Load a pretrained XGLM causal language model and its tokenizer.

    Args:
        base_model: Name or path handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    xglm_tokenizer = XGLMTokenizer.from_pretrained(base_model)
    xglm_model = XGLMForCausalLM.from_pretrained(base_model)
    return xglm_model, xglm_tokenizer


# Instantiate the default checkpoint once; later cells use these two globals.
model, tokenizer = load_model()

In [None]:
# Speaker markers used in the prompt template; they are added to the
# tokenizer vocabulary as extra tokens.
new_tokens = ['<human>:', '<bot>:']
tokenizer.add_tokens(new_tokens)

# The vocabulary may have grown, so resize the embedding matrix to match it.
model.resize_token_embeddings(len(tokenizer))

In [None]:
import gdown

# Google Drive download link for the training data. The adjacent string
# literals are concatenated by Python into one URL.
url = (
    'https://drive.google.com/uc?export=download'
    '&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU'
    '&confirm=t'
    '&uuid=d79e2e78-51de-466f-9ceb-3944606141a2'
    '&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145'
)
# Local file name the download is written to.
output = 'output.jsonl'
gdown.download(url=url, output=output, quiet=False)

In [None]:
def format_prompt(prompt):
    """Flatten one raw record into a single training string.

    Args:
        prompt: Mapping with the keys ``'Background:'``, ``'<human>:'`` and
            ``'<bot>:'``.

    Returns:
        A dict with one key, ``'prompt'``, holding
        ``"<background> <human>: <human text> <bot>: <bot text>"``.
    """
    background = prompt['Background:']
    human_turn = prompt['<human>:']
    bot_turn = prompt['<bot>:']
    text = "{} <human>: {} <bot>: {}".format(background, human_turn, bot_turn)
    return {'prompt': text}
# Each training example is laid out as: <sep> context <human>: ... <bot>: ... <sep>
def preprocess(prompt):
    """Tokenize one formatted example and attach causal-LM labels.

    Args:
        prompt: Mapping with a ``'prompt'`` key holding the text to encode.

    Returns:
        The tokenizer output with the EOS id appended to ``input_ids``, a
        matching ``1`` appended to ``attention_mask``, and ``labels`` set to
        a copy of the final ``input_ids``.
    """
    encoding = tokenizer(
        prompt['prompt'],
        max_length=256,
        truncation=True,
        padding=False,
        return_tensors=None,
    )

    token_ids = encoding['input_ids']
    mask = encoding['attention_mask']

    # Close the sequence with EOS and keep the mask the same length.
    token_ids.append(tokenizer.eos_token_id)
    mask.append(1)

    # Labels are an independent copy of the input ids.
    encoding['labels'] = list(token_ids)
    return encoding

In [None]:
from datasets import load_dataset
# Read the downloaded JSON-lines file with the generic 'json' loader.
datasets = load_dataset(path='json', data_files='output.jsonl')
datasets

In [None]:
# Build the training set in one pipeline:
#   1. take the 'train' split,
#   2. collapse the three raw columns into a single 'prompt' string,
#   3. record each prompt's token count in a 'token' column,
#   4. keep only prompts with fewer than 255 tokens,
#   5. tokenize and add labels, dropping the helper columns.
datasets = (
    datasets['train']
    .map(format_prompt, remove_columns=['Background:', '<human>:', '<bot>:'])
    .map(lambda row: {'token': len(tokenizer.tokenize(row['prompt']))})
    .filter(lambda row: row['token'] < 255)
    .map(preprocess, remove_columns=['prompt', 'token'])
)
datasets

In [None]:
# Sanity filter: keep only examples whose first and last token ids are 2.
# NOTE(review): 2 is presumably the tokenizer's `</s>` id (the value that
# `preprocess` appends as EOS) -- confirm against tokenizer.eos_token_id.
datasets = (
    datasets
    .filter(lambda row: row['input_ids'][0] == 2)
    .filter(lambda row: row['input_ids'][-1] == 2)
)
datasets

In [None]:
# --- Training hyperparameters -------------------------------------------
num_epochs = 3
learning_rate = 3e-7

# Batching: `micro_batch_size` examples go through the model at a time, and
# gradients are accumulated until `batch_size` examples have been seen.
micro_batch_size = 4
batch_size = 128
gradient_accumulation_steps = batch_size // micro_batch_size

# Directory that receives the training checkpoints.
output_dir = 'checkpoint-xglm'

In [None]:
# Weights & Biases settings are passed through environment variables:
# the project name, and a flag asking for the model to be logged.
os.environ.update(
    WANDB_PROJECT='wandb_project',
    WANDB_LOG_MODEL='true',
)

In [None]:
# Configuration for the Hugging Face Trainer.
train_args = TrainingArguments(
    # -- Output / experiment tracking --
    output_dir=output_dir,  # where checkpoints are written
    run_name='finetune-xglm',  # name of this run in the tracker
    report_to="wandb",  # send logs to Weights & Biases
    # -- Batching --
    per_device_train_batch_size=micro_batch_size,  # examples per forward pass
    # Accumulate gradients over this many micro-batches before updating weights
    # (see https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation).
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    # -- Optimisation --
    num_train_epochs=num_epochs,  # number of epochs
    learning_rate=learning_rate,  # learning rate
    warmup_steps=100,
    optim="adamw_torch",  # optimizer name
    fp16=True,  # train in fp16 mixed precision
    # -- Logging and checkpointing --
    logging_steps=1,  # log after every optimizer step
    save_strategy="steps",  # checkpoint on a step schedule (not per epoch)
    save_steps=1_000,  # checkpoint every 1,000 optimizer steps
    save_total_limit=3,  # keep at most 3 checkpoints on disk
)

# Collator that pads each batch; sequence lengths are rounded up to a
# multiple of 8 and batches are returned as PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

trainer = Trainer(
    model=model,  # model to fine-tune
    args=train_args,
    train_dataset=datasets,  # training split
    # eval_dataset=val_data,
    data_collator=data_collator,
)

# Turn off the model's cache for training.
# NOTE(review): presumably needed because gradient checkpointing is enabled -- confirm.
model.config.use_cache = False

# Run the standard training loop.
trainer.train()