In [1]:
!pip install -q accelerate==0.18.0
!pip install -q datasets
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q gdown
!pip install -q wandb

[0m

In [2]:
import json
import os
from typing import Union,List
import sys

import torch
from transformers import XGLMTokenizer, XGLMForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset




In [3]:
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"

In [4]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [5]:
max_memory = {i:f"{int(mem/1024**3)}GB"for i,mem in enumerate(torch.cuda.mem_get_info())}
print(max_memory)

{0: '13GB', 1: '14GB'}


In [6]:
def load_model(base_model:str="facebook/xglm-564M"):
    tokenizer = XGLMTokenizer.from_pretrained(base_model)
    model = XGLMForCausalLM.from_pretrained(base_model,device_map='auto',max_memory=max_memory)
    
    new_tokens = ['<human>:', '<bot>:']
    tokenizer.add_tokens(list(new_tokens))
    model.resize_token_embeddings(len(tokenizer))
    return model,tokenizer
model,tokenizer = load_model()

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [7]:
# model.hf_device_map

{'model.embed_tokens': 0,
 'lm_head': 0,
 'model.embed_positions': 0,
 'model.layers.0': 0,
 'model.layers.1': 0,
 'model.layers.2': 0,
 'model.layers.3': 0,
 'model.layers.4': 0,
 'model.layers.5': 0,
 'model.layers.6': 0,
 'model.layers.7': 0,
 'model.layers.8': 0,
 'model.layers.9': 0,
 'model.layers.10': 0,
 'model.layers.11': 0,
 'model.layers.12': 1,
 'model.layers.13': 1,
 'model.layers.14': 1,
 'model.layers.15': 1,
 'model.layers.16': 1,
 'model.layers.17': 1,
 'model.layers.18': 1,
 'model.layers.19': 1,
 'model.layers.20': 1,
 'model.layers.21': 1,
 'model.layers.22': 1,
 'model.layers.23': 1,
 'model.layer_norm': 1}

In [8]:
import gdown

url = 'https://drive.google.com/uc?export=download&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU&confirm=t&uuid=d79e2e78-51de-466f-9ceb-3944606141a2&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145'
output = 'output.jsonl'
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?export=download&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU&confirm=t&uuid=d79e2e78-51de-466f-9ceb-3944606141a2&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145
To: /kaggle/working/output.jsonl
100%|██████████| 167M/167M [00:00<00:00, 269MB/s] 


'output.jsonl'

In [9]:
def format_prompt(prompt):
    text = {'prompt':f"{prompt['Background:']} <human>: {prompt['<human>:']} <bot>: {prompt['<bot>:']}"
           }
    text['token_prompt'] = len(tokenizer.tokenize(text['prompt']))
    return text
def find(prompt):
    return prompt['token_prompt']<254
def preprocess(prompt):
    inputs = tokenizer(
        prompt['prompt'],
        truncation=True,
        max_length=256,
        padding=False,
        return_tensors=None,
    )
    inputs['input_ids'].append(tokenizer.eos_token_id)
    inputs['attention_mask'].append(1)
    inputs['labels'] = inputs['input_ids'].copy()
    return inputs

In [10]:
from datasets import load_dataset
datasets = load_dataset('json',data_files = 'output.jsonl')
datasets

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-0a5fa842038583a8/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-0a5fa842038583a8/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Background:', '<human>:', '<bot>:'],
        num_rows: 116288
    })
})

In [11]:
import multiprocessing
cpu_cores = multiprocessing.cpu_count()


In [12]:
# from datasets import Dataset
# datasets = Dataset.from_dict(datasets['train'][:1_000]) # sample data for test
datasets = datasets['train']
datasets = datasets.map(format_prompt,remove_columns=['Background:', '<human>:', '<bot>:'],num_proc=cpu_cores)
datasets = datasets.filter(find,num_proc=cpu_cores) # two for <\s> token
datasets = datasets.map(preprocess,remove_columns=['prompt','token_prompt'],num_proc=cpu_cores) 
print(datasets)

    

#0:   0%|          | 0/500 [00:00<?, ?ex/s]

#1:   0%|          | 0/500 [00:00<?, ?ex/s]

    

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/252 [00:00<?, ?ex/s]

#1:   0%|          | 0/251 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 503
})


In [13]:
batch_size = 128
micro_batch_size = 4
gradient_accumulation_steps = batch_size // micro_batch_size
num_epochs = 3
learning_rate = 3e-7
output_dir = 'checkpoint-xglm'

In [14]:
os.environ["WANDB_PROJECT"] = 'wandb_project'
os.environ["WANDB_LOG_MODEL"] = 'true'

In [15]:
train_args = TrainingArguments( # สร้าง class train-args
            per_device_train_batch_size=micro_batch_size, # btch_size 
            gradient_accumulation_steps=gradient_accumulation_steps, # https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation เหมือนจะ ค่อยๆคำนวนค่า gradient ตามค่าที่ใส่เข้าไปรอบ แล้วค่อยปรับ weight ทีเดียว ไม่รู้_
            gradient_checkpointing=True,
            warmup_steps=100,# ไม่รู้_
            num_train_epochs=num_epochs, # จำนวน epoch
            learning_rate=learning_rate,# ค่า learning-rate
            fp16= True, # ไม่รู้ว่าคืออะไร ตอนแรก default คือ True เลยเปลี่ยนเป็น False แทน แล้วรันได้เฉย _   ///////// เพราะเราเซ้ตข้างบนไว้ว่าเป็น torch.float16 
            logging_steps=1, # ไม่รู้_ //////////// แสดงผลตอนเทรนทุกๆ 10 step gradient descent
            optim="adamw_torch",# ชื่อ optimizer มั้ง_ /////// yes!!
            save_strategy="steps", # ไม่รู้_ //////////////////////////////// save model based on epoch? steps?
            save_steps=1_000, # ไม่รู้_ ///////////////////// Save model every 200 optimizer.step()
            output_dir=output_dir, # ไม่รู้_ ////////////////////////// Where to save model
            save_total_limit=3, # ไม่รู้_ /////////////////////////// Limit model save amount (Not to have 300 model file when you train 300 epoch)
            report_to="wandb", # ใช้ wandb
            run_name='finetune-xglm', # ชื่อ task
        )
print({i:f"{mem/1024**3}GB"for i,mem in enumerate(torch.cuda.mem_get_info())})

trainer = Trainer(
      model=model,# model ที่จะเอาไปเทรน
      train_dataset=datasets, # data ใน train-set
      # eval_dataset=val_data,
      args=train_args,
      data_collator=DataCollatorForSeq2Seq(
          tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 
      ),
  )
print({i:f"{mem/1024**3}GB"for i,mem in enumerate(torch.cuda.mem_get_info())})
model.config.use_cache = False
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)
# train-ai ปกติ
print({i:f"{mem/1024**3}GB"for i,mem in enumerate(torch.cuda.mem_get_info())})
trainer.train()

model.save_pretrained('checkpoint')

{0: '10.415771484375GB', 1: '14.755615234375GB'}
(11183849472, 15843721216)
{0: '10.415771484375GB', 1: '14.755615234375GB'}


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,4.3277
2,7.5866
3,4.4737
4,4.325
5,4.3704
6,4.3995
7,4.3177
8,4.3951
9,4.4397


TrainOutput(global_step=9, training_loss=4.737281216515435, metrics={'train_runtime': 203.9766, 'train_samples_per_second': 7.398, 'train_steps_per_second': 0.044, 'total_flos': 456238727135232.0, 'train_loss': 4.737281216515435, 'epoch': 2.29})