In [None]:
!pip install transformers<=4.28.0
!pip install sentencepiece
!pip install gdown

In [None]:
import json
import os
from typing import Union,List
import sys

import torch
from transformers import XGLMTokenizer, XGLMForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset


In [None]:
def load_model(base_model:str="facebook/xglm-564M", torch_dtype:torch.dtype=torch.float16):
    """Load the XGLM tokenizer and causal-LM weights for ``base_model``.

    Args:
        base_model: Model identifier or path handed to ``from_pretrained``.
        torch_dtype: dtype the weights are loaded in. Defaults to
            ``torch.float16``, the value that used to be hard-coded here.
            NOTE(review): optimising float16 weights directly can be
            numerically fragile; pass ``torch.float32`` if the loss turns
            into NaN/inf.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = XGLMTokenizer.from_pretrained(base_model)
    model = XGLMForCausalLM.from_pretrained(base_model, torch_dtype=torch_dtype)
    return model, tokenizer


model, tokenizer = load_model()

In [None]:
# Speaker markers that are registered as extra vocabulary entries.
new_tokens = ['<human>:', '<bot>:']
tokenizer.add_tokens([*new_tokens])

# Resize the embedding table to the tokenizer's new length. The call is kept
# as the cell's last expression so its return value is still displayed.
model.resize_token_embeddings(len(tokenizer))

In [None]:
import gdown

# Google Drive location of the training data.
# NOTE(review): the query string carries `uuid`/`at` values that look
# session-specific - confirm the link still resolves.
url = 'https://drive.google.com/uc?export=download&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU&confirm=t&uuid=d79e2e78-51de-466f-9ceb-3944606141a2&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145'
output = 'output.jsonl'

# gdown can signal a failed download by returning None instead of raising.
# Stop here so the failure is not first noticed when the file is loaded.
downloaded_path = gdown.download(url, output, quiet=False)
if downloaded_path is None:
    raise RuntimeError(f"Failed to download {output!r} from Google Drive")
downloaded_path


In [None]:
# format data like <sep> context <human>...<bot>...
def preprocess(prompt):
    """Tokenize a raw prompt string and terminate it with the EOS token.

    NOTE: a later cell defines another ``preprocess`` that takes a dataset
    row (``prompt['prompt']``) and also sets ``labels``; that later
    definition is the one in effect when the dataset is mapped.

    This version used to end with a call to ``datacol``, a name that is not
    defined anywhere in the notebook, so every call raised ``NameError``.
    The call has been removed.

    Args:
        prompt: Text to tokenize.

    Returns:
        The tokenizer output, with ``input_ids`` extended by
        ``tokenizer.eos_token_id`` and ``attention_mask`` extended by ``1``.
    """
    data = tokenizer(
        prompt,
        truncation=True,
        max_length=256,
        padding=False,
        return_tensors=None,
    )
    data['input_ids'].append(tokenizer.eos_token_id)
    data['attention_mask'].append(1)
    return data

In [None]:
from datasets import load_dataset

# Read the file from the path the download cell wrote to (`output`) rather
# than a hard-coded absolute Kaggle path, so the two cells cannot disagree
# when the notebook runs from another working directory.
dataset = load_dataset('json', data_files=output)
dataset

In [None]:
# Keep only the 'train' split. The guard makes the cell safe to re-run: once
# `dataset` is no longer the dict-like split container, the lookup is skipped
# instead of failing.
if isinstance(dataset, dict):
    dataset = dataset['train']
dataset

In [None]:
from datasets import Dataset
# dataset = Dataset.from_dict(dataset['train'][:1000])
# dataset

In [None]:
def format_prompt(prompt):
    """Join a row's background, human and bot fields into one text field.

    Args:
        prompt: Mapping with the keys 'Background:', '<human>:' and '<bot>:'.

    Returns:
        ``{'prompt': text}`` where ``text`` is the three values, in that
        order, separated by single spaces.
    """
    background = prompt['Background:']
    human_turn = prompt['<human>:']
    bot_turn = prompt['<bot>:']
    return {'prompt': ' '.join(f"{part}" for part in (background, human_turn, bot_turn))}

In [None]:
# Replace the three raw columns with the single combined 'prompt' column.
raw_columns = ['Background:', '<human>:', '<bot>:']
dataset = dataset.map(format_prompt, remove_columns=raw_columns)
dataset

In [None]:
# format data like <sep> context <human>...<bot>...<sep>
def preprocess(prompt, max_length=256):
    """Tokenize one dataset row for causal-LM training.

    Args:
        prompt: Row with a 'prompt' text field.
        max_length: Upper bound on the returned sequence length, counting
            the EOS token appended here.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask``
        extended by one EOS position, plus ``labels`` holding an independent
        copy of ``input_ids``.
    """
    data = tokenizer(
        prompt['prompt'],
        truncation=True,
        # Truncate one token short so the EOS appended below still fits
        # inside `max_length` (previously rows could reach max_length + 1).
        max_length=max_length - 1,
        padding=False,
        return_tensors=None,
    )
    data['input_ids'].append(tokenizer.eos_token_id)
    data['attention_mask'].append(1)
    # Copy instead of aliasing so a later change to one list cannot silently
    # change the other.
    data['labels'] = list(data['input_ids'])
    return data

In [None]:
# Tokenize every row; the text column is dropped from the mapped dataset.
dataset = dataset.map(
    preprocess,
    remove_columns=['prompt'],
)
dataset

In [None]:
# Keep only rows whose token sequence both starts and ends with the boundary
# token. The id comes from the tokenizer (the same attribute `preprocess`
# appends) instead of the literal 2, and both checks run in a single pass.
# NOTE(review): assumes the tokenizer prepends this same id to every
# sequence, as the original `== 2` check on position 0 implied - confirm if
# the base model changes.
boundary_token_id = tokenizer.eos_token_id


def _is_wrapped(row):
    """Return True when the row's first and last token ids are the boundary id."""
    ids = row['input_ids']
    return ids[0] == boundary_token_id and ids[-1] == boundary_token_id


dataset = dataset.filter(_is_wrapped)
dataset

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collate function for the DataLoader: builds padded batches and returns them
# as PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,  # original author's note: "not sure"
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [None]:
# sample = tokenizer('hell I am your father',
#                    truncation=True,
#                    max_length=256,
#                    padding=False,
#                    return_tensors=None)
# sample['labels'] = sample['input_ids']
# sample

In [None]:
# data_collator([sample])

In [None]:
from torch.utils.data import DataLoader

batch_size = 128      # samples per optimizer update (after accumulation)
mini_batch_size = 4   # samples per forward/backward pass
gradient_accumulation_steps = batch_size // mini_batch_size
print(gradient_accumulation_steps)

# The loader uses `mini_batch_size` rather than a second hard-coded 4, so the
# accumulation count above always matches the real per-step batch size.
train_dataloader = DataLoader(
    dataset, shuffle=True, batch_size=mini_batch_size, collate_fn=data_collator
)
print(len(train_dataloader))
  

In [None]:
import math

from torch.optim import AdamW
from transformers import get_scheduler

# `transformers.AdamW` is deprecated in favour of the PyTorch optimizer.
# eps and weight_decay are set explicitly to the defaults the transformers
# class used, because torch's own defaults differ.
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3

# Number of micro-batches over the whole run; the scheduler is stepped once
# per accumulated group, hence the division below.
num_training_steps = num_epochs * len(train_dataloader)
num_update_steps = math.ceil(num_training_steps / gradient_accumulation_steps)

# Learning-rate schedule: linear, with 100 warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_update_steps,
)
print(num_training_steps, num_update_steps)

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

In [None]:
# with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
#     data = next(iter(train_dataloader))
#     data = {k:v.to(device) for k,v in data.items()}
#     print(model(**data))

In [None]:
# Directory the training loop writes its checkpoint to.
output_dir_min_loss = 'xglm-checkpoint-min-loss'

# File names used for the optimizer and scheduler state inside a checkpoint
# directory.
OPTIMIZER_NAME, SCHEDULER_NAME = "optimizer.pt", "scheduler.pt"

In [None]:
# Debug marker: prints a fixed string and has no other effect.
print('hello world')

In [None]:
# train
print('start_training')

# `from_pretrained` hands the model back in eval mode; switch to train mode
# so dropout is active while fine-tuning.
model.train()
num_batches = len(train_dataloader)
# Start from clean gradients even if this cell is re-run.
optimizer.zero_grad()

for epoch in range(num_epochs):
    for index, data in enumerate(train_dataloader):
        data = {k: v.to(device) for k, v in data.items()}
        # Only the forward pass and loss run under autocast; backward is
        # called outside the context, as the PyTorch AMP docs recommend.
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(**data)
            # Scaled so gradients summed over one accumulation group average
            # out. This scaled value is also what gets logged below.
            loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        is_full_group = (index + 1) % gradient_accumulation_steps == 0
        # Also step on the epoch's final batch, so a trailing partial group
        # is neither dropped nor leaked into the next epoch's first update.
        is_last_batch = (index + 1) == num_batches
        if is_full_group or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if (index + 1) % gradient_accumulation_steps**2 == 0:
                print(epoch, index + 1, (index + 1) % gradient_accumulation_steps, loss.item())

    # End-of-epoch checkpoint; it overwrites the previous epoch's files.
    model.save_pretrained(output_dir_min_loss)
    # Tokens were added to the tokenizer and the embeddings resized, so the
    # tokenizer has to be stored next to the weights for them to be reusable.
    tokenizer.save_pretrained(output_dir_min_loss)
    with open(os.path.join(output_dir_min_loss, 'loss.txt'), 'w') as f:
        text = f'{epoch},{index+1},{(index+1)%gradient_accumulation_steps},{loss.item()}'
        f.write(text)
    torch.save(optimizer.state_dict(), os.path.join(output_dir_min_loss, OPTIMIZER_NAME))
    torch.save(lr_scheduler.state_dict(), os.path.join(output_dir_min_loss, SCHEDULER_NAME))
             


In [None]:
# Final checkpoint: weights, tokenizer, last recorded loss, and the
# optimizer / scheduler state.
output_dir_latest_version = 'xglm-checkpoint-latest-version'
model.save_pretrained(output_dir_latest_version)
# Tokens were added to the tokenizer and the embeddings resized, so the
# tokenizer has to be stored next to the weights for them to be reusable.
tokenizer.save_pretrained(output_dir_latest_version)
with open(os.path.join(output_dir_latest_version, 'loss.txt'), 'w') as f:
    f.write(str(loss.item()))
torch.save(optimizer.state_dict(), os.path.join(output_dir_latest_version, OPTIMIZER_NAME))
torch.save(lr_scheduler.state_dict(), os.path.join(output_dir_latest_version, SCHEDULER_NAME))

In [None]:
# Marker printed when the notebook's last cell runs.
print('end')