<a href="https://colab.research.google.com/github/ohmreborn/AiBuilder-2023-project/blob/main/xglm-564M/finetune_xglm_564m_single_gpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install gdown

In [None]:
!pip install datasets

In [None]:
import json
import os
from typing import Union,List
import sys

import torch
from transformers import XGLMTokenizer, XGLMForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset


In [None]:
def load_model(base_model: str = "facebook/xglm-564M",
               torch_dtype: torch.dtype = torch.float16):
    """Load the XGLM tokenizer and causal-LM weights for ``base_model``.

    Args:
        base_model: Model id or local path handed to ``from_pretrained``.
        torch_dtype: dtype the weights are loaded in. Defaults to
            ``torch.float16``, the value that used to be hard-coded here.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # NOTE(review): later cells optimise these weights directly with AdamW at
    # lr=3e-7. With float16 parameters an update that small is probably below
    # the rounding step of most weights, so training may be a no-op. Consider
    # `load_model(torch_dtype=torch.float32)` for fine-tuning -- confirm the
    # GPU memory budget first.
    tokenizer = XGLMTokenizer.from_pretrained(base_model)
    model = XGLMForCausalLM.from_pretrained(base_model, torch_dtype=torch_dtype)
    return model, tokenizer


model, tokenizer = load_model()

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/4.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/276 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
# Chat role markers added to the tokenizer vocabulary.
new_tokens = ['<human>:', '<bot>:']
tokenizer.add_tokens(new_tokens)

# Resize the embedding matrix to the new vocabulary size; kept as the last
# expression so the resulting Embedding module is displayed.
model.resize_token_embeddings(len(tokenizer))

Embedding(256010, 1024)

In [None]:
import gdown

# Google Drive download link for the training data, split into its query
# parameters for readability (the concatenated string is unchanged).
url = (
    'https://drive.google.com/uc?export=download'
    '&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU'
    '&confirm=t'
    '&uuid=d79e2e78-51de-466f-9ceb-3944606141a2'
    '&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145'
)
# Local file name the JSON-lines data is written to.
output = 'output.jsonl'
gdown.download(url=url, output=output, quiet=False)


Downloading...
From: https://drive.google.com/uc?export=download&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU&confirm=t&uuid=d79e2e78-51de-466f-9ceb-3944606141a2&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145
To: /content/output.jsonl
100%|██████████| 167M/167M [00:02<00:00, 79.4MB/s]


'output.jsonl'

In [None]:
from datasets import load_dataset

# Read the downloaded JSON-lines file with the generic 'json' loader.
dataset = load_dataset(path='json', data_files='output.jsonl')
dataset



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Background:', '<human>:', '<bot>:'],
        num_rows: 116288
    })
})

In [None]:
# Unwrap the 'train' split -- the only split in the DatasetDict loaded above.
# Note: this rebinds `dataset`, so the cell cannot be re-run on its own result.
dataset = dataset['train']
dataset

Dataset({
    features: ['Background:', '<human>:', '<bot>:'],
    num_rows: 116288
})

In [None]:
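# Optional: subsample to the first 20,000 rows for a faster run.
# NOTE: the outputs recorded in this notebook (num_rows: 20000) were produced
# with the lines below enabled; leave them commented out to use the full dataset.
# from datasets import Dataset
# dataset = Dataset.from_dict(dataset[:20_000])
# dataset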

Dataset({
    features: ['Background:', '<human>:', '<bot>:'],
    num_rows: 20000
})

In [None]:
def format_prompt(prompt):
    """Join one raw record into a single training string.

    Args:
        prompt: Mapping with the keys ``'Background:'``, ``'<human>:'`` and
            ``'<bot>:'``.

    Returns:
        ``{'prompt': text}`` where ``text`` is the background followed by the
        ``<human>:`` turn and the ``<bot>:`` turn, separated by single spaces.
    """
    background = prompt['Background:']
    question = prompt['<human>:']
    answer = prompt['<bot>:']
    text = f"{background} <human>: {question} <bot>: {answer}"
    return {'prompt': text}

In [None]:
# Replace the three raw columns with the single formatted 'prompt' column.
raw_columns = ['Background:', '<human>:', '<bot>:']
dataset = dataset.map(format_prompt, remove_columns=raw_columns)
dataset

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt'],
    num_rows: 20000
})

In [None]:
# Display the first formatted example to sanity-check the prompt layout.
dataset[0]

{'prompt': ' Tubulin in molecular biology can refer either to the tubulin protein superfamily of globular proteins or one of the member proteins of that superfamily α and βtubulins polymerize into microtubules a major component of the eukaryotic cytoskeleton Microtubules function in many essential cellular processes including mitosis Tubulinbinding drugs kill cancerous cells by inhibiting microtubule dynamics which are required for DNA segregation and therefore cell division there are six members of the tubulin superfamily although not all are present in all species Both α and β tubulins have a mass of around 50 kDa and are thus in a similar range compared to actin with a mass of 42 kDa In contrast tubulin polymers microtubules tend to be much bigger than actin filaments due to their cylindrical nature Tubulin was long thought to be specific to eukaryotes More recently however several prokaryotic proteins have been shown to be related to tubulin <human>:  What are some examples of drug

In [None]:
# format data like <sep> context <human>...<bot>...<sep>
def preprocess(prompt, max_length: int = 256):
    """Tokenize one formatted example for causal-LM fine-tuning.

    The tokenizer output is followed by an explicit EOS token so that every
    example ends with EOS (see the layout note above this function).

    Args:
        prompt: Mapping holding the text under the ``'prompt'`` key.
        max_length: Upper bound on the final sequence length, EOS included.

    Returns:
        The tokenizer encoding with ``input_ids`` and ``attention_mask``
        extended by the EOS position, plus ``labels`` (a copy of
        ``input_ids``).
    """
    data = tokenizer(
        prompt['prompt'],
        truncation=True,
        # Reserve one position for the EOS appended below; truncating at
        # `max_length` and then appending produced max_length + 1 tokens.
        max_length=max_length - 1,
        padding=False,
        return_tensors=None,
    )
    data['input_ids'].append(tokenizer.eos_token_id)
    data['attention_mask'].append(1)
    # Copy so that `labels` is not the very same list object as `input_ids`.
    data['labels'] = list(data['input_ids'])
    return data

In [None]:
# Tokenize every prompt and drop the raw text column afterwards.
dataset = dataset.map(
    preprocess,
    remove_columns=['prompt'],
)
dataset

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [None]:
# Keep only examples whose first and last token ids are the EOS id. The
# original compared against the literal 2; the tokenizer's own EOS id (the
# value `preprocess` appends) is used instead of the magic number, and both
# checks run in a single pass over the dataset.
eos_id = tokenizer.eos_token_id
dataset = dataset.filter(
    lambda x: x['input_ids'][0] == eos_id and x['input_ids'][-1] == eos_id
)
dataset

Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads each batch (padding=True) to a length that is a multiple
# of 8 and returns PyTorch tensors.
# (Original author note, translated from Thai: "not sure" about these settings.)
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

In [None]:
# sample = tokenizer('hell I am your father',
#                    truncation=True,
#                    max_length=256,
#                    padding=False,
#                    return_tensors=None)
# sample['labels'] = sample['input_ids']
# sample

In [None]:
# data_collator([sample])

In [None]:
from torch.utils.data import DataLoader

batch_size = 128       # effective batch size per optimizer step
mini_batch_size = 4    # examples per forward/backward pass
# Number of mini-batches whose gradients are accumulated before each step.
gradient_accumulation_steps = batch_size // mini_batch_size
print(gradient_accumulation_steps)
# Use `mini_batch_size` rather than a second hard-coded 4, so the loader and
# the accumulation arithmetic above cannot drift apart.
train_dataloader = DataLoader(
    dataset, shuffle=True, batch_size=mini_batch_size, collate_fn=data_collator
)
print(len(train_dataloader))
  

32
5000


In [None]:
import math

from transformers import get_scheduler

# `transformers.AdamW` is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the old transformers defaults
# (1e-6 and 0.0) because torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-7, eps=1e-6, weight_decay=0.0)
num_epochs = 3
l_train_loader = len(train_dataloader)
# Total number of mini-batches (forward/backward passes) over all epochs.
num_training_steps = num_epochs * l_train_loader
# Total number of optimizer steps: the training loop steps once per full
# accumulation window and once more for the trailing partial window of every
# epoch, i.e. ceil(len(loader) / accumulation) steps per epoch.
num_update_steps = num_epochs * math.ceil(l_train_loader / gradient_accumulation_steps)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_update_steps,
)
print(num_training_steps, num_update_steps)  # Learning Rate Schedules

15000 469




In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
print(device)

cuda


In [None]:
# with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
#     data = next(iter(train_dataloader))
#     data = {k:v.to(device) for k,v in data.items()}
#     print(model(**data))

In [None]:
# Prefix of the per-epoch checkpoint directories; the training loop appends
# a space and the epoch index to it.
output_dir_min_loss = 'xglm-checkpoint-min-loss'
# File names used inside each checkpoint directory for the optimizer and
# LR-scheduler state dicts.
OPTIMIZER_NAME = "optimizer.pt"
SCHEDULER_NAME = "scheduler.pt"

In [None]:
# train
print('start_training')
# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before fine-tuning.
model.train()
for epoch in range(num_epochs):
    for index, data in enumerate(train_dataloader):
        data = {k: v.to(device) for k, v in data.items()}
        # Only the forward pass and loss run under autocast; the backward pass
        # is executed outside the context, as the PyTorch AMP docs recommend.
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(**data)
            # Scale so the accumulated gradient is an average over the window.
            loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        # Step after every full accumulation window, and once more for the
        # trailing partial window at the end of the epoch.
        if (index + 1) % gradient_accumulation_steps == 0 or (index + 1) == l_train_loader:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            if (index + 1) % gradient_accumulation_steps**2 == 0:
                # The logged value is the loss already divided by
                # gradient_accumulation_steps.
                print(epoch, index + 1, (index + 1) % gradient_accumulation_steps, loss.item())

    # The directory name (including the space) is kept as-is because the
    # upload cells reference '/content/xglm-checkpoint-min-loss 2/...'.
    checkpoint_dir = f'{output_dir_min_loss} {epoch}'
    model.save_pretrained(checkpoint_dir)
    # The tokenizer was extended with new tokens, so store it next to the
    # weights to keep the checkpoint loadable with a matching vocabulary.
    tokenizer.save_pretrained(checkpoint_dir)
    with open(os.path.join(checkpoint_dir, 'loss.txt'), 'w') as f:
        text = f'{epoch},{index+1},{(index+1)%gradient_accumulation_steps},{loss.item()}'
        f.write(text)
    torch.save(optimizer.state_dict(), os.path.join(checkpoint_dir, OPTIMIZER_NAME))
    torch.save(lr_scheduler.state_dict(), os.path.join(checkpoint_dir, SCHEDULER_NAME))
             


start_training
0 1024 0 tensor(0.1279, device='cuda:0', grad_fn=<DivBackward0>)
0 2048 0 tensor(0.1302, device='cuda:0', grad_fn=<DivBackward0>)
0 3072 0 tensor(0.1085, device='cuda:0', grad_fn=<DivBackward0>)
0 4096 0 tensor(0.1183, device='cuda:0', grad_fn=<DivBackward0>)
1 1024 0 tensor(0.1193, device='cuda:0', grad_fn=<DivBackward0>)
1 2048 0 tensor(0.1079, device='cuda:0', grad_fn=<DivBackward0>)
1 3072 0 tensor(0.1076, device='cuda:0', grad_fn=<DivBackward0>)
1 4096 0 tensor(0.1317, device='cuda:0', grad_fn=<DivBackward0>)
2 1024 0 tensor(0.1142, device='cuda:0', grad_fn=<DivBackward0>)
2 2048 0 tensor(0.1207, device='cuda:0', grad_fn=<DivBackward0>)
2 3072 0 tensor(0.1243, device='cuda:0', grad_fn=<DivBackward0>)
2 4096 0 tensor(0.1188, device='cuda:0', grad_fn=<DivBackward0>)


In [None]:
# Marker printed once the training cell above has finished.
print('end')

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Authenticate the Colab user and build a Drive v3 API client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Upload the model weights saved after the last epoch (epoch index 2).
# The checkpoint is a binary file, so it is sent as
# 'application/octet-stream' instead of the previous 'text/plain'.
# NOTE(review): only the weights file is uploaded here; loading with
# from_pretrained() presumably also needs config.json -- confirm.
file_metadata = {
  'name': 'pytorch_model.bin',
  'mimeType': 'application/octet-stream'
}
media = MediaFileUpload('/content/xglm-checkpoint-min-loss 2/pytorch_model.bin',
                        mimetype='application/octet-stream',
                        resumable=True)
created = drive_service.files().create(body=file_metadata,
                                       media_body=media,
                                       fields='id').execute()

In [None]:
# Upload the optimizer state saved after the last epoch (epoch index 2).
# The file is written by torch.save and is binary, so it is sent as
# 'application/octet-stream' instead of the previous 'text/plain'.
file_metadata = {
  'name': 'optimizer.pt',
  'mimeType': 'application/octet-stream'
}
media = MediaFileUpload('/content/xglm-checkpoint-min-loss 2/optimizer.pt',
                        mimetype='application/octet-stream',
                        resumable=True)
created = drive_service.files().create(body=file_metadata,
                                       media_body=media,
                                       fields='id').execute()

In [None]:
# Load a model in float16 from the mounted Drive folder, using local files
# only, and display it.
checkpoint_dir = '/content/drive/MyDrive'
sample = XGLMForCausalLM.from_pretrained(
    checkpoint_dir,
    torch_dtype=torch.float16,
    local_files_only=True,
)
sample