In [None]:
# Install the notebook's dependencies with the %pip magic so the packages land in
# the environment of the running kernel rather than whichever `pip` is first on PATH.
# TODO(review): none of these are version-pinned (accelerate is taken from the
# repository head); pin versions once a known-good combination is established.
%pip install git+https://github.com/huggingface/accelerate
%pip install transformers
%pip install sentencepiece
%pip install datasets
%pip install gdown

In [None]:
import multiprocessing
# Number of CPU cores reported by the host; reused later as `num_proc` for the
# parallel dataset `map` / `filter` calls.
cpu_cores = multiprocessing.cpu_count()
print(cpu_cores)

In [None]:
import json
import os
from typing import Union,List
import sys

import torch
from transformers import XGLMTokenizer, XGLMForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, GenerationConfig
from datasets import load_dataset


In [None]:
# Stage marker printed to the cell output before the model is loaded.
print('load_model')

In [None]:
def load_model(base_model:str="facebook/xglm-564M",
               torch_dtype=torch.float16,
               new_tokens=('<human>:', '<bot>:')):
    """Load an XGLM tokenizer/model pair and register the chat marker tokens.

    Args:
        base_model: Hub id or local path handed to ``from_pretrained`` for both
            the tokenizer and the model.
        torch_dtype: dtype the model weights are loaded in. Defaults to
            ``torch.float16``, the value that used to be hard-coded here.
        new_tokens: Iterable of strings added to the tokenizer vocabulary; the
            default is the pair of speaker markers used by the prompt format.

    Returns:
        ``(model, tokenizer)``; the model's token embeddings are resized to
        ``len(tokenizer)`` after the new tokens are added.
    """
    tokenizer = XGLMTokenizer.from_pretrained(base_model)
    model = XGLMForCausalLM.from_pretrained(base_model,
                                           torch_dtype=torch_dtype)
    tokenizer.add_tokens(list(new_tokens))
    # The embedding matrix must cover the ids of the tokens that were just added.
    model.resize_token_embeddings(len(tokenizer))

    return model,tokenizer

# NOTE(review): the default loads fp16 weights, and this same `model` object is
# later passed to `train`, so the optimizer updates half-precision parameters
# directly. Confirm that is intended; `load_model(torch_dtype=torch.float32)` is
# available if full-precision weights are wanted for training.
model,tokenizer = load_model()

In [None]:
import gdown

# Download the training data from Google Drive and store it as `output.jsonl`
# in the current working directory.
# NOTE(review): the URL carries `confirm`, `uuid` and `at` query parameters that
# look tied to one browser session — confirm the link still resolves on a fresh run.
url = 'https://drive.google.com/uc?export=download&id=1jbbUtwgwoSQgGnXxzTh-nMReVzEU7ZTU&confirm=t&uuid=d79e2e78-51de-466f-9ceb-3944606141a2&at=AKKF8vwcgi95TGSnSQUNCKx4NTqS:1682865249145'
output = 'output.jsonl'
gdown.download(url, output, quiet=False)


In [None]:
from datasets import load_dataset
# Parse the downloaded JSON Lines file with the `json` loader. The variable is
# named `datasets`, the same as the package it comes from, and later cells keep
# rebinding this name as the data is transformed.
datasets = load_dataset('json',data_files='output.jsonl')
print(datasets)

In [None]:
def format_prompt(prompt):
    """Flatten one raw record into a single training string.

    Reads the ``'Background:'``, ``'<human>:'`` and ``'<bot>:'`` fields of
    ``prompt`` and returns ``{'prompt': text}`` where ``text`` is the background
    followed by the human turn and the bot turn, each introduced by its marker.
    """
    background = prompt['Background:']
    human_turn = prompt['<human>:']
    bot_turn = prompt['<bot>:']
    text = '{} <human>: {} <bot>: {}'.format(background, human_turn, bot_turn)
    return {'prompt': text}
# Training text layout: "<background> <human>: <question> <bot>: <answer>"
def preprocess(prompt, max_length=256):
    """Tokenize one formatted record into causal-LM training features.

    Uses the module-level ``tokenizer``. The text is tokenized without padding,
    an EOS id is appended, and ``labels`` is a copy of ``input_ids``.

    Args:
        prompt: Mapping with a ``'prompt'`` entry holding the text to encode.
        max_length: Upper bound on the number of ids in the returned example,
            counting the appended EOS.

    Returns:
        The tokenizer output extended with ``labels``; ``input_ids``,
        ``attention_mask`` and ``labels`` all have the same length.
    """
    data = tokenizer(
        prompt['prompt'],
        truncation=True,
        # Leave one position free: truncating to `max_length` and then appending
        # EOS would produce `max_length + 1` ids for long inputs.
        max_length=max_length - 1,
        padding=False,
        return_tensors=None,
    )
    data['input_ids'].append(tokenizer.eos_token_id)
    data['attention_mask'].append(1)
    # Copy so that labels and input_ids do not alias the same list.
    data['labels'] = data['input_ids'].copy()
    return data

In [None]:
# Stage marker printed to the cell output before the dataset is preprocessed.
print('preprocess')

In [None]:
# from datasets import Dataset
# datasets = Dataset.from_dict(datasets['train'][:20_000]) # sample data for test
# Step 1: take the `train` split and collapse the three raw columns of every
# record into a single `prompt` string.
datasets = datasets['train'].map(
    format_prompt,
    remove_columns=['Background:', '<human>:', '<bot>:'],
    num_proc=cpu_cores,
)
print(datasets['prompt'][0])

# Step 2: count tokens per prompt, keep prompts shorter than 255 tokens,
# tokenize them with `preprocess`, then keep only examples whose first and last
# ids are both 2.
# NOTE(review): 2 is presumably the tokenizer's `</s>` id — confirm against
# `tokenizer.eos_token_id`.
datasets = (
    datasets
    .map(lambda row: {'token': len(tokenizer.tokenize(row['prompt']))}, num_proc=cpu_cores)
    .filter(lambda row: row['token'] < 255, num_proc=cpu_cores)
    .map(preprocess, remove_columns=['prompt', 'token'], num_proc=cpu_cores)
    .filter(lambda row: row['input_ids'][0] == 2, num_proc=cpu_cores)
    .filter(lambda row: row['input_ids'][-1] == 2, num_proc=cpu_cores)
)

datasets

In [None]:
from transformers import DataCollatorForSeq2Seq
# Collator used by the training DataLoader: pads every batch (padding=True) so
# that its length is a multiple of 8 and returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True 
        )

In [None]:
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import AdamW,get_scheduler
from torch.cuda.amp import  GradScaler
import math
def train(model):
    """Fine-tune ``model`` on the module-level ``datasets`` and save it to ``checkpoint``.

    Batches are drawn from the global ``datasets`` and collated with the global
    ``data_collator``. Gradients are accumulated over
    ``batch_size // mini_batch_size`` mini-batches through
    ``accelerator.accumulate``. The forward pass runs under CUDA fp16 autocast.

    Args:
        model: Model whose forward call on a collated batch returns an object
            with a ``.loss`` attribute.

    Raises:
        Exception: Any error raised by the training loop is printed, the
            current weights are saved to ``checkpoint``, and the error is
            re-raised.
    """
    batch_size = 128
    mini_batch_size = 4
    gradient_accumulation_steps = batch_size // mini_batch_size
    num_epochs = 6

    train_dataloader = DataLoader(
        datasets, shuffle=True, batch_size=mini_batch_size, collate_fn=data_collator
    )

    accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

    # Schedule length is measured in mini-batches of the un-prepared dataloader.
    num_training_steps = num_epochs * len(train_dataloader)

    # NOTE(review): the notebook loads the model with torch_dtype=torch.float16,
    # so this optimizer updates half-precision weights and no gradient scaler is
    # used (GradScaler is imported but never applied) — confirm this is intended.
    optimizer = AdamW(model.parameters(), lr=3e-7)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=num_training_steps,
    )
    accelerator.print(num_training_steps)

    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    def _save_checkpoint():
        # Sync all processes, strip the accelerate wrapper and write the weights.
        accelerator.wait_for_everyone()
        unwrapped = accelerator.unwrap_model(model)
        unwrapped.save_pretrained(
            'checkpoint',
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save,
        )

    accelerator.print('start-training')
    try:
        for epoch in range(num_epochs):
            # Defined up front so the per-epoch summary below cannot hit an
            # unbound name when the dataloader yields no batches.
            index, loss = -1, None
            for index, data in enumerate(train_dataloader):
                with accelerator.accumulate(model):
                    # Autocast covers only the forward pass and the loss; the
                    # backward pass and optimizer step run outside the context.
                    with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                        outputs = model(**data)
                        loss = outputs.loss
                    accelerator.backward(loss)
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
            # One summary line per epoch: epoch number, mini-batches seen, last loss.
            accelerator.print(epoch, index + 1, loss)
    except Exception as e:
        accelerator.print(e)
        # Keep whatever progress was made before propagating the failure.
        _save_checkpoint()
        raise
    else:
        _save_checkpoint()



In [None]:
from accelerate import notebook_launcher
# Stage marker, then launch `train(model)` through accelerate's notebook
# launcher with one process per CUDA device reported by torch.
print('train')
notebook_launcher(train, args=(model,),num_processes=torch.cuda.device_count()) 

In [None]:
# Reload the fine-tuned weights from disk in half precision for inference.
# NOTE(review): `train` saves to the relative path 'checkpoint', so this absolute
# path assumes the working directory is /kaggle/working — confirm. Only the model
# is saved by `train`; generation below reuses the in-memory `tokenizer`.
sample = XGLMForCausalLM.from_pretrained('/kaggle/working/checkpoint', local_files_only=True,torch_dtype=torch.float16)

In [None]:
# Default decoding settings used by `gen`.
# NOTE(review): temperature / top_p / top_k are sampling parameters, but
# `do_sample` is not set here while num_beams=4 is — presumably they have no
# effect under beam search; confirm against the GenerationConfig documentation.
generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=4,
    )
# Inference runs on the CUDA device; the trailing expression displays the
# moved model in the cell output.
device = torch.device('cuda')
sample.to(device)

In [None]:
import re
def gen(prompt,generation_config=generation_config):
    """Generate a question about ``prompt`` and then an answer to that question.

    Two generation passes are run with the module-level ``sample`` model and
    ``tokenizer``:

    1. ``prompt`` followed by `` <human>:`` is completed to obtain a question.
    2. The text of the first pass (the segment after the first EOS token of
       the decoded output) followed by `` <bot>:`` is completed to obtain an
       answer.

    Args:
        prompt: Background text to condition on.
        generation_config: Decoding settings passed to ``sample.generate``.

    Returns:
        A formatted ``'<human>: ... \\n<bot>: ...'`` string when the final text
        splits into exactly three parts on the speaker markers; otherwise the
        raw list of parts produced by the split.
    """
    def _complete(text):
        # Tokenize `text`, generate up to 256 new tokens and decode the first
        # returned sequence (special tokens are kept in the decoded text).
        input_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(device)
        with torch.no_grad():
            generation_output = sample.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                max_new_tokens=256,
            )
        return tokenizer.decode(generation_output.sequences[0])

    first_pass = _complete(f'{prompt} <human>:')

    # Split on the literal EOS token (escaped so it is never read as a regex)
    # and take the segment after the first EOS. If the decoded text contains no
    # EOS at all, fall back to the whole text instead of raising IndexError.
    segments = re.split(re.escape(tokenizer.eos_token), first_pass)
    bot_prompt = segments[1] if len(segments) > 1 else segments[0]

    second_pass = _complete(f'{bot_prompt} <bot>:')

    res = re.split('<human>: | <bot>:', second_pass)
    if len(res) == 3:
        return f'<human>: {res[1]} \n<bot>: {res[2]}'
    else:
        return res

In [None]:
# Background passage used as the context for the demo generation below.
sample_prompt = """
Doraemon  is a Japanese manga series written and illustrated 
by Fujiko F. Fujio. The manga was first serialized in December 1969, 
with its 1,345 individual chapters compiled into 45 tankōbon volumes 
and published by Shogakukan from 1970 to 1996. The story revolves around 
an earless robotic cat named Doraemon, who travels back in time from the 
22nd century to aid a boy named Nobita Nobi."""

In [None]:
# Run the two-pass generation on the sample passage. `gen` returns either a
# formatted string or a list of split parts; both are printed as-is.
response = gen(sample_prompt)
print(response)