In [1]:
import bitsandbytes
import os
import wandb
import json
import tqdm
from datasets import Dataset, concatenate_datasets, load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn as nn
from peft import LoraConfig
import wandb
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!wandb.login()
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"] = "gradients"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb=1024"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
wandb_entity="wandb-japan"
wandb_project="llm-finetuning"

/bin/bash: -c: line 2: syntax error: unexpected end of file


In [3]:
config = {
    "BASE_MODEL":"cyberagent/open-calm-large",
    "model_artifacts": 'wandb-japan/llm-finetuning/cyberagent-open-calm-medium:v0', # change
    "tuning_data_artifacts": {"Anthropic_hh_rlfh":'wandb-japan/llm-finetuning/Anthropic_hh_rlfh:v0',
                              "OpenAssistant_oasst1":"wandb-japan/llm-finetuning/OpenAssistant_oasst1:v0",
                              "databricks-dolly-15k-ja":"wandb-japan/llm-finetuning/databricks-dolly-15k-ja:v0"},
    "max_seq_length":1024,
    "lora_config":{
        "r":8,
        "lora_alpha":16,
        "target_modules":["query_key_value"],
        "lora_dropout":.1,
        "bias":"none",
        "task_type":"CAUSAL_LM"
    },
    "training_args":{
        "dataloader_num_workers":2,
        "evaluation_strategy":"steps",
        "per_device_train_batch_size":8,
        "gradient_accumulation_steps":2,
        "warmup_ratio": 0.1, 
        "warmup_steps":5,
        "num_train_epochs":1,
        #"max_steps": 100,
        "learning_rate":1e-4,
        "fp16":True,
        "logging_steps":10,
        "save_steps":1000,
        "output_dir":'./outputs',
        "report_to":"wandb"
    }
}

In [4]:
with wandb.init(entity=wandb_entity, project=wandb_project,  config=config, job_type='finetuning') as run:
    model_name = run.config["BASE_MODEL"]
    artifact = run.use_artifact(run.config["model_artifacts"], type='model')
    model_dir = artifact.download()
    torch.cuda.empty_cache()
    tokenizer = AutoTokenizer.from_pretrained(model_dir,use_fast=True,trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", use_cache=False)

    lora_config =  LoraConfig(**wandb.config["lora_config"])


    # cast the small parameters (e.g. layernorm) to fp32 for stability
    for param in model.parameters():
        param.requires_grad = False # freeze the model - train adapters later
        if param.ndim == 1:
            param.data = param.data.to(torch.float32)
    model.gradient_checkpointing_enable()  # reduce number of stored activations
    model.enable_input_require_grads()
    class CastOutputToFloat(nn.Sequential):
        def forward(self, x): return super().forward(x).to(torch.float32)
    model.embed_out = CastOutputToFloat(model.embed_out)

    datasets = []
    # load Anthropic_hh_rlfh
    dataset_artifacts=run.use_artifact(run.config["tuning_data_artifacts"]["Anthropic_hh_rlfh"], type='dataset')
    dataset_artifacts_folder=dataset_artifacts.download()
    dataset = load_dataset('json', data_files=dataset_artifacts_folder+'/train_mpt_hhrlhf_49k_ja.json')
    dataset = dataset["train"]
    dataset = dataset.select_columns("text")
    datasets.append(dataset)
    # load OpenAssistant_oasst1
    dataset_artifacts=run.use_artifact(run.config["tuning_data_artifacts"]["OpenAssistant_oasst1"], type='dataset')
    dataset_artifacts_folder=dataset_artifacts.download()
    dataset = load_dataset('json', data_files=dataset_artifacts_folder+'/train_OpenAssistant_oasst1.json')
    dataset = dataset["train"]
    dataset = dataset.select_columns("text")
    datasets.append(dataset)

    # load databricks-dolly-15k-ja
    dataset_artifacts=run.use_artifact(run.config["tuning_data_artifacts"]["databricks-dolly-15k-ja"], type='dataset')
    dataset_artifacts_folder=dataset_artifacts.download()
    dataset = load_dataset('json', data_files=dataset_artifacts_folder+'/train_databricks-dolly-15k-ja.json')
    dataset = dataset["train"]
    dataset = dataset.select_columns("text")
    datasets.append(dataset)
    eot_token = tokenizer.eos_token

    for i in range(len(datasets)):
        datasets[i] = datasets[i].filter(lambda example: len(example["text"]) <= 1000)
        datasets[i] = datasets[i].map(lambda example: {"text": example["text"] + eot_token})
    
    train_dataset = concatenate_datasets(datasets)
    train_dataset = train_dataset.train_test_split(test_size=0.1)

    #instruction_ids = tokenizer.encode("\n\n### 指示:\n", add_special_tokens=False)[1:]
    #response_ids = tokenizer.encode("\n\n### 応答:\n", add_special_tokens=False)[1:]
    instruction_template = "### 指示:"
    response_template = "### 応答:"
    collator = DataCollatorForCompletionOnlyLM(
        instruction_template=instruction_template,response_template=response_template, tokenizer=tokenizer
    )

    trainer = SFTTrainer(
        model,
        args=transformers.TrainingArguments(**wandb.config["training_args"]),
        tokenizer=tokenizer,
        train_dataset=train_dataset["train"],
        eval_dataset=train_dataset["test"],
        peft_config=lora_config,
        dataset_text_field="text",
        max_seq_length=run.config["max_seq_length"],
        data_collator=collator,
    )
    trainer.train()
    trainer.save_model()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkeisuke-kamata[0m ([33mwandb-japan[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact cyberagent-open-calm-medium:v0, 783.23MB. 6 files... 
[34m[1mwandb[0m:   6 of 6 files downloaded.  
Done. 0:0:1.0
[34m[1mwandb[0m: Downloading large artifact Anthropic_hh_rlfh:v0, 68.05MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.3
[34m[1mwandb[0m: Downloading large artifact OpenAssistant_oasst1:v0, 114.69MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.3
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Map: 100%|██████████| 102592/102592 [00:19<00:00, 5188.59 examples/s]
Map: 100%|██████████| 11400/11400 [00:02<00:00, 5064.43 examples/s]
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` me

Step,Training Loss,Validation Loss
10,3.4348,3.440608
20,3.4554,3.413065
30,3.4856,3.393063
40,3.386,3.381095
50,3.3342,3.371619
60,3.4358,3.364838
70,3.3343,3.359581
80,3.4082,3.355439
90,3.3545,3.351737
100,3.3022,3.348532


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenize

0,1
eval/loss,█▆▅▄▃▃▃▂▂▂▂▁▁▁▁
eval/runtime,█▁▁▁▁▁▁▂▂▂▂▂▁▂▂
eval/samples_per_second,▁██████▇▇▇▇▇█▇▇
eval/steps_per_second,▁██████▇▇▇▇▇█▇▇
train/epoch,▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅█████████████
train/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/loss,▆▇█▄▂▆▂▅▃▁▄▅▄▂▂▆

0,1
eval/loss,3.3359
eval/runtime,53.255
eval/samples_per_second,214.064
eval/steps_per_second,26.758
train/epoch,0.02
train/global_step,160.0
train/learning_rate,0.0001
train/loss,3.4311


KeyboardInterrupt: 