In [1]:
import datasets
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm
import os
import json
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Authenticate with Weights & Biases before any run is created.
wandb.login()

# Process-wide settings, applied in a single update:
#   - expose only GPU 0 to CUDA
#   - W&B model-logging / watch options (consumed by the W&B integration;
#     see the W&B environment-variable docs for the accepted values)
#   - turn off tokenizers' internal parallelism
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "WANDB_LOG_MODEL": "checkpoint",
        "WANDB_WATCH": "gradients",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Target W&B entity and project used by every wandb.init call below.
wandb_entity = ""  # change here
wandb_project = ""  # change here

/bin/bash: -c: line 2: syntax error: unexpected end of file


# Model Registration

In [12]:
# Fetch the base model and tokenizer, write both into one local directory,
# and register that directory as a W&B artifact of type "model".
with wandb.init(
    entity=wandb_entity,
    project=wandb_project,
    name='foundation model upload',
    job_type='data_upload',
) as run:
    model_name = "cyberagent/open-calm-medium"
    model_directory = "./model_directory"

    # Load in float16 with automatic device placement.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Model weights and tokenizer files share the same output directory.
    for component in (model, tokenizer):
        component.save_pretrained(model_directory)

    # "/" is replaced with "-" to form the artifact name.
    artifact_name = model_name.replace("/", "-")
    model_artifact = wandb.Artifact(artifact_name, type='model')
    model_artifact.add_dir(model_directory)

    run.log_artifact(model_artifact)

[34m[1mwandb[0m: Adding directory to artifact (./model_directory)... Done. 1.3s


# Data Preparation

In [20]:
# Prompt templates used to build the training text for every dataset below.
# Each template ends with "### 応答: " (including the trailing space) so the
# expected answer can be appended directly after it.

# Template for records that carry only an instruction.
PROMPT_NO_INPUT_FORMAT = (
    "以下は、あるタスクを説明する指示です。このリクエストを適切に完了する回答を書いてください。\n"
    "### 指示: {instruction}\n"
    "### 応答: "
)

# Template for records that also carry an input/context field.
PROMPT_WITH_INPUT_FORMAT = (
    "以下は、あるタスクを説明する指示です。このリクエストを適切に完了する回答を書いてください。\n"
    "### 指示: {instruction}\n"
    "### 入力: {input}\n"
    "### 応答: "
)

In [21]:
# Build prompt/response training examples from the Japanese Dolly dataset,
# write them to a JSON file, and log that file as a W&B dataset artifact.
with wandb.init(entity=wandb_entity, project=wandb_project,  name='dolly-15k-ja upload', job_type='data_upload') as run:
    dataset = datasets.load_dataset("kunishou/databricks-dolly-15k-ja")
    dataset = dataset["train"]
    processed_data = []

    for record in tqdm(dataset):
        # Branch on the *value* of the input field, not on key membership:
        # `'input' in record` is true for every row once the column exists,
        # which would send rows with an empty input through the with-input
        # template and leave a blank "### 入力:" section in the prompt.
        if (record.get('input') or '').strip():
            source_text = PROMPT_WITH_INPUT_FORMAT.format(instruction=record['instruction'], input=record['input'])
        else:
            source_text = PROMPT_NO_INPUT_FORMAT.format(instruction=record['instruction'])

        # "text" is the full prompt followed by the answer; "label" is the answer alone.
        example_text = {"text": source_text + record['output'], "label": record['output']}
        processed_data.append(example_text)

    # ensure_ascii=False keeps the Japanese text readable in the JSON file.
    with open('train_databricks-dolly-15k-ja.json', 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=2)

    dataset_artifact = wandb.Artifact("databricks-dolly-15k-ja", type='dataset')
    dataset_artifact.add_file("train_databricks-dolly-15k-ja.json")
    run.log_artifact(dataset_artifact)

100%|██████████| 15015/15015 [00:00<00:00, 50356.20it/s]


In [22]:
# Pair every assistant message in oasst1 with its parent prompter message
# (using the Japanese translations), turn each pair into a prompt/response
# training example, and log the resulting JSON file as a W&B dataset artifact.
with wandb.init(entity=wandb_entity, project=wandb_project,name='oasst1-89k-ja upload', job_type='data_upload') as run:
    ds = datasets.load_dataset("OpenAssistant/oasst1")
    train = ds["train"].to_pandas()
    val = ds["validation"].to_pandas()

    df_origin = pd.concat([train, val], axis=0).reset_index(drop=True)

    # import japanese oasst1
    df_ja = datasets.load_dataset("kunishou/oasst1-89k-ja")
    df_ja = df_ja["train"]
    df_ja = df_ja.to_pandas()

    # merge the original oasst1 data with the Japanese translations;
    # the Japanese text replaces the original "text" column
    df = pd.merge(df_origin, df_ja[["message_id", "text_ja"]], on="message_id", how="left").copy()
    df["text"] = df["text_ja"]

    df_assistant = df[(df.role == "assistant")].copy()
    df_prompter = df[(df.role == "prompter")].copy()
    df_prompter = df_prompter.set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    # Look up each assistant message's parent prompter row in one label-based
    # selection. This replaces a row-by-row loop whose loop variable was named
    # `input`, which rebound the builtin input() for the rest of the kernel
    # session. A parent id missing from df_prompter still raises KeyError.
    parent_rows = df_prompter.loc[df_assistant["parent_id"]]
    df_assistant["instruction"] = parent_rows["text"].values
    df_assistant["parent_id"] = parent_rows["parent_id"].values

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id", "lang", "rank"]
    ].rename(columns={"message_id": "id"})

    # The merge above is a left join, so a message without a Japanese
    # translation has NaN text. Drop such pairs explicitly: they cannot be
    # formatted into a prompt and would make the filter/concatenation below fail.
    df_assistant2 = df_assistant.dropna(subset=["instruction", "output"])

    # exclude wrong data (instructions that mention 翻訳, i.e. translation)
    df_assistant2 = df_assistant2[~df_assistant2["instruction"].str.contains("翻訳")]
    processed_data = []
    for _, record in tqdm(df_assistant2.iterrows(), total=df_assistant2.shape[0]):
        source_text = PROMPT_NO_INPUT_FORMAT.format(instruction=record['instruction'])
        # "text" is the full prompt followed by the answer; "label" is the answer alone.
        example_text = {"text": source_text + record['output'], "label": record['output']}
        processed_data.append(example_text)

    with open('train_OpenAssistant_oasst1.json', 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=2)

    dataset_artifact = wandb.Artifact("OpenAssistant_oasst1", type='dataset')
    dataset_artifact.add_file("train_OpenAssistant_oasst1.json")
    run.log_artifact(dataset_artifact)

100%|██████████| 55359/55359 [00:00<00:00, 67102.43it/s]


In [23]:
# Convert the Japanese hh-rlhf dataset into prompt/response training examples,
# dump them to JSON, and log the file as a W&B dataset artifact.
with wandb.init(
    entity=wandb_entity,
    project=wandb_project,
    name='Anthropic_hh_rlfh upload',
    job_type='data_upload',
) as run:
    dataset = datasets.load_dataset("kunishou/hh-rlhf-49k-ja")["train"]

    # Every record uses the instruction-only template; "text" is the prompt
    # followed by the answer and "label" is the answer alone.
    processed_data = [
        {
            "text": PROMPT_NO_INPUT_FORMAT.format(instruction=record['instruction']) + record['output'],
            "label": record['output'],
        }
        for record in tqdm(dataset)
    ]

    with open('train_mpt_hhrlhf_49k_ja.json', 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=2)

    dataset_artifact = wandb.Artifact("Anthropic_hh_rlfh", type='dataset')
    dataset_artifact.add_file("train_mpt_hhrlhf_49k_ja.json")
    run.log_artifact(dataset_artifact)

100%|██████████| 49424/49424 [00:01<00:00, 43327.41it/s]
