In [1]:
from predibase import Predibase, FinetuningConfig, DeploymentConfig
from tokenizers import Tokenizer
from datasets import load_dataset
from dotenv import load_dotenv
import pandas as pd
import csv
import os
import warnings


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
load_dotenv()
warnings.filterwarnings("ignore")
pb = Predibase(api_token=os.getenv("PREDIBASE_API_KEY"))

In [3]:
def validate_data_csv(csv_file_name):
  """ Make sure it has prompt, completion, and split with all values """
  with open(csv_file_name, 'r',  encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
      assert row['prompt']
      assert row['completion']
      assert row['split']

  return True

In [4]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_pretrained("upstage/solar-1-mini-tokenizer")

def compute_cost(csv_file_name, price_per_million_tokens=0.5):
  """ Compute the cost of the dataset """

  total_num_of_tokens = 0
  with open(csv_file_name, 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    # get all values
    values = [row['completion']+ " " + row['prompt'] for row in reader]
    for value in values:
      # tokenize
      enc = tokenizer.encode(value)
      num_of_tokens = len(enc.tokens)
      total_num_of_tokens += num_of_tokens


  return total_num_of_tokens / 1000000 * price_per_million_tokens

In [5]:
def process_csv(csv_file_name_raw, csv_file_name, max_rows=-1):
    template = {
        "prompt": """<|im_start|>user\n {question} <|im_end|>\n<|im_start|>assistant\n""",
        "completion": "{answer}<|im_end|>",
    }


    # Load the raw CSV file
    df_raw = pd.read_csv(csv_file_name_raw)

    # Prepare the new DataFrame
    df_new = pd.DataFrame()

    # Format 'question' and 'answer' according to the template and add 'split'
    df_new['prompt'] = df_raw['Question'].apply(lambda x: template['prompt'].format(question=x))
    df_new['completion'] = df_raw['Answer'].apply(lambda x: template['completion'].format(answer=x))
    df_new['split'] = 'train'

    # Write the new DataFrame to the new CSV file, up to max_rows
    df_new[:max_rows].to_csv(f"{csv_file_name}", index=False)

In [6]:
dataset_name = "jeju"
dataset_name_csv = f"{dataset_name}.csv"

In [7]:
try:
    pb_dataset = pb.datasets.get(dataset_name)
    print(f"Dataset found: {pb_dataset}")
except RuntimeError:
    print(f"Dataset not found: {dataset_name}")
    print(f"Creating dataset: {dataset_name}")

    process_csv(f"data/raw/{dataset_name}.csv", f"data/finetune/{dataset_name_csv}")

    print(f"Dataset Validation: {validate_data_csv(f'data/finetune/{dataset_name_csv}')}")

    print(f"One step FT Cost: {compute_cost(f'data/finetune/{dataset_name_csv}')} USD")
    
    print("Uploading dataset...")
    
    pb_dataset = pb.datasets.from_file(file_path=f"data/finetune/{dataset_name_csv}", name=dataset_name)

Dataset not found: jeju
Creating dataset: jeju
Dataset Validation: True
One step FT Cost: 0.002407 USD
Uploading dataset...


In [8]:
pb_dataset

Dataset(uuid='0aea382e-547c-48f3-9b6c-71b7e00046fd', name='jeju', connection_type='file', connection_name='file_uploads', status='connected')

In [9]:
# Create an adapter repository
repo_name = f"news-{dataset_name}-model"
repo = pb.repos.create(name=repo_name, description="", exists_ok=True)
print(repo)

uuid='441a6eaf-c5ab-4787-9d61-174628cc3908' name='news-jeju-model' description=''


In [10]:
# Start a fine-tuning job, blocks until training is finished
adapter = pb.adapters.create(
    config=FinetuningConfig(
        base_model="solar-1-mini-chat-240612",
        epochs=3,
        rank=1, # default: 16
    ),
    dataset=pb_dataset, # Also accepts the dataset name as a string
    repo=repo,
    description="Finetune with database_system dataset",
)

Successfully requested finetuning of solar-1-mini-chat-240612 as `news-jeju-model/2`. (Job UUID: 92efa625-9718-4c3b-860b-9d661689936d).

Watching progress of finetuning job 92efa625-9718-4c3b-860b-9d661689936d. This call will block until the job has finished. Canceling or terminating this call will NOT cancel or terminate the job itself.

Job is starting. Total queue time: 0:00:46         
Waiting to receive training metrics...

┌────────────┬────────────┬─────────────────┐
│ checkpoint [0m│ train_loss [0m│ validation_loss [0m│
├────────────┼────────────┼─────────────────┤
│     1      [0m│   2.0599   [0m│        --       [0m│
│     2      [0m│   2.3026   [0m│        --       [0m│
│     3      [0m│   2.0130   [0m│        --       [0m│
└────────────┴────────────┴─────────────────┘
