<!-- TABS -->
# Fine tune LLM on database

<!-- TABS -->
## Configure your production system

:::note
If you would like to use the production features 
of SuperDuperDB, then you should set the relevant 
connections and configurations in a configuration 
file. Otherwise you are welcome to use "development" mode 
to get going with SuperDuperDB quickly.
:::

In [None]:
import os

# Create the hidden project-local directory and register the config file path
# inside it through the environment.
os.makedirs(name='.pinnacledb', exist_ok=True)
os.environ.update({'pinnacleDB_CONFIG': '.pinnacledb/config.yaml'})

In [None]:
# <tab: MongoDB Community>
# YAML configuration text: MongoDB data backend on 127.0.0.1:27017, a local
# filesystem artifact store, and cluster service URIs on 127.0.0.1.
CFG = """
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: ray://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
"""

In [None]:
# <tab: MongoDB Atlas>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`). A `mongodb+srv://` URI must not
# contain a port: the port is resolved from the DNS SRV record.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
        type: native
data_backend: mongodb+srv://<user>:<password>@<mongo-host>/documents
'''

In [None]:
# <tab: SQLite>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`).
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: sqlite://<path-to-db>.db
'''

In [None]:
# <tab: MySQL>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`).
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mysql://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Oracle>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`).
# NOTE(review): `mssql://` is the Microsoft SQL Server URI scheme, not an
# Oracle one - confirm whether this tab should be labelled MSSQL.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mssql://<user>:<password>@<host>:<port>
'''

In [None]:
# <tab: PostgreSQL>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`).
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: postgres://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Snowflake>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`). Metadata is kept in a separate
# SQLite database.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: snowflake://<user>:<password>@<account>/<database>
'''

In [None]:
# <tab: Clickhouse>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the same schema as the MongoDB Community example
# (`data_backend`, `cluster.compute.uri`). Metadata is kept in a separate
# SQLite database.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: clickhouse://<user>:<password>@<host>:<port>
'''

In [None]:
# Persist the chosen CFG text to the path registered in the environment,
# overwriting any existing file.
with open(os.environ['pinnacleDB_CONFIG'], mode='w') as f:
    print(CFG, end='', file=f)

<!-- TABS -->
## Start your cluster

:::note
Starting a SuperDuperDB cluster is useful in production and model development
if you want to enable scalable compute, access to the models by multiple users for collaboration, 
and monitoring.

If you don't need this, then it is simpler to start in development mode.
:::

In [None]:
# <tab: Experimental Cluster>
!python -m pinnacledb local-cluster up

In [None]:
# <tab: Docker-Compose>
!make build_sandbox
!make testenv_init

<!-- TABS -->
## Connect to SuperDuperDB

:::note
Note that this is only relevant if you are running SuperDuperDB in development mode.
Otherwise refer to "Configuring your production system".
:::

In [None]:
# <tab: MongoDB>
from pinnacledb import pinnacle

# Create the `db` handle from a MongoDB URI (localhost:27017, database `documents`).
db = pinnacle('mongodb://localhost:27017/documents')

In [None]:
# <tab: SQLite>
from pinnacledb import pinnacle
# Create the `db` handle from a SQLite URI naming the file `my_db.db`.
db = pinnacle('sqlite://my_db.db')

In [None]:
# <tab: MySQL>
from pinnacledb import pinnacle

# Connection settings; adjust these to match your MySQL server.
host = 'localhost'
port = 3306
database = 'test_db'
user = 'pinnacle'
password = 'pinnacle'

# Assemble the connection URI from the settings above and open the handle.
db = pinnacle(
    f"mysql://{user}:{password}@{host}:{port}/{database}"
)

In [None]:
# <tab: Oracle>
from pinnacledb import pinnacle

# Connection settings; adjust these to match your server.
# NOTE(review): the `mssql://` scheme used below is the Microsoft SQL Server
# scheme, not an Oracle one - confirm the tab label.
host = 'localhost'
port = 1433
user = 'sa'
password = 'pinnacle#1'

# Assemble the connection URI from the settings above and open the handle.
db = pinnacle(
    f"mssql://{user}:{password}@{host}:{port}"
)

In [None]:
# <tab: PostgreSQL>
!pip install psycopg2
from pinnacledb import pinnacle

user = 'postgres'
password = 'postgres'
port = 5432
host = 'localhost'
database = 'test_db'
db_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"

db = pinnacle(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))

In [None]:
# <tab: Snowflake>
from pinnacledb import pinnacle

# Connection settings; replace with your own Snowflake credentials.
account = "XXXX-XXXX"  # ORGANIZATIONID-USERID
database = "FREE_COMPANY_DATASET/PUBLIC"
user = "pinnacleuser"
password = "pinnaclepassword"

snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"

# Metadata is stored separately in a SQLite database.
db = pinnacle(snowflake_uri, metadata_store='sqlite:///your_database_name.db')

In [None]:
# <tab: Clickhouse>
from pinnacledb import pinnacle

# Connection settings; adjust these to match your ClickHouse server.
user = 'default'
password = ''
port = 8123
host = 'localhost'

# Metadata is stored separately under a `mongomock://` URI. That URI is a
# plain string: the original `f` prefix had no placeholders to format.
db = pinnacle(
    f"clickhouse://{user}:{password}@{host}:{port}",
    metadata_store='mongomock://meta',
)

In [None]:
# <tab: DuckDB>
from pinnacledb import pinnacle

# Create the `db` handle from a DuckDB URI naming the file `mydb.duckdb`.
db = pinnacle('duckdb://mydb.duckdb')

In [None]:
# <tab: Pandas>
from pinnacledb import pinnacle

# Create the `db` handle from a list of CSV paths, with metadata stored under
# a `mongomock://` URI. That URI is a plain string: the original `f` prefix
# had no placeholders to format.
db = pinnacle(['my.csv'], metadata_store='mongomock://meta')

In [None]:
# <tab: MongoMock>
from pinnacledb import pinnacle

# Create the `db` handle from a `mongomock://` URI naming the database `test_db`.
db = pinnacle('mongomock:///test_db')

## Install related dependencies

In [None]:
!pip install transformers torch accelerate trl peft datasets

<!-- TABS -->
## Get LLM Finetuning Data

The following are examples of training data in different formats.

In [None]:
# <tab: Text>
from datasets import load_dataset
from pinnacledb.base.document import Document

dataset_name = "timdettmers/openassistant-guanaco"
dataset = load_dataset(dataset_name)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

# Wrap each example in a Document tagged with the fold it belongs to.
train_documents = [
    Document(dict(example, _fold="train")) for example in train_dataset
]
eval_documents = [
    Document(dict(example, _fold="valid")) for example in eval_dataset
]

# One list holding both folds, training documents first.
datas = [*train_documents, *eval_documents]

In [None]:
# <tab: Prompt-Response>
from datasets import load_dataset
from pinnacledb.base.document import Document

dataset_name = "mosaicml/instruct-v3"
dataset = load_dataset(dataset_name)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

# Wrap each example in a Document tagged with the fold it belongs to.
train_documents = [
    Document(dict(example, _fold="train")) for example in train_dataset
]
eval_documents = [
    Document(dict(example, _fold="valid")) for example in eval_dataset
]

# One list holding both folds, training documents first.
datas = [*train_documents, *eval_documents]

In [None]:
# <tab: Chat>
from datasets import load_dataset
from pinnacledb.base.document import Document
dataset_name = "philschmid/dolly-15k-oai-style"
# Only the "train" split is loaded here, so an evaluation split is carved out
# of it. The first positional argument of `train_test_split` is `test_size`,
# so it is passed by name: 10% is held out and 90% is kept for training.
dataset = load_dataset(dataset_name)['train'].train_test_split(test_size=0.1)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Wrap each example in a Document tagged with the fold it belongs to.
train_documents = [
    Document({**example, "_fold": "train"})
    for example in train_dataset
]
eval_documents = [
    Document({**example, "_fold": "valid"})
    for example in eval_dataset
]

# One list holding both folds, training documents first.
datas = train_documents + eval_documents

We can define different training parameters to handle this type of data.

In [None]:
# <tab: Text>
# No transformation is applied after extracting data from the database.
transform = None
# A single field name (a plain string, not a tuple).
key = 'text'
training_kwargs = {"dataset_text_field": "text"}

In [None]:
# <tab: Prompt-Response>
# Transformation applied after extracting data from the database
def transform(prompt, response):
    """Join prompt and response, append the literal "</s>", and return it under the 'text' key."""
    text = "".join((prompt, response, "</s>"))
    return {'text': text}


# The two fields named here match the parameters of `transform`.
key = ('prompt', 'response')
training_kwargs = {"dataset_text_field": "text"}

In [None]:
# <tab: Chat>
# No transformation is applied after extracting data from the database.
transform = None

# A single field name (a plain string, not a tuple); no extra training kwargs.
key = 'messages'
training_kwargs = None

Example input_text and output_text

In [None]:
# <tab: Text>
data = datas[0]
# Split at the last "### Assistant: " marker: everything before it (marker
# included) is the input, what follows is the response.
input_text, output_text = data["text"].rsplit("### Assistant: ", maxsplit=1)
input_text = input_text + "### Assistant: "
# Keep only the part of the response before any "### Human:" marker.
output_text = output_text.partition("### Human:")[0]
print("Input: --------------", input_text, sep="\n")
print("Response: --------------", output_text, sep="\n")

In [None]:
# <tab: Prompt-Response>
# Show the prompt and response of the first document.
data = datas[0]
input_text, output_text = data["prompt"], data["response"]
print("Input: --------------", input_text, sep="\n")
print("Response: --------------", output_text, sep="\n")

In [None]:
# <tab: Chat>
# Use every message but the last as input; the last message's content is the response.
data = datas[0]
messages = data["messages"]
input_text, output_text = messages[:-1], messages[-1]["content"]
print("Input: --------------", input_text, sep="\n")
print("Response: --------------", output_text, sep="\n")

<!-- TABS -->
## Insert simple data

With auto_schema turned on, we can insert data directly: pinnacledb automatically analyzes the data types and constructs the matching table and datatypes.

In [None]:
from pinnacledb import Document

table_or_collection = db['documents']

# Wrap every record in a Document, insert them all in one query and keep the
# result in `ids`.
ids = db.execute(
    table_or_collection.insert(list(map(Document, datas)))
)
# Query over the same table/collection, kept in `select` for later use.
select = table_or_collection.select()

## Select a Model

In [None]:
# Base model to fine-tune, with (empty) extra keyword arguments for the model
# and the tokenizer.
model_name = "facebook/opt-125m"
model_kwargs = {}
tokenizer_kwargs = {}

# Alternative that passes an access token to both the model and the tokenizer:
# model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# token = "hf_xxxx"
# model_kwargs = dict(token=token)
# tokenizer_kwargs = dict(token=token)

<!-- TABS -->
## Build A Trainable LLM

**Create an LLM Trainer for training**

The parameters of this LLM Trainer are basically the same as `transformers.TrainingArguments`, but some additional parameters have been added for easier training setup.

In [None]:
from pinnacledb.ext.transformers import LLM, LLMTrainer

trainer = LLMTrainer(
    identifier="llm-finetune-trainer",
    # Training data: the query, the field(s) to read and the optional transform.
    select=select,
    key=key,
    transform=transform,
    training_kwargs=training_kwargs,
    max_seq_length=512,
    # Output location and checkpoint retention.
    output_dir="output/finetune",
    overwrite_output_dir=True,
    save_steps=100,
    save_total_limit=3,
    # Epochs, logging and step-based evaluation.
    num_train_epochs=3,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    # Batch sizing.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
)

In [None]:
# <tab: Lora>
# Switch on the trainer's LoRA flag.
trainer.use_lora = True

In [None]:
# <tab: QLora>
# Switch on the trainer's LoRA flag and set `bits` to 4.
trainer.use_lora = True
trainer.bits = 4

In [None]:
# <tab: Deepspeed>
!pip install deepspeed
deepspeed = {
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "zero_optimization": {
        "stage": 2,
    },
}
trainer.use_lora = True
trainer.bits = 4
trainer.deepspeed = deepspeed

In [None]:
# <tab: Multi-GPUS>
# Switch on the LoRA flag, set `bits` to 4 and request two GPUs.
trainer.use_lora = True
trainer.bits = 4
trainer.num_gpus = 2

Create a trainable LLM and add it to the database; the training task will then run automatically.

In [None]:
# Bundle the base model, its loading options and the trainer into one LLM
# component, then register it with the database.
llm = LLM(
    identifier="llm",
    trainer=trainer,
    model_name_or_path=model_name,
    model_kwargs=model_kwargs,
    tokenizer_kwargs=tokenizer_kwargs,
)

db.apply(llm)

## Load the trained model
There are two methods to load a trained model:

- **Load the model directly**: This will load the model with the best metrics (if the transformers' best model save strategy is set) or the last version of the model.
- **Use a specified checkpoint**: This method downloads the specified checkpoint, then initializes the base model, and finally merges the checkpoint with the base model. This approach supports custom operations such as resetting flash_attentions, model quantization, etc., during initialization.

In [None]:
# <tab: Load Trained Model Directly>
# Load the "model" component stored under the identifier "llm".
llm = db.load("model", "llm")

In [None]:
# <tab: Use a specified checkpoint>
from pinnacledb.ext.transformers import LLM, LLMTrainer

# Take the last entry listed by db.show("checkpoint") as the experiment.
experiment_id = db.show("checkpoint")[-1]
version = None  # None means the last checkpoint
checkpoint = db.load("checkpoint", experiment_id, version=version)

# Rebuild the LLM on the base model with the checkpoint passed as adapter.
llm = LLM(
    identifier="llm",
    model_name_or_path=model_name,
    model_kwargs={"load_in_4bit": True},
    adapter_id=checkpoint,
)

In [None]:
# Run a single prediction on the example input prepared earlier, passing max_new_tokens=200.
llm.predict_one(input_text, max_new_tokens=200)