<!-- TABS -->
# Fine tune LLM on database

<!-- TABS -->
## Configure your production system

:::note
If you would like to use the production features 
of SuperDuperDB, then you should set the relevant 
connections and configurations in a configuration 
file. Otherwise you are welcome to use "development" mode 
to get going with SuperDuperDB quickly.
:::

In [None]:
import os

# Register the config file location, then create the directory that will hold it,
# deriving the directory from the registered path so the two cannot drift apart.
os.environ['pinnacleDB_CONFIG'] = '.pinnacledb/config.yaml'
os.makedirs(os.path.dirname(os.environ['pinnacleDB_CONFIG']), exist_ok=True)

In [None]:
# <tab: MongoDB Community>
# YAML configuration text for a MongoDB data backend with a filesystem artifact store.
# Every network service URI in the `cluster` section points at 127.0.0.1.
CFG = '''
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: ray://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
'''

In [None]:
# <tab: MongoDB Atlas>
# YAML configuration template for MongoDB Atlas; replace every <placeholder> before use.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
# `mongodb+srv://` URIs resolve hosts and ports through DNS, so no port is given.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
        type: native
data_backend: mongodb+srv://<user>:<password>@<mongo-host>/documents
'''

In [None]:
# <tab: SQLite>
# YAML configuration template for a SQLite data backend; replace every <placeholder> before use.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: sqlite://<path-to-db>.db
'''

In [None]:
# <tab: MySQL>
# YAML configuration template for a MySQL data backend; replace every <placeholder> before use.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mysql://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Oracle>
# YAML configuration template using an `mssql://` data backend; replace every <placeholder> before use.
# NOTE(review): this tab is labelled "Oracle" but the URI scheme is `mssql://` — confirm which backend is intended.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mssql://<user>:<password>@<host>:<port>
'''

In [None]:
# <tab: PostgreSQL>
# YAML configuration template for a PostgreSQL data backend; replace every <placeholder> before use.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: postgres://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Snowflake>
# YAML configuration template for a Snowflake data backend with a separate SQLite
# metadata store; replace every <placeholder> before use.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: snowflake://<user>:<password>@<account>/<database>
'''

In [None]:
# <tab: Clickhouse>
# YAML configuration template for a Clickhouse data backend with a separate SQLite
# metadata store; replace every <placeholder> before use.
# The keys follow the layout of the MongoDB Community configuration in this notebook:
# `data_backend` (with underscore) and a nested `uri` under `cluster.compute`.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: clickhouse://<user>:<password>@<host>:<port>
'''

In [None]:
# Persist whichever CFG text was selected above to the path registered in the
# pinnacleDB_CONFIG environment variable, overwriting any existing file.
with open(os.environ['pinnacleDB_CONFIG'], mode='w') as config_file:
    config_file.write(CFG)

<!-- TABS -->
## Start your cluster

:::note
Starting a SuperDuperDB cluster is useful in production and model development
if you want to enable scalable compute, access to the models by multiple users for collaboration,
and monitoring.

If you don't need this, then it is simpler to start in development mode.
:::

In [None]:
# <tab: Experimental Cluster>
# Run the pinnacledb command-line module's `local-cluster up` command via the notebook shell escape.
!python -m pinnacledb local-cluster up

In [None]:
# <tab: Docker-Compose>
# Run the `testenv_image` and then the `testenv_init` make targets via the notebook shell escape.
# Assumes the working directory contains the Makefile that defines them — TODO confirm.
!make testenv_image
!make testenv_init

In [None]:
from pinnacledb import pinnacle

# No URI is passed here; presumably the connection settings come from the
# configuration file written earlier in this notebook — TODO confirm.
db = pinnacle()

<!-- TABS -->
## Connect to SuperDuperDB

:::note
Note that this is only relevant if you are running SuperDuperDB in development mode.
Otherwise refer to "Configure your production system".
:::

In [None]:
# <tab: MongoDB>
from pinnacledb import pinnacle

# Connect with a MongoDB URI: host localhost, port 27017, database `documents`.
db = pinnacle('mongodb://localhost:27017/documents')

In [None]:
# <tab: SQLite>
from pinnacledb import pinnacle
# Connect with a SQLite URI that names the file `my_db.db`.
db = pinnacle('sqlite://my_db.db')

In [None]:
# <tab: MySQL>
from pinnacledb import pinnacle

# Example connection settings; adjust them to match your MySQL server.
user = 'pinnacle'
password = 'pinnacle'
port = 3306
host = 'localhost'
database = 'test_db'

# Assemble the settings into a single `mysql://` URI and connect.
db = pinnacle(f"mysql://{user}:{password}@{host}:{port}/{database}")

In [None]:
# <tab: Oracle>
from pinnacledb import pinnacle

# NOTE(review): this tab is labelled "Oracle" but the URI below uses the `mssql://` scheme — confirm which backend is intended.
user = 'sa'
# NOTE(review): the password contains '#', which starts the fragment part of a URI;
# it may need percent-encoding — TODO confirm against the URI parser in use.
password = 'pinnacle#1'
port = 1433
host = 'localhost'

# Assemble the settings into a single `mssql://` URI (no database component) and connect.
db = pinnacle(f"mssql://{user}:{password}@{host}:{port}")

In [None]:
# <tab: PostgreSQL>
# Install the psycopg2 package into the notebook environment before connecting.
!pip install psycopg2
from pinnacledb import pinnacle

# Example connection settings; adjust them to match your PostgreSQL server.
user = 'postgres'
password = 'postgres'
port = 5432
host = 'localhost'
database = 'test_db'
db_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"

# The data backend receives the `postgres://` URI, while the metadata store receives the
# same URI with the scheme rewritten to `postgresql://` — presumably because the
# metadata store's driver only accepts that spelling; TODO confirm.
db = pinnacle(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))

In [None]:
# <tab: Snowflake>
from pinnacledb import pinnacle

# Example credentials; replace them with your own Snowflake account details.
user = "pinnacleuser"
password = "pinnaclepassword"
account = "XXXX-XXXX"  # ORGANIZATIONID-USERID
# The value after the account combines two path segments separated by '/'
# (presumably database and schema — TODO confirm).
database = "FREE_COMPANY_DATASET/PUBLIC"

snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"

# Metadata is kept in a separate local SQLite database rather than in Snowflake.
db = pinnacle(
    snowflake_uri, 
    metadata_store='sqlite:///your_database_name.db',
)

In [None]:
# <tab: Clickhouse>
from pinnacledb import pinnacle

# Example connection settings; adjust them to match your Clickhouse server.
user = 'default'
password = ''
port = 8123
host = 'localhost'

# Connect to Clickhouse for data and use a `mongomock://` URI for the metadata store.
# The metadata URI is a plain string: it has no placeholders, so no f-prefix is needed.
db = pinnacle(
    f"clickhouse://{user}:{password}@{host}:{port}",
    metadata_store='mongomock://meta',
)

In [None]:
# <tab: DuckDB>
from pinnacledb import pinnacle

# Connect with a DuckDB URI that names the file `mydb.duckdb`.
db = pinnacle('duckdb://mydb.duckdb')

In [None]:
# <tab: Pandas>
from pinnacledb import pinnacle

# Pass a list of CSV paths as the data source and use a `mongomock://` URI for the metadata store.
# The metadata URI is a plain string: it has no placeholders, so no f-prefix is needed.
db = pinnacle(['my.csv'], metadata_store='mongomock://meta')

In [None]:
# <tab: MongoMock>
from pinnacledb import pinnacle

# Connect with a `mongomock://` URI whose database name is `test_db`.
db = pinnacle('mongomock:///test_db')

<!-- TABS -->
## Get useful sample data

In [None]:
# <tab: Text>
!curl -O https://pinnacledb-public-demo.s3.amazonaws.com/text.json
import json

with open('text.json', 'r') as f:
    data = json.load(f)

<!-- TABS -->
## Setup tables or collections

In [None]:
# <tab: MongoDB>
# Note this is an optional step for MongoDB
# Users can also work directly with `DataType` if they want to add
# custom data
from pinnacledb import Schema, DataType
from pinnacledb.backends.mongodb import Collection

# Query handle for the `documents` collection, used by the insert cells further down.
table_or_collection = Collection('documents')
# Set USE_SCHEMA to True and `datatype` to a DataType instance to register a schema.
USE_SCHEMA = False
datatype = None

if USE_SCHEMA and isinstance(datatype, DataType):
    # Give the schema an explicit identifier, as the SQL setup in this notebook does,
    # so that it can be referred to by name (schema='my_schema') when inserting.
    schema = Schema(identifier='my_schema', fields={'x': datatype})
    db.apply(schema)

In [None]:
# <tab: SQL>
from pinnacledb.backends.ibis import Table
from pinnacledb import Schema, DataType
from pinnacledb.backends.ibis.field_types import dtype

datatype = "str"

# Build the schema once: column `x` uses `datatype` as-is when it is already a
# DataType instance, otherwise `datatype` is wrapped with dtype().
schema = Schema(
    identifier="schema",
    fields={
        "id": dtype("str"),
        "x": datatype if isinstance(datatype, DataType) else dtype(datatype),
    },
)

# Declare the `documents` table with that schema and register it with the database.
table_or_collection = Table('documents', schema=schema)

db.apply(table_or_collection)

<!-- TABS -->
## Insert data

To insert data, we first need a `Schema` that encodes our special `DataType` column(s) in the databackend.

In [None]:
# <tab: MongoDB>
from pinnacledb import Document

def do_insert(data, schema=None):
    """Insert every item of ``data`` as a document with the single key ``'x'``.

    Relies on the notebook globals ``db``, ``table_or_collection`` and ``datatype``.

    Args:
        data: Iterable of raw items to insert.
        schema: Optional identifier of a schema (for example ``'my_schema'``) that is
            forwarded to ``insert_many``. Defaults to ``None``, which keeps the
            schema-less behaviour of calling ``do_insert(data)``.
    """
    if schema is not None:
        # Pass the raw values and the schema identifier through to insert_many.
        documents = [Document({'x': x}) for x in data]
        db.execute(table_or_collection.insert_many(documents, schema=schema))
        return

    if datatype is None:
        documents = [Document({'x': x}) for x in data]
    else:
        # No schema is in play, so wrap each value with the datatype directly.
        documents = [Document({'x': datatype(x)}) for x in data]
    db.execute(table_or_collection.insert_many(documents))

In [None]:
# <tab: SQL>
from pinnacledb import Document

def do_insert(data):
    """Insert every item of ``data`` as a row with keys ``'id'`` and ``'x'``.

    ``'id'`` is the item's position in ``data`` rendered as a string and ``'x'`` is
    the item itself. Relies on the notebook globals ``db`` and ``table_or_collection``.
    """
    rows = []
    for position, value in enumerate(data):
        rows.append(Document({'id': str(position), 'x': value}))
    db.execute(table_or_collection.insert(rows))

In [None]:
# Insert the leading three quarters of the sample (rounded down); the remaining
# items at the end of `data` are not inserted here.
do_insert(data[:3 * len(data) // 4])

In [None]:
#<snippet: build_a_trainable_llm: Transformers>