In [1]:
from distilabel.models import LiteLLM
from distilabel.pipeline import Pipeline
from distilabel.steps.tasks import TextGeneration
from distilabel.steps import LoadDataFromDicts

import nest_asyncio

nest_asyncio.apply()

from pydantic import BaseModel, Field
from typing import List
from rich import print


In [2]:
# One column of a generated table, described by two plain strings.
# NOTE: pydantic exposes the class docstring below as the `description` of the
# model's JSON schema, so it is runtime text — presumably seen by the LLM via
# `structured_output`; edit it deliberately.
class Column(BaseModel):
    "A column for a table in a sqlite database"
    # Identifier of the column.
    column_name: str
    # Declared type of the column. Only checked to be a string; no set of
    # allowed type names is enforced here.
    column_type: str

# One table of a generated schema: its name plus its ordered columns.
# NOTE: pydantic exposes the class docstring below as the `description` of the
# model's JSON schema, so it is runtime text; it is kept unchanged.
class Table(BaseModel):
    "A table in a sqlite database"
    # Name of the table. Without this field the structured output can only
    # describe anonymous column lists, which cannot be turned into CREATE TABLE
    # statements or related to each other. Declared first so the serialized
    # key order is table_name, then columns.
    table_name: str
    # Columns belonging to this table, in the order the model emitted them.
    columns: List[Column]

# Top-level structured-output model: a whole database expressed as a list of
# tables. This is the class handed to the LLM as `structured_output["schema"]`
# in the pipeline cell.
# NOTE: pydantic exposes the class docstring below as the `description` of the
# model's JSON schema, so it is runtime text; edit it deliberately.
class DatabaseSchema(BaseModel):
    "A schema for a SQLite Database"
    # Every table that makes up the database.
    tables: List[Table]

In [3]:
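# Alternative LLM backends, kept for reference only; the LLM actually used is
# created inside the pipeline cell below.
# NOTE: OllamaLLM is not imported in the imports cell — add
# `from distilabel.models import OllamaLLM` before enabling the second line.
# llm = LiteLLM(model="gemini/gemini-2.0-flash", structured_output={"schema": DatabaseSchema})
# llm = OllamaLLM(model='gemma3:27b')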

In [4]:
# Natural-language requests; each one becomes the `instruction` of one input row.
examples = [
    "I'd like a database schema for an ecommerce company selling electronics. It should manage customers, orders, and inventory.",
    "I'd like a database schema for a customer support system. It should track customers, tickets, agents, and responses.",
]

# Persona prompt shared by every row. Adjacent literals are concatenated into a
# single line of text (no newlines); each sentence keeps its trailing space,
# including the last one.
system_prompt = (
    "You are an expert database architect who specializes in SQLite. "
    "You create comprehensive and accurate database designs based on customer requests. "
    "You make sure that column names are clear, and that types are specific. "
    "You make sure that relations among tables are clear by column naming. "
)

# One input record per request, pairing the shared persona with the request text.
example_messages = [
    dict(system_prompt=system_prompt, instruction=request)
    for request in examples
]

In [5]:
# Instruction-style prompt: two sentences, each terminated by a newline.
# NOTE(review): this rebinds `system_prompt` after `example_messages` has
# already been built from the earlier value, and no visible cell reads the new
# value — confirm whether it is meant to be used or should get its own name.
system_prompt = (
    "Please create a SQLite database schema, complete with tables, columns, and types.\n"
    "Make sure the schema makes sense, and that there are no tables without relations.\n"
)

In [6]:
# Two-step pipeline: load the in-memory example rows, then send each one to the
# LLM with `DatabaseSchema` as the structured-output schema.
with Pipeline(name="schema-generation", description="Generate database schemas for SQLite") as pipeline:
    # Source step: serves the records built in `example_messages`.
    dataset = LoadDataFromDicts(data=example_messages)

    # Gemini through LiteLLM, asked to answer in the shape of `DatabaseSchema`.
    llm = LiteLLM(
        model="gemini/gemini-2.0-flash",
        structured_output={"schema": DatabaseSchema},
    )

    # Generation step. The step name presumably ends up in output metadata keys
    # (the saved dataset shows `raw_input_schemaGeneration`), so keep it stable.
    task = TextGeneration(name="schemaGeneration", llm=llm, input_batch_size=4)

    # Wire the loader's output into the generation step.
    dataset >> task
    

In [None]:
# Execute the pipeline defined in the previous cell and keep whatever `run()`
# returns in `generated` (presumably a distilabel Distiset — confirm).
# NOTE(review): this cell has no recorded execution count, and the dataset
# inspected further down is read from disk rather than from `generated` —
# confirm where and how the run output gets persisted.
generated = pipeline.run()

In [10]:
from datasets import load_from_disk

In [14]:
# Read the previously saved dataset from disk and display its `train` split as
# a pandas DataFrame (the last expression is the cell's output).
saved_splits = load_from_disk("../db_schemas/default/")
saved_splits["train"].to_pandas()

Unnamed: 0,system_prompt,instruction,generation,distilabel_metadata,model_name
0,You are an expert database architect who speci...,I'd like a database schema for a library syste...,"{""tables"":[{""table_name"":""books"",""columns"":[{""...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
1,You are an expert database architect who speci...,I'd like a database schema for a hospital mana...,"{""tables"":[{""table_name"":""patients"",""columns"":...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
2,You are an expert database architect who speci...,I'd like a database schema for a university. I...,"{""tables"":[{""table_name"":""students"",""columns"":...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
3,You are an expert database architect who speci...,I'd like a database schema for a social media ...,"{""tables"":[{""table_name"":""users"",""columns"":[{""...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
4,You are an expert database architect who speci...,I'd like a database schema for a project manag...,"{""tables"":[{""table_name"":""projects"",""columns"":...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
5,You are an expert database architect who speci...,I'd like a database schema for a restaurant. I...,"{""tables"":[{""table_name"":""menu_items"",""columns...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
6,You are an expert database architect who speci...,I'd like a database schema for a hotel. It sho...,"{""tables"":[{""table_name"":""rooms"",""columns"":[{""...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
7,You are an expert database architect who speci...,I'd like a database schema for a car rental co...,"{""tables"":[{""table_name"":""vehicles"",""columns"":...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
8,You are an expert database architect who speci...,I'd like a database schema for a music streami...,"{""tables"":[{""table_name"":""songs"",""columns"":[{""...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
9,You are an expert database architect who speci...,I'd like a database schema for a delivery serv...,"{""tables"":[{""table_name"":""Orders"",""columns"":[{...",{'raw_input_schemaGeneration': [{'content': 'Y...,gemini/gemini-2.0-flash
