In [22]:
#|export
import logging
import random
import subprocess

import dspy
import numpy as np
from dotenv import load_dotenv
from dspy.datasets import DataLoader
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch, LabeledFewShot

## Text to SQL with DSPy

Adapted heavily from this great notebook: [DSPy-Text2SQL.ipynb](https://github.com/jjovalle99/DSPy-Text2SQL/blob/23a0a347db2d7515c5a28c305dacaea00d09dddc/DSPy-Text2SQL.ipynb)

In [13]:
# |export
########## DSPy Config ##########
# Generator model, created through DSPy's local Ollama client.
lm = dspy.OllamaLocal(
    "open-hermes-2-4_0",
    model_type="text",
    max_tokens=2000,
)

# Second local model, kept under its own name; it is NOT installed as the default LM here.
evaluator_lm = dspy.OllamaLocal(
    "nous-llama-3-8b-instruct",
    model_type="text",
    max_tokens=4000,
)

# Register `lm` as the model DSPy modules use by default.
dspy.settings.configure(lm=lm)

Test inference with `open-hermes-2-4_0` and the quantized `nous-llama-3-8b-instruct`.

In [14]:
# Smoke test: send a raw prompt straight to the default LM and display the completion.
lm(prompt="Who is Bodhi in Point Break?")

['Bodhi is a character from the 1991 film "Point Break." He is portrayed by actor Patrick Swayze. In the movie, Bodhi is the leader of a group of surfers who turn out to be bank robbers. He is known for his charismatic and free-spirited nature, as well as his deep connection with surfing.']

In [15]:
# Smoke test: send a raw prompt straight to the evaluator LM and display the completion.
evaluator_lm(prompt="Who is Johnny Utah?")

['Johnny Utah is a fictional character played by Keanu Reeves in the 1991 action film "Point Break". He is an FBI agent who goes undercover as a surfer to investigate a group of bank robbers who are also surfers. The movie follows his adventures and battles with the group, led by the charismatic and mysterious Bodhi (played by Patrick Swayze).']

### Add Arize Phoenix Telemetry

In [16]:
########## Arize Phoenix ##########
import phoenix as px
from openinference.instrumentation.dspy import DSPyInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

In [17]:
# Launch the Phoenix app; it prints the local URL where the UI can be viewed.
phoenix_session = px.launch_app()
# HTTP endpoint the OTLP exporter below sends spans to.
endpoint = "http://localhost:6006/v1/traces"
# Tracer provider created with an empty resource (no extra attributes attached).
resource = Resource(attributes={})
tracer_provider = trace_sdk.TracerProvider(resource=resource)
# Wire the OTLP exporter into the provider through a SimpleSpanProcessor.
span_otlp_exporter = OTLPSpanExporter(endpoint=endpoint)
tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter=span_otlp_exporter))

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [18]:
# Install the provider as the global OpenTelemetry tracer provider,
# then instrument DSPy so its calls are traced.
trace_api.set_tracer_provider(tracer_provider)
DSPyInstrumentor().instrument()
# Minimal one-input / one-output signature; it appears to exist only to exercise
# the tracing setup (see the "Test traces" cell that uses it).
# NOTE(review): DSPy uses a Signature's docstring and the `desc` strings as prompt
# text, so editing them changes what is sent to the model, not just the docs.
class TestTrace(dspy.Signature):
    """
    Perform the task requested to the best of your ability.
    """

    # Free-form description of what the model should do.
    task = dspy.InputField(desc="Task to be performed.")
    # The model's response to `task`.
    answer = dspy.OutputField(desc="The answer.")

Test traces

In [20]:
# Plain prediction: one call through dspy.Predict with the TestTrace signature.
test_trace = dspy.Predict(TestTrace)
answer = test_trace(task="Say Hello.")
# Same signature wrapped in dspy.ChainOfThought.
test_trace_cot = dspy.ChainOfThought(TestTrace)
pred = test_trace_cot(task="What would Bodhi from Point Break say is the key to surfing?")
# Last expression: display the chain-of-thought answer as the cell output.
pred.answer

'According to Bodhi from Point Break, the key to surfing is not just about technical skill but also about embracing the moment, feeling the rush of adrenaline, and living life to its fullest.'

In [23]:
# Seed every RNG the notebook draws from so the sampled subsets are reproducible.
# DataLoader.sample picks examples with Python's built-in `random` module, so
# seeding NumPy alone does not pin the train/test samples or the inspected example.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

Using the [Synthetic Text to SQL](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql/viewer/default/train?row=1) dataset (`gretelai/synthetic_text_to_sql` on Hugging Face).

In [24]:
# Load the dataset from Hugging Face, keeping only the columns this notebook uses.
dl = DataLoader()
dataset = dl.from_huggingface(
    dataset_name="gretelai/synthetic_text_to_sql",  # dataset name on Hugging Face
    fields=("sql_prompt", "sql_context", "sql"),  # fields needed
    input_keys=("sql_prompt", "sql_context"),  # what the model expects to receive to generate an output
)

# Work on small random subsets: 50 training examples and 25 test examples.
trainset = dl.sample(dataset["train"], n=50)
testset = dl.sample(dataset["test"], n=25)


Downloading readme:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/32.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Verify an example of the dataset

In [27]:
# Draw one training example to inspect; with n=1 the result is a single-element
# list, which is unpacked directly into `sample`.
[sample] = dl.sample(dataset=trainset, n=1)

In [36]:
# Print every field of the sampled example under an upper-cased heading.
for field_name, field_value in sample.items():
    print(f"\n{field_name.upper()}:\n")
    print(field_value)


SQL_PROMPT:

What is the total number of eco-friendly tours in the United Kingdom and Ireland?

SQL_CONTEXT:

CREATE TABLE eco_tours (id INT, country VARCHAR(20), tours INT); INSERT INTO eco_tours (id, country, tours) VALUES (1, 'United Kingdom', 200), (2, 'Ireland', 150), (3, 'Australia', 100);

SQL:

SELECT SUM(tours) FROM eco_tours WHERE country IN ('United Kingdom', 'Ireland');


### Create Signature


In [37]:
# Signature for the text-to-SQL task: two inputs (question + context), one output (SQL).
# Field names match the dataset columns loaded above (`sql_prompt`, `sql_context`, `sql`).
# NOTE(review): DSPy uses a Signature's docstring and the `desc` strings as prompt
# text, so editing them changes what is sent to the model, not just the docs.
class TextToSql(dspy.Signature):
    """Transform a natural language query into a SQL query."""

    # The user's question in plain language.
    sql_prompt = dspy.InputField(desc="Natural language query")
    # Accompanying context; in the sampled example this is CREATE TABLE / INSERT statements.
    sql_context = dspy.InputField(desc="Context for the query")
    # The SQL statement the model should produce.
    sql = dspy.OutputField(desc="SQL query")

### Inference

Baseline Inference 

In [38]:
# Baseline predictor: a plain dspy.Predict over the TextToSql signature.
generate_sql_query = dspy.Predict(TextToSql)

In [42]:
# Run the baseline predictor on the example inspected earlier.
result = generate_sql_query(
    sql_prompt=sample["sql_prompt"],
    sql_context=sample["sql_context"],
)

# Print every field of the prediction under an upper-cased heading.
for field_name, field_value in result.items():
    print(f"\n{field_name.upper()}:\n")
    print(field_value)


SQL:

Sql Prompt: What is the total number of eco-friendly tours in the United Kingdom and Ireland?
Sql Context: CREATE TABLE eco_tours (id INT, country VARCHAR(20), tours INT); INSERT INTO eco_tours (id, country, tours) VALUES (1, 'United Kingdom', 200), (2, 'Ireland', 150), (3, 'Australia', 100);
Sql: SELECT SUM(tours) FROM eco_tours WHERE country IN ('United Kingdom', 'Ireland');


### Metric of Evaluation