# SQL Interp Datasets Notebook

In [36]:
%%capture
#!pip install git+https://github.com/gretelai/gretel-python-client@dev/data-designer-m1
!pip install git+https://github.com/gretelai/gretel-python-client

session_kwargs = {
    "api_key": "prompt",
    "endpoint": "https://api-dev.gretel.cloud",
    "cache": "yes",
}

In [37]:
from gretel_client.navigator import DataDesigner
from datasets import Dataset
import yaml

### 📘 Text-to-SQL Blueprint

In [38]:
# Read the blueprint file as raw text. The encoding is given explicitly so the
# file decodes the same way on every platform instead of using the locale default.
with open('cs11.yaml', 'r', encoding='utf-8') as blueprint_file:
    blueprint_string = blueprint_file.read()

In [39]:
# Build a DataDesigner from the blueprint text, forwarding the session settings.
# NOTE(review): the saved output of this cell includes the logged-in account
# email — clear outputs before sharing the notebook.
designer = DataDesigner.from_config(
    blueprint_string,
    **session_kwargs,
)
# Bare expression so the notebook renders the designer's summary.
designer

Found cached Gretel credentials
Using endpoint https://api-dev.gretel.cloud
Logged in as dhruv@gretel.ai ✅
[05:03:26] [INFO] 🦜 Using apache-2.0 model suite


DataDesigner(
    categorical_seed_columns: ['topic', 'instruction_phrase']
    generated_data_columns: ['table_name', 'column_name', 'column_data_type', 'sql_prompt', 'sql_context', 'sql']
    validator: code:ansi
    evaluator: text_to_sql
)

### 👀 Generating a dataset preview

In [40]:
# Run the blueprint's steps on a small sample and keep the result for inspection
# (the saved run produced 10 records).
preview = designer.generate_dataset_preview()

[05:03:26] [INFO] 🚀 Generating dataset preview
[05:03:27] [INFO] 🦜 Step 1: Generate seed category values
[05:03:29] [INFO] 🎲 Step 2: Sample data seeds
[05:03:29] [INFO] 🦜 Step 3: Generate column from template >> generating table name
[05:03:30] [INFO] 🦜 Step 4: Generate column from template >> generating column name
[05:03:32] [INFO] 🦜 Step 5: Generate column from template >> generating column data type
[05:03:33] [INFO] 🦜 Step 6: Generate column from template >> generating sql prompt
[05:03:35] [INFO] 🦜 Step 7: Generate column from template >> generating sql context
[05:03:36] [INFO] 🦜 Step 8: Generate column from template >> generating sql
[05:03:38] [INFO] 🔍 Step 9: Validate code
[05:03:40] [INFO] ⚖️ Step 10: Judge with llm
[05:03:49] [INFO] 🧐 Step 11: Evaluate dataset
[05:03:49] [INFO] 👀 Your dataset preview is ready for a peek!


In [41]:
# Show one generated record in detail.
# NOTE(review): index 5 is presumably an arbitrary pick among the preview rows — confirm.
preview.display_sample_record(index=5)

In [42]:
# Bare expression so the notebook renders the preview records as a table.
preview.output

Unnamed: 0,topic,instruction_phrase,table_name,column_name,column_data_type,sql_prompt,sql_context,sql,sql_context_is_valid,sql_context_validator_messages,sql_is_valid,sql_validator_messages,judged_by_llm,text_to_sql_llm_judge_results
0,customers,Compose a report on,customers_info,customer_id,INT,Compose a report on customer_id from customers...,CREATE TABLE customers_info (\n customer_id...,SELECT customer_id FROM customers_info;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
1,human_resources,Generate a chart for,human_resources_employees,employee_id,INT,Generate a chart for employee_id from human_re...,CREATE TABLE human_resources_employees (\n ...,SELECT employee_id FROM human_resources_employ...,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
2,inventory,Retrieve the,inventory_items,inventory_item_id,INT,Retrieve the inventory_item_id from the invent...,CREATE TABLE inventory_items (\n inventory_...,SELECT inventory_item_id FROM inventory_items;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
3,customers,Create a summary for,customers_info,customer_id,INT,Create a summary for customer_id from customer...,CREATE TABLE customers_info (\n customer_id...,SELECT customer_id FROM customers_info;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
4,customers,Extract the,customers_info,customer_id,INT,Extract the customer_id from customers_info,CREATE TABLE customers_info (\n customer_id...,SELECT customer_id FROM customers_info;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
5,orders,Create a summary for,orders_table,order_id,INT,Create a summary for the order_id from the ord...,CREATE TABLE orders_table (\n order_id INT\n);,SELECT order_id FROM orders_table;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
6,finance,Retrieve the,finance_data,finance_amount,DECIMAL,Retrieve the finance_amount from finance_data,CREATE TABLE finance_data (\n finance_amoun...,SELECT finance_amount FROM finance_data;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
7,human_resources,Develop a questionnaire for,human_resources_employees,employee_id,INT,Develop a questionnaire for selecting employee...,CREATE TABLE human_resources_employees (\n ...,SELECT employee_id FROM human_resources_employ...,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
8,human_resources,Draft a letter to,human_resources_employees,employee_id,INT,Draft a letter to select employee_id from huma...,CREATE TABLE human_resources_employees (\n ...,SELECT employee_id FROM human_resources_employ...,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."
9,customers,Provide the,customers_info,customers_id,INT,Provide the customers_id from customers_info,CREATE TABLE customers_info (\n customers_i...,SELECT customers_id FROM customers_info;,True,[],True,[],True,"{'relevance': {'score': 4, 'reasoning': 'The S..."


In [None]:
# Convert the preview DataFrame into a `Dataset` and publish it to the Hub.
# NOTE(review): presumably requires an authenticated Hub session with write
# access to this repository — confirm before running.
dataset_preview = Dataset.from_pandas(preview.output)
dataset_preview.push_to_hub("dhruvnathawani/cs11-preview")

### 🤔 Like what you see? Generate an entire dataset

In [44]:
# Submit a batch workflow that generates 500 records; the returned handle is
# used afterwards to fetch the dataset and the evaluation report.
results = designer.submit_batch_workflow(num_records=500)

[05:03:50] [INFO] ⚙️ Configuring Data Designer Workflow steps:
[05:03:50] [INFO]   |-- Step 1: generate-seed-category-values-1
[05:03:50] [INFO]   |-- Step 2: sample-data-seeds-2
[05:03:50] [INFO]   |-- Step 3: generate-column-from-template-3-generating-table-name
[05:03:50] [INFO]   |-- Step 4: generate-column-from-template-4-generating-column-name
[05:03:50] [INFO]   |-- Step 5: generate-column-from-template-5-generating-column-data-type
[05:03:50] [INFO]   |-- Step 6: generate-column-from-template-6-generating-sql-prompt
[05:03:50] [INFO]   |-- Step 7: generate-column-from-template-7-generating-sql-context
[05:03:50] [INFO]   |-- Step 8: generate-column-from-template-8-generating-sql
[05:03:50] [INFO]   |-- Step 9: validate-code-9
[05:03:50] [INFO]   |-- Step 10: judge-with-llm-10
[05:03:50] [INFO]   |-- Step 11: evaluate-dataset-11
[05:03:57] [INFO] 🛜 Connecting to your Gretel Project:
[05:03:57] [INFO] 🔗 -> https://console-dev.gretel.ai/proj_2pmd98MaNUS0rUw8YRNWlJdxJoJ
[05:04:00] 

In [None]:
# Fetch the generated dataset, waiting for the batch workflow to complete first.
df = results.fetch_dataset(wait_for_completion=True)

[05:04:06] [INFO] ⏳ Waiting for workflow step `judge-with-llm-10` to complete...
[05:04:06] [INFO] 👀 Follow along -> https://console-dev.gretel.ai/workflows/w_2pmdAE4QGugstI4JpldG1Ouqe6Z/runs/wr_2pmdAU7FdddgASkUH6MloOb5sBm


In [None]:
# Download the evaluation report for the batch run.
# presumably `path` is the local location of the downloaded report — confirm
path = results.download_evaluation_report()

In [None]:
# List the keys of the fetched dataset (column names, if `df` is a DataFrame — TODO confirm).
print(df.keys())