In [0]:
from pyspark.sql.functions import trim 
import pandas as pd 

In [0]:
# Grading prompts keyed by question id. Later cells compare these strings
# verbatim against the (trimmed) `question` column of the answer tables, so every
# character matters -- including the two trailing spaces before each line break
# in sql_q4, which are spelled out with explicit "\n" escapes below.
question_dict = {
  'sql_q1': (
    "Extract the exact code snippet from the context that creates a table/view "
    "called 'workloads' and retrieves its schema. Do not generate any code."
  ),
  'sql_q2': (
    "Extract the exact code snippet from the context that returns only a distinct "
    "list of workspaceId. Do not generate any code."
  ),
  'sql_q3': (
    "Extract the exact code snippet from the context that returns the number of "
    "unique clusters. Do not generate, modify, or infer any code."
  ),
  'sql_q4': (
    "Extract the exact code snippet from the context that returns the workload "
    "hours each day for the workspace ID in ordered fashion.  \n"
    "**Do not generate, modify, or suggest any code. "
    "Only extract what is explicitly present.**  \n"
    "**If no matching snippet is found, return an empty string for `code_snippet` "
    "and assign a score of 0.**"
  ),
  'sql_q5': (
    "Extract the exact code snippet from the context that returns interactive node "
    "hours per day on the different Spark versions over time. Do not generate any code"
  ),
  'sql_q6': (
    "Extract the exact code snippet from the context which returns top two most "
    "recently shipped (shipDate) Line Items per Part using window function. "
    "Do not generate any code."
  ),
}

# Score a row must carry, per question id, to be selected as a correct-answer
# example when the few-shot table is assembled.
score_dict = dict(
  sql_q1=15,
  sql_q2=15,
  sql_q3=15,
  sql_q4=15,
  sql_q5=15,
  sql_q6=25,
)

In [0]:
# Spot-check the first prompt; as the cell's last expression it is echoed as output.
question_dict["sql_q1"]

In [0]:
# Start from a clean slate: the cells below write in "append" mode, so the
# target table is dropped first (if it exists).
drop_few_shots_stmt = "drop table if exists users.abhay_jalisatgi.few_shots_db"
spark.sql(drop_few_shots_stmt)

In [0]:
# Source table of graded answers from which the correct-answer examples are filtered.
correct_answers_table = "users.abhay_jalisatgi.test_All_correct_answer_SQL"
all_correct_answers = spark.read.table(correct_answers_table)

In [0]:

# For every question, select the rows whose (trimmed) question text equals the
# prompt and whose score equals the expected score for that question.
# The per-question frames are collected in a list and concatenated once, instead
# of re-concatenating a growing DataFrame on every iteration.
matched_frames = []
for question_key, question_text in question_dict.items():
  filtered_answers = all_correct_answers.filter(
      (trim(all_correct_answers.question) == question_text) &
      (all_correct_answers.score == score_dict[question_key])
  ).toPandas()
  matched_frames.append(filtered_answers)

all_answers_df = pd.concat(matched_frames, ignore_index=True)

# Keep a single example per (question, score) pair.
all_answers_df = all_answers_df.drop_duplicates(subset=['question', 'score'])

# `all_answers_df` is a pandas DataFrame, which has no `.display()` method;
# use the notebook's `display(...)` function to render it.
display(all_answers_df)


In [0]:
# Convert the deduplicated pandas frame back to Spark and append it to the few-shot table.
correct_answers_sdf = spark.createDataFrame(all_answers_df)
correct_answers_writer = correct_answers_sdf.write.mode("append")
correct_answers_writer.saveAsTable("users.abhay_jalisatgi.few_shots_db")

In [0]:
# Per-question source tables (question id -> fully qualified table name) from
# which zero-score rows are selected for the few-shot table.
attr_dict = {
  f"sql_q{question_no}": f"users.abhay_jalisatgi.test_q{question_no}_no_answer_sql"
  for question_no in range(1, 7)
}

In [0]:
# For each question, take the zero-score rows from its table whose (trimmed)
# question text equals the prompt, keep one row per (question, score) pair, and
# append the result to the few-shot table.
for question_key, table_name in attr_dict.items():
  df = spark.read.table(table_name)
  filtered_answer = df.filter(
      (trim(df.question) == question_dict[question_key]) & (df.score == 0)
  ).toPandas()
  filtered_answer = filtered_answer.drop_duplicates(subset=['question', 'score'])
  if filtered_answer.empty:
    # An empty pandas frame carries no values for Spark to infer a schema from,
    # and there is nothing to append anyway -- report it and move on.
    print(f"No zero-score example found in {table_name} for {question_key}; skipping.")
    continue
  spark.createDataFrame(filtered_answer).write.mode("append").saveAsTable("users.abhay_jalisatgi.few_shots_db")

In [0]:
# Read back the assembled few-shot table and preview only the columns of interest.
preview_columns = ['question', 'context', 'score', 'chain_of_thought_reasoning']
df = spark.read.table("users.abhay_jalisatgi.few_shots_db")
df_selected = df.select(*preview_columns)
df_selected.display()

In [0]:
import dspy
from dspy.datasets import HotPotQA

# Language-model client shared by the DSPy modules below; replace the model name
# (or the client) with your actual configuration, e.g. gpt-4 or Cohere.
# NOTE(review): `dspy.OpenAI` looks like the legacy client API -- confirm it
# exists in the installed DSPy version.
turbo = dspy.OpenAI(
    model="gpt-3.5-turbo-instruct",
    max_tokens=300,
)
dspy.settings.configure(lm=turbo)


# Minimal signature: one input field (`question`) and one output field (`answer`).
# You'll likely have a more complex one in practice.
# NOTE(review): DSPy is understood to use a Signature's docstring as the prompt
# instructions, so treat the docstring below as runtime text -- confirm before editing it.
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""
    question = dspy.InputField()  # provided by the caller
    answer = dspy.OutputField()  # to be produced by the language model

# Module that answers a question through the BasicQA signature
class GenerateAnswer(dspy.Module):
    """Wraps a single `dspy.Predict` built from the `BasicQA` signature."""

    def __init__(self):
        super().__init__()
        # Exposed as an attribute so callers can attach demos to the predictor.
        basic_predictor = dspy.Predict(BasicQA)
        self.generate_answer = basic_predictor

    def forward(self, question):
        """Run the predictor on `question` and return its prediction."""
        predictor_inputs = {"question": question}
        return self.generate_answer(**predictor_inputs)

# Few-shot examples as dspy.Example objects, built from (question, answer) pairs;
# only "question" is marked as an input.
_qa_pairs = [
    ("What is the capital of France?", "Paris"),
    ("What is the highest mountain in the world?", "Mount Everest"),
    ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
]
train_examples = [
    dspy.Example(question=qa_question, answer=qa_answer).with_inputs("question")
    for qa_question, qa_answer in _qa_pairs
]

# Why `.with_inputs(...)` matters:

# * dspy.Example(...).with_inputs("question") is CRUCIAL: it tells DSPy which fields of the
#   Example correspond to the input fields defined in the Signature (BasicQA in this case).
#   Without with_inputs, DSPy won't know how to use the examples correctly. Here it specifies
#   that only the "question" field is treated as input during training or demonstration.
#   The "answer" field is *not* an input; it's what we *want* the LM to produce.

# Build the question-answering module.
qa_module = GenerateAnswer()

# Attach the few-shot examples as demos on the module's predictor.
qa_module.generate_answer.demos = train_examples

# Ask a new question and print the answer field of the prediction.
hamlet_question = "Who wrote Hamlet?"
prediction = qa_module(question=hamlet_question)
print(prediction.answer)

# A more complex signature with two inputs (`context`, `question`) and one output (`answer`).
# NOTE(review): DSPy is understood to use a Signature's docstring as the prompt
# instructions, so treat the docstring below as runtime text -- confirm before editing it.
class ContextualQA(dspy.Signature):
    """Answer questions based on the given context."""
    context = dspy.InputField()  # passage the answer should be based on
    question = dspy.InputField()  # question asked about the context
    answer = dspy.OutputField()  # to be produced by the language model

class GenerateContextualAnswer(dspy.Module):
    """Wraps a single `dspy.Predict` built from the `ContextualQA` signature."""

    def __init__(self):
        super().__init__()
        # Exposed as an attribute so callers can attach demos to the predictor.
        contextual_predictor = dspy.Predict(ContextualQA)
        self.generate_answer = contextual_predictor

    def forward(self, context, question):
        """Run the predictor on `context` and `question` and return its prediction."""
        predictor_inputs = {"context": context, "question": question}
        return self.generate_answer(**predictor_inputs)

# (context, question, answer) triples turned into dspy.Example demos;
# "context" and "question" are marked as the inputs.
_contextual_triples = [
    (
        "The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.",
        "Where is the Eiffel Tower located?",
        "Paris, France",
    ),
    (
        "Mount Everest, also known as Sagarmatha in Nepali, is Earth's highest mountain above sea level.",
        "What is another name for Mount Everest?",
        "Sagarmatha",
    ),
]
train_examples_contextual = [
    dspy.Example(context=demo_context, question=demo_question, answer=demo_answer).with_inputs("context", "question")
    for demo_context, demo_question, demo_answer in _contextual_triples
]

# Build the contextual module and attach its few-shot demos.
contextual_qa_module = GenerateContextualAnswer()
contextual_qa_module.generate_answer.demos = train_examples_contextual

# Query it with a fresh context/question pair and print the answer field.
shakespeare_context = "Shakespeare wrote many famous plays, including Hamlet, Macbeth, and Romeo and Juliet."
shakespeare_question = "Name a play written by Shakespeare."
prediction = contextual_qa_module(context=shakespeare_context, question=shakespeare_question)
print(prediction.answer)