In [0]:
%pip install nbformat databricks-sdk[openai]==0.38.0 dspy --quiet
# Restart the Python process after the install.
# NOTE(review): nbformat and dspy are unpinned here, so behavior may drift between runs — consider pinning.
dbutils.library.restartPython()

In [0]:
# Load the few-shot table and keep the columns of interest.
df = spark.read.table("users.abhay_jalisatgi.few_shots_db")
# NOTE(review): 'question' is selected but not read by any later visible cell.
df_selected = df.select('question', 'context', 'score', 'code_snippet')
df_selected.display()

In [0]:
import dspy 
# Signature with one text input and three outputs (score, code_snippet, explanation).
# NOTE(review): not referenced by any later visible cell; SQLPrompt declares the same fields.
# Notes are kept as comments on purpose: dspy uses a Signature's docstring as prompt instructions.
class Example(dspy.Signature):
  text = dspy.InputField(desc="The text to analyze")
  score: int = dspy.OutputField(desc="The score provided by the model")
  code_snippet: str = dspy.OutputField(desc=""" supporting code snippet provided by the model for the score """)
  explanation: str = dspy.OutputField(desc=""" explanation provided by the model for the score """)

In [0]:
# Build the language-model client and register it in dspy's global settings.
lm = dspy.LM('databricks/databricks-meta-llama-3-3-70b-instruct')
dspy.settings.configure(lm=lm)

In [0]:
# Minimal question -> answer signature used by the toy few-shot demo below.
class BasicQA(dspy.Signature):
    """Answer questions with short factoid answers."""
    question = dspy.InputField()  # the only input
    answer = dspy.OutputField()  # the only output

# Module wrapping a single Predict step over the BasicQA signature
class GenerateAnswer(dspy.Module):
    """Answer a question with one `dspy.Predict` call over `BasicQA`."""

    def __init__(self):
        super().__init__()
        predictor = dspy.Predict(BasicQA)
        # Kept as a public attribute: later cells attach demos to it.
        self.generate_answer = predictor

    def forward(self, question):
        """Run the predictor on `question` and return its prediction."""
        prediction = self.generate_answer(question=question)
        return prediction

# Few-shot examples: each (question, answer) pair becomes a dspy.Example
# whose only input field is "question".
train_examples = [
    dspy.Example(question=question_text, answer=answer_text).with_inputs("question")
    for question_text, answer_text in (
        ("What is the capital of France?", "Paris"),
        ("What is the highest mountain in the world?", "Mount Everest"),
        ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
    )
]

In [0]:
# Instantiate the module
qa_module = GenerateAnswer()

# Attach the hand-written examples as few-shot demos on the module's predictor.
# NOTE(review): `qa_module` is rebound to a different module in a later cell.
qa_module.generate_answer.demos = train_examples 

In [0]:
# Signature used for grading: one text input, three outputs.
# Comments (not a docstring) are used because dspy reads a Signature docstring as prompt instructions.
class SQLPrompt(dspy.Signature):
  text = dspy.InputField(desc="The text to analyze")  # receives the submission text
  score: int = dspy.OutputField(desc="The score provided by the model")
  code_snippet: str = dspy.OutputField(desc=""" supporting code snippet provided by the model for the score """)
  explanation: str = dspy.OutputField(desc=""" explanation provided by the model for the score """)

class GenerateAnswerSQLPrompt(dspy.Module):
    """Grade a submission with a `dspy.ChainOfThought` step over `SQLPrompt`."""

    def __init__(self):
        super().__init__()
        grader = dspy.ChainOfThought(SQLPrompt)
        # Kept as a public attribute: later cells attach demos to it.
        self.generate_answer = grader

    def forward(self, context):
        """Pass `context` to the signature's `text` input and return the prediction."""
        prediction = self.generate_answer(text=context)
        return prediction

In [0]:
# One dspy.Example per collected row; only `text` is marked as an input,
# so `score` and `code_snippet` act as the labelled outputs.
training = [
    dspy.Example(
        text=record.context,
        score=record.score,
        code_snippet=record.code_snippet,
    ).with_inputs("text")
    for record in df_selected.collect()
]

In [0]:
# Instantiate the SQL grading module (rebinds `qa_module` from the earlier toy QA cell).
qa_module = GenerateAnswerSQLPrompt()

# Attach the table-derived examples as few-shot demos on the module's predictor.
qa_module.generate_answer.demos = training

In [0]:
context = """ 
                              # Databricks Coding Challenge - SQL
### Note: All questions should be done using SQL language

## Spark SQL and DataFrames 

In this section, you'll read in data to create a DataFrame in Spark.  We'll be reading in a dataset stored in the Databricks File System (DBFS).  Please see this [link](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#databricks-file-system-dbfs) for more details on how to use DBFS.##Understanding the data set 

###Overview:
The data set used throughout the coding assessment resembles telemetry data that any software as a service (SaaS) company might collect. One record represents the node hours for a single workload running on a transient cluster aggregated at the date and workload type level. This data set may be used to help Databricks understand consumption patterns and user behaviors on our platform. For example, we can inspect this data to understand if a given customer prefers our `automated` or `interactive` features, or understand which AWS instance types are preferred among all of our customers. 

###Format: 
 * JSON
 * Resides on S3

###Schema:
* date (String)
* nodeHours (Double)
* workloadType (String) (read more [here](https://databricks.com/product/aws-pricing#clusters))
* metadata (Struct)
 * clusterMetadata (Struct): Describes the cluster configuration
 * runtimeMetadata (Struct): Describes the software configuration
 * workloadMetadata (Struct): Describes the customer. Each shard may have one or many workspaces and each workspace may have zero or many clusters 

### Part A: SparkSQL and Dataframes 

In this section, you'll read in data to create a dataframe in Spark.  We'll be reading in a dataset stored in the Databricks File System (DBFS).  Please see this link for more details on how to use DBFS:
https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#databricks-file-system-dbfsExecute the command below to list the files in a directory that you will be analyzing.  There are several files in this test dataset.%fs ls /databricks-coding-challenge/workloads%fs head dbfs:/databricks-coding-challenge/workloads/part-00000-tid-7467717951814126607-30bac750-dd23-4160-a2a6-e57064ff0dc6-1506091-1-c000.json
### Question 1 (15 points):
Please create a temporary Spark SQL view called "workloads" from the json files in the directory listed up above%python
df = (spark.read
  .format("json")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/databricks-coding-challenge/workloads/")
)
df.createOrReplaceTempView("workloads")
What is the schema for this table?-- use describe command to get schema of this table
-- alteratively can describe in python using df.printSchema()
desc workloads-- Get an idea of overall size of the table 
-- SELECT COUNT(*) FROM workloads
### Question 2 (15 points):

Please print out all the unique workspaceId's for this dataset and order them such that workspaceId's are increasing in number.-- unique workspace id's in ascending order
-- Q: workspaceId is string containing int values, do we need to order by their actual value? 
SELECT DISTINCT metadata.workloadMetadata.workspaceId
FROM workloads
ORDER BY abs(workspaceId)  --order by increasing in number-- sanity check 
-- Q: Why is no of unique workspaceId so less 
-- A: distribution is skewed, most are mapped to one workspaceId
SELECT metadata.workloadMetadata.workspaceId,
       count(*) as no_of_records
FROM workloads
GROUP BY metadata.workloadMetadata.workspaceId
ORDER BY no_of_records DESC--sanity check 2 
SELECT sum(no_of_records) as total_records
FROM (
  SELECT metadata.workloadMetadata.workspaceId,
       count(*) as no_of_records
  FROM workloads
  GROUP BY metadata.workloadMetadata.workspaceId
  ORDER BY no_of_records DESC
)
### Question 3 (15 points):

What is the number of unique clusters in this data set?  A cluster is identified by the `metadata.workloadMetadata.clusterId` field.-- no of unique clusters in this data set 
SELECT COUNT(DISTINCT metadata.workloadMetadata.clusterId) AS unique_clusters 
FROM workloads### Question 4 (15 points): 
What is the number of workload hours each day for the workspaceID - `-9014487477555684744`?-- no of workload hours each day for workspaceID - -9014487477555684744 
-- Assumption: using nodeHours as a proxy for workload hours 
SELECT date,
       SUM(nodeHours) AS workload_hours
FROM workloads 
WHERE metadata.workloadMetadata.workspaceId = '-9014487477555684744'
GROUP BY date 
ORDER By date
Determine how many nodes are spot vs. on demand for a given cluster.-- Get some cluster id which have both
select metadata.workloadMetadata.clusterId,
       collect_set(metadata.clusterMetadata.containerIsSpot) as value_set
from workloads 
group by metadata.workloadMetadata.clusterId
having size(value_set) = 2-- #  change the cluster id here to change the output in cmd24 
CREATE WIDGET TEXT clusterId DEFAULT "-1048771871094449110"-- Determine how many nodes are spot vs. on demand for a given cluster.
-- Sol: group by metadata.clusterMetadata.containerIsSpot for a given cluster id 

SELECT CASE WHEN metadata.clusterMetadata.containerIsSpot THEN 'spot'
            ELSE 'on-demand'
       END AS node_type,
       count(*) as no_of_nodes
FROM workloads WHERE metadata.workloadMetadata.clusterId = $clusterId
GROUP BY metadata.clusterMetadata.containerIsSpotREMOVE WIDGET clusterId### Question 5 (15 points): 

How many interactive node hours per day are there on the different Spark versions over time.-- How many interactive node hours per day are there on the different Spark versions over time.
SELECT metadata.runtimeMetadata.sparkVersion,
       date,
       SUM(nodeHours) AS total_node_hours
FROM workloads 
WHERE LOWER(workloadType) = 'interactive'
GROUP BY date,
         metadata.runtimeMetadata.sparkVersion
ORDER BY metadata.runtimeMetadata.sparkVersion,
         date### Question 6 (25 points):
#### TPC-H Dataset
You're provided with a Line Items records from the TPC-H data set. The data is located in `/databricks-datasets/tpch/data-001/lineitem`.
Find the top two most recently shipped (shipDate) Line Items per Part using the simplest and most efficient approach.

You're free to use any combinations of SparkSQL, PySpark or Scala Spark to answer this challenge.![](https://docs.deistercloud.com/mediaContent/Databases.30/TPCH%20Benchmark.90/media/tpch_schema.png?v=0)%python
src ='/databricks-datasets/tpch/data-001/lineitem/lineitem.tbl'
schema =", ".join(['orderkey int', 'partkey int', 'suppkey int', 'lineNumber int', 'quantity int', 'extendedprice float', 'discount float', 'tax float', 'returnflag string', 'linestatus string', 'shipdate date', 'commitdate date', 'receiptdate date', 'shipinstruct string', 'shipmode string', 'comment string'])
tpc_h = (spark.read.format("csv") 
          .schema(schema)
          .option("header", False)
          .option("sep", "|")
          #.option("inferSchema", True)
          .load(src)
        )
%python
dbutils.fs.head('/databricks-datasets/tpch/data-001/lineitem/lineitem.tbl')%python
display(tpc_h)%python
tpc_h.createOrReplaceTempView('tpc_h')Find the top two most recently shipped (shipDate) Line Items per Part using the simplest and most efficient approach.-- Q: How much info do we need to return for line item. Is order key enough? 

WITH line_item_ranked AS 
(
SELECT orderkey,
       partkey,
       shipdate,
       -- assuming if there are two shipped on the same date, then only those 2 are returned
       row_number() OVER(PARTITION BY partkey ORDER BY shipDate DESC) AS most_recent_ranked
FROM tpc_h
)
SELECT partkey,
       orderkey,
       shipdate 
FROM line_item_ranked 
WHERE most_recent_ranked <= 2 
                       """

In [0]:
# Run the module on the sample submission text held in `context`.
prediction = qa_module(context=context)

In [0]:
# Show what the module returned.
print(prediction)

In [0]:
%run ../helper/GradingModule

In [0]:
# Grading signature for the "create the workloads view and show its schema" question.
# The docstring below is the signature's instruction text, so its wording matters;
# the "shcema" typo in it has been corrected to "schema".
class SQLPrompt1(dspy.Signature):
  """ Check if the provided context has any code snippet which creates a table/view called workloads and get schema of workloads """

  # Submission text to inspect.
  text: str = dspy.InputField()

  # Score is declared as a string; downstream code must convert it before doing arithmetic.
  score: str = dspy.OutputField(desc="15 if any code snippet below creates a table/view and displays its schema, 7.5 if it creates a table/view but does not display its schema,  0 if no code snippet creates a table/view")

  # Snippet that justifies the score.
  code_snippet: str = dspy.OutputField(desc="provide the code snippet which creates a table/view and displays its schema encapsulated as a string")

In [0]:
# Wrap the grading signature in a ChainOfThought predictor so it can be inspected below.
x = dspy.ChainOfThought(SQLPrompt1)

In [0]:
# Show the instruction text carried by the predictor's signature.
x.predict.signature.instructions

In [0]:
# NOTE(review): interactive API lookup; can be dropped from the finished notebook.
help(dspy.ChainOfThought)

In [0]:
# Output-only signature; the next cell instantiates it to hold a reference answer.
# Comments (not a docstring) are used because dspy reads a Signature docstring as prompt instructions.
class Answer(dspy.Signature):
  score: int = dspy.OutputField(desc="The score provided by the model")
  reasoning: str = dspy.OutputField(desc="The reason provided by the model for the score")
  code_snippet: str = dspy.OutputField(desc=""" supporting code snippet provided by the model for the score """)

In [0]:
# Hand-written reference answer for the "workloads" view question.
# NOTE(review): score is passed as the string "15" although Answer.score is annotated int — confirm it is coerced as intended.
actual_output = Answer(
    score="15",
    reasoning="The code snippet provided creates a temporary Spark SQL view called \"workloads\" from the JSON files in the specified directory. The schema of this table can be obtained using the `DESCRIBE` command or the `printSchema()` method in Python",
    code_snippet=""" ```python
                      df = (spark.read
                        .format("json")
                        .option("header", "true")
                        .option("inferSchema", "true")
                        .load("/databricks-coding-challenge/workloads/")
                      )
                      df.createOrReplaceTempView("workloads")

                      # Get the schema of the table
                      desc workloads
                      ``` """
)

In [0]:
# Pair the grading signature with its reference answer.
# NOTE(review): `question` holds the SQLPrompt1 class itself, not question text.
qa_pair = dspy.Example(question=SQLPrompt1, answer=actual_output)

In [0]:
# NOTE(review): `Module` and `databricks` are not defined in any visible cell — presumably
# provided by the `%run ../helper/GradingModule` cell; confirm.
module = Module(databricks.sdk.WorkspaceClient())

In [0]:
# Register a ChainOfThought grader for SQLPrompt1 under the key 'module_1'.
module.set_module_dict({'module_1': dspy.ChainOfThought(SQLPrompt1)})

In [0]:
# (candidate id, workspace notebook path) pairs to grade.
# NOTE(review): despite the name this is a list of tuples, and the path embeds a user email.
candidate_dict = [
  ("X","/Workspace/Users/vibhor.nigam@databricks.com/interview-scripts/interview-grading/example-notebooks-sql/01-SQL-X")
]

# Human-assigned scores keyed by candidate id.
human_graded_dict = {
  "X": [15]
}
# w = databricks.sdk.WorkspaceClient()

In [0]:
# NOTE(review): repeats the LM configuration from an earlier cell verbatim.
lm = dspy.LM('databricks/databricks-meta-llama-3-3-70b-instruct')
dspy.settings.configure(lm=lm)

In [0]:
# Take the candidate id and its notebook path from the same entry so the two
# can never drift apart (the id was previously also hard-coded as a literal).
candidate, context_path = candidate_dict[0]
# Human-assigned reference scores for that candidate.
human_answers_list = human_graded_dict[candidate]
table_name = "users.abhay_jalisatgi.gen_ai_eval"
section = "SQL"

print(f"Context path: {context_path}\n, Human answers list: {human_answers_list}\n, Table name: {table_name}\n, Section: {section}\n, Candidate: {candidate}\n")

In [0]:
# Call the helper's grading routine; the result is indexed by 'answers_dict' in later cells.
results = module.get_error_and_answer_dict(context_path, human_answers_list, table_name, section, candidate)

In [0]:
# Inspect the graded answers.
results['answers_dict']

In [0]:
# Reference score stored on the hand-built QA pair.
qa_pair.answer.score

In [0]:
def evaluate_score(example, pred, trace=None):
  """Return the root-mean-square error between the reference score and predictions.

  Args:
    example: object whose `answer.score` holds the reference score (scalar or sequence).
    pred: predicted score(s); numeric strings such as "15" are accepted.
    trace: unused; kept so existing callers and the signature stay unchanged.

  Returns:
    The RMSE as a numpy float (0.0 means every prediction equals the reference).

  Raises:
    ValueError: if a score cannot be converted to a float.
  """
  # Local import: no visible cell imports numpy at notebook level.
  import numpy as np

  # Coerce both sides to float so string scores do not break the subtraction.
  actual_values = np.asarray(example.answer.score, dtype=float)
  predicted_values = np.asarray(pred, dtype=float)
  return np.sqrt(np.mean((actual_values - predicted_values) ** 2))

In [0]:
# Pull the 'score' column out of the graded answers as a plain Python list.
pred = results['answers_dict']['score'].tolist()
pred

In [0]:
# Compare the predicted scores against the reference score on `qa_pair`.
print(evaluate_score(qa_pair, pred))

In [0]:
# Render the graded answers with the notebook's display() function. An earlier cell
# calls .tolist() on a column of this object, which implies a pandas-style frame,
# and those do not have a .display() method.
display(results['answers_dict'])

In [0]:
# Preview five rows of the evaluation table.
spark.sql("select * from users.abhay_jalisatgi.gen_ai_eval limit 5 ").display()

In [0]:
%sql
-- Build the training set from the distinct graded rows.
-- CREATE OR REPLACE keeps this cell re-runnable; a plain CREATE TABLE fails once the table exists.
-- NOTE(review): the source column is spelled `secton`; confirm that matches the gen_ai_eval schema.
create or replace table users.abhay_jalisatgi.training_set as
select distinct score
      , code_snippet
      , candidate
      , secton as section
from users.abhay_jalisatgi.gen_ai_eval

In [0]:
# Keep a single row per (candidate, section) and write the result back over the same table.
df = (
    spark.read.table("users.abhay_jalisatgi.training_set")
    .dropDuplicates(["candidate", "section"])
)
df.write.saveAsTable("users.abhay_jalisatgi.training_set", mode="overwrite")

In [0]:
import dspy
from dspy.signatures import Signature
from dspy.signatures.field import InputField, OutputField

# Demo signature that declares an explicit docstring (read back via __doc__ below).
class SentimentAnalysis(Signature):
  """Analyze sentiment in text"""
  input_text = InputField(desc="Text to be analyzed")  # single input
  sentiment = OutputField(desc="Overall sentiment (positive, negative, neutral)")
  key_insights = OutputField(desc="Specific elements contributing to the sentiment")

# Helper that reads a class's docstring, with a fallback message
def get_signature_docstring(signature_class):
    """Return the stripped `__doc__` of `signature_class`.

    Falls back to "No docstring available" when `__doc__` is missing or empty.
    """
    raw_doc = signature_class.__doc__
    if not raw_doc:
        return "No docstring available"
    return raw_doc.strip()

# Read the docstring of SentimentAnalysis and print it under a heading.
docstring = get_signature_docstring(SentimentAnalysis)
print("Docstring of SentimentAnalysis:")
print(docstring)

# Demo signature declared without a docstring, for comparison.
# NOTE(review): dspy may fill in default instructions as __doc__ for such classes, in which
# case the "No docstring available" fallback is never printed — confirm.
class TopicAnalysis(Signature):
    input_text = InputField(desc="Text to be analyzed")
    topics = OutputField(desc="Main topics identified")

# Same lookup for TopicAnalysis; rebinds the `docstring` name from the previous call.
docstring = get_signature_docstring(TopicAnalysis)
print("\nDocstring of TopicAnalysis:")
print(docstring)