In [0]:
%pip install nbformat databricks-sdk[openai]==0.38.0 dspy --quiet
# NOTE(review): only databricks-sdk is version-pinned above; nbformat and dspy
# are installed at whatever version is current — consider pinning them too.
# Restart the Python process after the install (presumably so the freshly
# installed packages are the ones imported by later cells — confirm).
dbutils.library.restartPython()

In [0]:
import pandas as pd

In [0]:
import dspy
# Language model handle used by the grading modules; the model id string
# suggests a Databricks-served Llama 3.3 70B instruct endpoint.
lm = dspy.LM('databricks/databricks-meta-llama-3-3-70b-instruct')
# Register the model in DSPy's global settings (presumably making it the
# default LM for every module built later in this notebook — confirm).
dspy.settings.configure(lm=lm)

In [0]:
%run ../prompts/ml_prompts_with_DSPy

In [0]:
import base64

import databricks.sdk
from databricks.sdk.service.workspace import ImportFormat
import nbformat
import numpy as np 

In [0]:
# Candidates to grade, as (candidate id, workspace notebook path) pairs.
# Despite the name, this is a list of tuples, not a dict: the grading loop
# reads element [0] as the id and element [1] as the path.
candidate_dict = [
  ("X","/Workspace/Users/vibhor.nigam@databricks.com/interview-scripts/interview-grading/example-notebooks-ml/04-Machine-Learning-X")
]

# Human-assigned scores keyed by candidate id. The grading loop compares them
# position by position with the model's scores, so the list order must match
# the order of the entries in module_dict.
human_graded_dict = {
  "X": [5, 30, 25, 5, 0]
}
# Workspace client used to export the candidates' notebooks.
w = databricks.sdk.WorkspaceClient()

In [0]:
# One dspy.ChainOfThought module per grading prompt. MLPrompt1..MLPrompt5 are
# not defined in this notebook; presumably they come from the %run of
# ../prompts/ml_prompts_with_DSPy — confirm.
# Insertion order matters: the resulting scores are compared by position with
# the lists in human_graded_dict.
module_dict = {
  'module_1': dspy.ChainOfThought(MLPrompt1),
  'module_2': dspy.ChainOfThought(MLPrompt2),
  'module_3': dspy.ChainOfThought(MLPrompt3),
  'module_4': dspy.ChainOfThought(MLPrompt4),
  'module_5': dspy.ChainOfThought(MLPrompt5)
}

In [0]:
def get_code_from_notebooks(workspace_client, notebook_path):
  """Export a workspace notebook and return its cell sources as one string.

  The notebook is exported in Jupyter (.ipynb) format, base64-decoded and
  parsed with nbformat. The sources of all code and markdown cells are joined
  in notebook order with a newline between cells, so the last line of one
  cell is never fused with the first line of the next.

  Args:
    workspace_client: object exposing ``workspace.export(path, format=...)``
      (a ``databricks.sdk.WorkspaceClient`` in this notebook).
    notebook_path: workspace path of the notebook to export.

  Returns:
    str: newline-joined source text of the code and markdown cells; cells of
    any other type are skipped. Empty string when there are no such cells.

  Raises:
    ValueError: if the export response carries no content.
  """
  exported = workspace_client.workspace.export(notebook_path,
                                               format=ImportFormat.JUPYTER)
  if not exported.content:
    raise ValueError(f"Export of {notebook_path!r} returned no content")

  # The export payload is the base64-encoded .ipynb JSON document.
  ipynb = base64.decodebytes(exported.content.encode('ascii')).decode("utf-8")
  parsed = nbformat.reads(ipynb, as_version=4)

  cell_sources = [
    cell.source
    for cell in parsed.cells
    if cell.cell_type in ('code', 'markdown')
  ]
  # Join with a newline: cell sources usually do not end with one, and a bare
  # concatenation would glue adjacent cells into a single garbled line.
  return '\n'.join(cell_sources)

In [0]:
def evaluate_responses(context, module_dict):
  """Run every grading module on the given text and tabulate the results.

  Args:
    context: text handed to each module as its ``text`` keyword argument.
    module_dict: mapping whose values are callables accepting ``text=...``
      and returning an object with ``score``, ``reasoning`` and
      ``code_snippet`` attributes. The keys are not used.

  Returns:
    pandas.DataFrame with one row per module, in the mapping's iteration
    order, holding the ``score``, ``reasoning`` and ``code_snippet`` values.
  """
  fields = ('score', 'reasoning', 'code_snippet')
  # Lazy generator: each module is invoked right before its row is built.
  predictions = (grader(text=context) for grader in module_dict.values())
  rows = [
    {field: getattr(prediction, field) for field in fields}
    for prediction in predictions
  ]
  return pd.DataFrame(rows)

In [0]:
# spark.sql(f" drop table if exists users.abhay_jalisatgi.gen_ai_eval ")

In [0]:

# Grade every candidate notebook with the prompt modules and compare the
# model's scores with the human-assigned ones.
#   error_dict:   candidate id -> {'abs_error', 'inaccurate_predictions', 'context'}
#                 where 'context' holds the per-question (human - model) differences.
#   answers_dict: candidate id -> DataFrame returned by evaluate_responses,
#                 plus a 'candidate' column and 'score' cast to float64.
error_dict = {}
answers_dict = {}
for candidate in candidate_dict:
  candidate_name, notebook_path = candidate

  context = get_code_from_notebooks(w, notebook_path)
  answers = evaluate_responses(context, module_dict)
  answers['candidate'] = candidate_name
  answers['score'] = answers['score'].astype('float64')
  # spark.createDataFrame(answers).write.mode("append").saveAsTable("users.abhay_jalisatgi.gen_ai_eval")

  human_answers_list = human_graded_dict[candidate_name]
  model_answers_list = answers['score'].tolist()
  # Scores are compared by position, so both lists must be the same length;
  # otherwise extra model scores would be ignored silently (or the lookup
  # would fail with an unhelpful IndexError).
  if len(human_answers_list) != len(model_answers_list):
    raise ValueError(
      f"Candidate {candidate_name!r}: {len(human_answers_list)} human scores "
      f"but {len(model_answers_list)} model scores"
    )
  diff = [human - model for human, model in zip(human_answers_list, model_answers_list)]

  abs_error = np.abs(diff).sum()
  inaccurate_predictions = sum(1 for delta in diff if delta != 0)

  error_dict[candidate_name] = {'abs_error': abs_error, 'inaccurate_predictions': inaccurate_predictions, 'context': diff}
  answers_dict[candidate_name] = answers

In [0]:
# Per-candidate error summary: total absolute error, number of questions whose
# scores differ, and the per-question (human - model) differences.
error_dict

In [0]:
# answers_dict values are pandas DataFrames, which do not define a .display()
# method of their own; use the notebook-provided display() function instead.
display(answers_dict['X'])

In [0]:
def evaluate_model(diff):
  """Return the signed sum of the non-zero entries of ``diff``.

  Args:
    diff: iterable of per-question score differences.

  Returns:
    The running total of every entry that is not equal to zero, starting
    from the integer 0 (so an all-zero or empty input yields ``0``).
    Positive and negative differences cancel each other out.
  """
  total = 0
  for delta in diff:
    if delta != 0:
      total += delta
  return total

In [0]:
# human_answer_list_Y = [15,15,15,12,15,25]
# model_answer_list_Y = [x.score for x in spark.read.table("users.abhay_jalisatgi.gen_ai_eval").filter(f"candidate = 'Y'").select("score").collect()]
# diff = [human_answer_list_Y[i] - model_answer_list_Y[i] for i in range(len(human_answer_list_Y))]
# print(f" Human score - Model score : {evaluate_model(diff)}, answers differ for {diff}")

# # Grader Notes for Q4: For Part2, the grouping should not be performed based on date. Partial credit awarded.
# # Grading Criteria for Q4: Workload hours each day for the workspaceID (15)- If ordering is missing, deduct 3 points. Each section is 7.5 points.