In [0]:
%pip install nbformat --quiet


In [0]:
%pip install -U dspy --quiet
dbutils.library.restartPython()

In [0]:
from concurrent.futures import ThreadPoolExecutor
import sys

In [0]:
%run ../prompts/sql_prompts_with_module_DSPy

In [0]:
%run ../helper/GradingModule

In [0]:
"""
    # Define global variables to be used throughout the notebook 
"""

# candidate -> {module name -> workspace path of that candidate's notebook}.
# NOTE(review): this is an absolute, user-specific workspace path; consider
# making it configurable before sharing the notebook.
global_candidate_dict = {
  "X": {
    "SQL": "/Workspace/Users/vibhor.nigam@databricks.com/interview-scripts/interview-grading/example-notebooks-sql/test-cases/01-SQL-X-no-answer-q1"
  }
}

# candidate -> {module name -> human-assigned scores, one entry per question}.
# The keys must mirror global_candidate_dict, because evaluate_scores looks
# grades up by the same (candidate, module name) pair.
global_human_graded_dict = {
  "X": {
    "SQL": [0,15,15,12,15,25]
  }
}

# Short label -> model identifier handed to dspy.LM by the driver loop.
# Alternative models are kept commented out so they can be re-enabled quickly.
global_llm_models_dict = {
  # 'llm-405B': 'databricks/databricks-meta-llama-3-1-405b-instruct',
  'llm-70B': 'databricks/databricks-meta-llama-3-3-70b-instruct'
  # 'claude': "databricks/databricks-claude-3-7-sonnet"
}

# Shared grading helper used by evaluate_section_with_llm.
# NOTE(review): `Module` and `databricks` are not imported in the visible
# cells - presumably provided by the %run'd notebooks; confirm.
global_module = Module(databricks.sdk.WorkspaceClient())
# Name of the module whose prompts are selected via global_module.set_module_dict.
global_module_to_check = "SQL"

In [0]:
# Echo the configured candidates and their human grades, one dict per line.
print(global_candidate_dict, global_human_graded_dict, sep="\n")

In [0]:
def evaluate_scores(predictions, candidate, notebook_name, human_graded_dict=None):
  """
  Compare predicted scores with the human-assigned scores for one notebook.

  Args:
    predictions: Mapping (e.g. a DataFrame) whose 'score', 'code_snippet' and
      'chain_of_thought_reasoning' entries expose `.tolist()`. Entry i is
      treated as the prediction for question i+1.
    candidate: Key of the candidate in the human-grading dictionary.
    notebook_name: Key of the notebook/module under that candidate.
    human_graded_dict: Optional {candidate: {notebook_name: [scores]}} mapping.
      Defaults to the notebook-level `global_human_graded_dict`.

  Returns:
    None when every human-graded question has an equal predicted score.
    Otherwise a dict keyed "<candidate>_<notebook_name>_q<N>" whose values hold
    'predicted_score', 'human_score', 'code_snippet' and 'reasoning'.
    A question with no prediction at all is reported as a mismatch with
    None for the predicted score, code snippet and reasoning.

  Raises:
    KeyError: if the candidate / notebook has no human grades, or a required
      entry is missing from `predictions`.
  """
  print("evaluate scores")

  if human_graded_dict is None:
    human_graded_dict = global_human_graded_dict
  human_grading = human_graded_dict[candidate][notebook_name]

  scores_list = predictions['score'].tolist()
  code_snippet_list = predictions['code_snippet'].tolist()
  reasoning_list = predictions['chain_of_thought_reasoning'].tolist()

  # Fewer predictions than human grades used to abort with a bare IndexError.
  # Pad with None instead so each unanswered question shows up as a mismatch.
  missing = len(human_grading) - len(scores_list)
  if missing > 0:
    print(f"{missing} human-graded question(s) have no prediction for {candidate}/{notebook_name}")
  scores_list = scores_list + [None] * missing
  code_snippet_list = code_snippet_list + [None] * (len(human_grading) - len(code_snippet_list))
  reasoning_list = reasoning_list + [None] * (len(human_grading) - len(reasoning_list))

  mismatch_dict = {}
  for i, human_score in enumerate(human_grading):
    if scores_list[i] != human_score:
      mismatch_dict[f"{candidate}_{notebook_name}_q{i+1}"] = {
        'predicted_score': scores_list[i],
        'human_score': human_score,
        'code_snippet': code_snippet_list[i],
        'reasoning': reasoning_list[i]
      }

  if not mismatch_dict:
    print("No mismatch found")
    return None

  print(mismatch_dict)
  return mismatch_dict

In [0]:
def evaluate_section_with_llm(lm, context_path,  candidate, notebook_name):
  """
  Grade one candidate notebook with the shared grading module and time the run.

  Args:
    lm: Label of the model being evaluated. It is only used in the timing
      message printed below; this function does not configure the model.
    context_path: Path handed to `global_module.get_error_and_answer_dict`.
    candidate: Candidate identifier, stored on the answers under 'candidate'.
    notebook_name: Notebook/module name, stored under 'notebook_name'.

  Returns:
    The object returned by `global_module.get_error_and_answer_dict`, with
    'candidate', 'notebook_name' and 'context' added to its 'answers_dict'
    entry (presumably a DataFrame, so these become columns - TODO confirm).
  """
  start_time = time.time()

  # Select the prompts for the module under test (one prompt per question).
  global_module.set_module_dict(global_module_to_check)

  # Run the grading for this notebook.
  results = global_module.get_error_and_answer_dict(context_path)

  # Tag the answers so rows from different candidates/notebooks stay traceable.
  answers = results['answers_dict']
  answers['candidate'] = candidate
  answers['notebook_name'] = notebook_name
  answers['context'] = results['context']

  execution_time = time.time() - start_time
  # Report using the `lm` argument; the previous code read an undefined
  # `llm_name` and only worked when a global of that name happened to exist.
  print(f" time taken by model {lm} for execution of notebook {notebook_name} is {execution_time} seconds")

  return results

In [0]:
def evaluate_candidate_with_llm(llm_name, candidate):
  """
  Grade every notebook configured for a candidate and compare with human grades.

  Args:
    llm_name: Label of the model under evaluation, forwarded to
      `evaluate_section_with_llm`.
    candidate: Key into `global_candidate_dict` / `global_human_graded_dict`.

  Returns:
    dict with:
      'result_df': the per-notebook answers concatenated with a fresh index
        (an empty DataFrame when the candidate has no notebooks);
      'mismatch_list': one `evaluate_scores` result per notebook, in
        iteration order (None for a notebook without mismatches).

  Raises:
    KeyError: if `candidate` is not present in `global_candidate_dict`.
  """
  # Look the notebooks up from the argument instead of relying on a
  # `notebook_dict` variable leaked by whichever loop last ran in the notebook.
  candidate_notebooks = global_candidate_dict[candidate]

  notebook_frames = []
  mismatch_list = []

  for notebook_name, notebook_path in candidate_notebooks.items():
    print(f" Evaluating {notebook_name} for {candidate}")

    # Grade this notebook.
    notebook_result = evaluate_section_with_llm(
      llm_name,
      notebook_path,
      candidate,
      notebook_name,
    )

    # Compare the predicted scores with the human grades.
    mismatch_list.append(
      evaluate_scores(notebook_result['answers_dict'], candidate, notebook_name)
    )
    notebook_frames.append(notebook_result['answers_dict'])

  # Concatenate once at the end rather than growing a frame inside the loop.
  if notebook_frames:
    section_df = pd.concat(notebook_frames, ignore_index=True)
  else:
    section_df = pd.DataFrame()

  return {
    'result_df': section_df,
    'mismatch_list': mismatch_list
  }

In [0]:
import time 

# Driver cell: for every candidate and every configured model, run the full
# candidate evaluation `num_iterations` times in parallel threads.
# Resulting structure:
#   candidate_dict[candidate][llm_name]['iteration_<n>'] = value returned by
#   evaluate_candidate_with_llm
#
# NOTE: `candidate`, `notebook_dict`, `llm_name` and `llm_signature` remain
# module-level names after this loop runs. Helper functions defined in this
# notebook may read them, so check those helpers before renaming any of them.

# Number of repeated runs per model; also used as the thread-pool size.
# Author's sizing guideline (the work is I/O bound): at most 3 x vCPUs of the instance.
num_iterations = 6  #Since num_iterations are used to create parallel threads and it is an I/O bound task. max(num_iterations) = 3 x VCPU in the instance
 
candidate_dict = {} # Final results: candidate -> model label -> iteration label -> result

# For each candidate and its {notebook name -> path} mapping
for candidate, notebook_dict in global_candidate_dict.items():
  # Results of every model for this candidate
  llm_dict = {}
  # Iterate through the configured models (label, model identifier)
  for llm_name,llm_signature in global_llm_models_dict.items():

    # dspy.settings can only be changed by the thread that initially configured it,
    # so the model is configured here, before any worker thread is started.
    # cache=False / temperature=0: presumably so that repeated iterations are
    # real, deterministic-as-possible model calls rather than cache hits - TODO confirm.
    lm = dspy.LM(llm_signature, cache=False, temperature=0)
    dspy.configure(lm=lm)

    iteration_dict = {}
    # For each model run multiple iterations to check for consistency.
    # The pool has one worker per iteration, so all iterations can run concurrently.
    with ThreadPoolExecutor(max_workers=num_iterations) as executor:
        # Map each submitted future to its 1-based iteration number
        futures = {executor.submit(evaluate_candidate_with_llm, llm_name, candidate): i for i in range(1, num_iterations+1)}

        # Gather results in submission order (dict iteration order);
        # future.result() blocks until that particular iteration has finished.
        for future in futures:
            count = futures[future]  # Iteration number associated with this future
            try:
                result = future.result()  # Re-raises any exception raised inside the worker
                iteration_dict[f'iteration_{count}'] = result  # Keep the successful result
            except Exception as e:
                # A failed iteration is only reported; it gets no entry in iteration_dict.
                print(f"Error in thread for count {count}: {e}")

    llm_dict[llm_name] = iteration_dict

  candidate_dict[candidate] = llm_dict    
  

In [0]:
# Choose which candidate / model combination the following cells inspect.
candidate_name, llm_name = 'X', 'llm-70B'

# iteration label -> result of that iteration for the chosen combination
llm_results = candidate_dict[candidate_name][llm_name]

In [0]:
# Stack the result frames of every iteration into one frame with a fresh index.
# The frames are collected first and concatenated once, instead of repeatedly
# concatenating onto an initially empty DataFrame inside a loop.
iteration_frames = [iteration_result['result_df'] for iteration_result in llm_results.values()]
all_iteration_combined_df = (
  pd.concat(iteration_frames, ignore_index=True) if iteration_frames else pd.DataFrame()
)
all_iteration_combined_df.display()

In [0]:
# iteration label -> that iteration's 'mismatch_list' entry
combined_mismatch_list = {
  iteration_label: iteration_result['mismatch_list']
  for iteration_label, iteration_result in llm_results.items()
}
combined_mismatch_list

In [0]:
# Displays the fully qualified table name built from the two variables below.
# NOTE(review): `question_with_no_answer` and `module_to_check` are not assigned
# in the visible cells (only `global_module_to_check` is) - confirm they are
# defined by a %run'd notebook, otherwise this cell fails on a fresh run.
f"users.abhay_jalisatgi.test_{question_with_no_answer}_{module_to_check}"

In [0]:
# Convert the results to a Spark DataFrame and save them as a table,
# overwriting the table if it already exists.
# NOTE(review): `df`, `question_with_no_answer` and `module_to_check` are not
# assigned in the visible cells - confirm where they come from (was
# `all_iteration_combined_df` intended for `df`?).
results_sdf = spark.createDataFrame(df)
results_writer = results_sdf.write.mode("overwrite")
results_writer.saveAsTable(f"users.abhay_jalisatgi.test_{question_with_no_answer}_{module_to_check}")