This notebook judges generated Verilog code against ground truth code and assigns scores to several aspects of the generated code. 

In [None]:
# Install required libraries if needed
# %pip install openai pandas wandb weave python-dotenv

In [None]:
from openai import OpenAI
from openai import OpenAIError, RateLimitError

import pandas as pd
import json
import re
import os
import weave
import math
import time

from dotenv import load_dotenv

In [2]:
# Read the .env file so its entries become visible through os.environ
load_dotenv()

# API credentials looked up from the environment (None when a variable is unset)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
SAMBANOVA_API_KEY = os.environ.get("SAMBANOVA_API_KEY")

In [336]:
# def openai_response(prompt):
#     # client = OpenAI()

#     # Initialize Weave Tracing
#     # weave.init('SVx')

#     #test_sambanova_api()
#     client = OpenAI(
#         api_key=os.environ.get("SAMBANOVA_API_KEY"),
#         base_url="https://api.sambanova.ai/v1",
#     )

#     response = client.chat.completions.create(
#             model="Llama-3.1-Tulu-3-405B",
#             messages=[{"role": "system", "content": "You are an expert in Verilog hardware verification."},
#                       {"role": "user", "content": prompt}],
#             temperature=0
#         )    
#     # print(response)
#     # response = client.chat.completions.create(
#     #         model="gpt-4o",
#     #         messages=[{"role": "system", "content": "You are an expert in Verilog hardware verification."},
#     #                   {"role": "user", "content": prompt}],
#     #         temperature=0
#     #     )    
#     # result = response["choices"][0]["message"]["content"]
#     result = response.choices[0].message.content
#     # print(result)
#     return result

In [None]:
def openai_response(prompt, model="gpt-4o", max_retries=8):
    """Send `prompt` to a chat-completion model and return the reply text.

    The request uses a fixed system message ("You are an expert in Verilog
    hardware verification.") and temperature 0.

    Args:
        prompt: User message to send to the model.
        model: Chat model name. Defaults to "gpt-4o".
        max_retries: Maximum number of attempts before giving up. API errors
            are retried with a capped exponential backoff; previously they
            were retried forever, which hung the notebook on non-transient
            failures such as an invalid API key.

    Returns:
        The content of the first choice's message, or None when every attempt
        failed or an unexpected (non-API) exception occurred.
    """
    client = OpenAI()

    # Initialize Weave tracing only once per function definition instead of
    # on every request.
    if not getattr(openai_response, "_weave_initialized", False):
        weave.init('SVx')
        openai_response._weave_initialized = True

    max_delay = 60  # upper bound (seconds) on a single backoff sleep

    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are an expert in Verilog hardware verification."},
                          {"role": "user", "content": prompt}],
                temperature=0
            )
            return response.choices[0].message.content

        except RateLimitError:
            # Must be caught before OpenAIError, which is the broader handler.
            print("Rate limit exceeded. Retrying after a short delay...")
            base_delay = 10

        except OpenAIError as e:
            print(f"An error occurred: {e}. Retrying...")
            base_delay = 5

        except Exception as e:
            # Not an API error: retrying is unlikely to help.
            print(f"An unexpected error occurred: {e}")
            return None

        if attempt < max_retries:
            # Exponential backoff: base, 2*base, 4*base, ... capped at max_delay.
            time.sleep(min(base_delay * 2 ** (attempt - 1), max_delay))

    print(f"Giving up after {max_retries} failed attempts.")
    return None


LaaJ (LLM-as-a-Judge) evaluation metrics:
- Logical Equivalence
- Signal Behavior
- Edge Case Handling
- Code Modularity
- Resource Efficiency
- Timing and Pipeline Depth

In [339]:
def extract_json(text):
    """Return the first JSON object embedded in `text`, or None.

    Every '{' in the text is tried as a possible start of a JSON object, and
    the first position that decodes successfully wins. Unlike a non-greedy
    ``\\{.*?\\}`` regex, this handles nested objects and '}' characters that
    appear inside string values, and it ignores any surrounding prose or
    Markdown code fences.

    Args:
        text: Raw model output. Non-string input (e.g. None from a failed
            request) yields None instead of raising.

    Returns:
        The decoded dict, or None when no valid JSON object is present.
    """
    if not isinstance(text, str):
        return None

    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and
            # tolerates trailing text after it.
            parsed_json, _ = decoder.raw_decode(text, start)
            return parsed_json  # Return the first successfully parsed JSON
        except json.JSONDecodeError:
            start = text.find('{', start + 1)

    return None

In [340]:
def evaluate_verilog_code(ground_truth, generated_code):
    """Ask the judge model to score generated Verilog against the ground truth.

    Builds a single prompt that embeds both code snippets and requests six
    integer scores (0-5) as a JSON object with the keys
    ``logical_equivalence``, ``signal_behavior``, ``edge_case_handling``,
    ``code_modularity``, ``resource_efficiency`` and ``timing_pipeline_depth``.
    The prompt is sent through ``openai_response`` and the reply is parsed
    with ``extract_json``.

    Args:
        ground_truth: Reference code, interpolated into the prompt.
        generated_code: Code under evaluation, interpolated into the prompt.

    Returns:
        Whatever ``extract_json`` returns for the model reply (the parsed
        JSON object, or None when no JSON object could be parsed).
        If an exception is raised while querying or parsing, the tuple
        ``("Error", <exception message>)`` is returned instead.

    NOTE(review): the error tuple has a different shape from the success
    value, so callers that index the result by key must check for it.
    """
    
    # Literal braces in the JSON template below are doubled ({{ }}) because
    # this is an f-string.
    prompt = f"""
    Evaluate the following Verilog/SystemVerilog code against the ground truth based on these criteria:
    
    **Ground Truth Code:**
    ```verilog
    {ground_truth}
    ```

    **Generated Code:**
    ```verilog
    {generated_code}
    ```
    The evaluation criteria are as follows:
    - **Logical Equivalence** (0-5) : 
        - Think about the below points before making the output JSON object
        - Do both implementations produce identical outputs for all inputs?
        - Identify any functional differences.
        - Assign an integer **score (0-5)** value based on how closely the generated code matches the ground truth.
        
    - **Signal Behavior** (0-5):
        - Think about the below points before making the output JSON object
        - Do the registers, wires, and combinational logic behave the same way in both versions?
        - Are state transitions in FSMs identical?
        - Are signal updates happening at the correct time?
        - Assign an integer **score (0-5)** value based on how accurately the generated code preserves signal behavior.

    - **Edge Case Handling** (0-5):
        - Think about the below points before making the output JSON object
        - Does the generated code correctly implement synchronous and asynchronous resets?
        - Are there any potential issues with metastability, clock domain transfers, or data integrity?
        - Are unexpected behaviors prevented under corner cases?
        - Assign an integer **score (0-5)** value accordingly.

    - **Code Modularity** (0-5):
        - Think about the below points before making the output JSON object
        - Does the generated code follow modular design principles?
        - Are parameters, functions, and submodules effectively used?
        - Are unnecessary dependencies avoided?
        - Assign an integer **score (0-5)** value accordingly.

    - **Resource Efficiency** (0-5):
        - Think about the below points before making the output JSON object
        - Does the generated code use an excessive number of registers, combinational logic, or memory?
        - Are there inefficient resource utilization patterns?
        - Are logic and memory components minimized while maintaining functionality?
        - Assign an integer **score (0-5)** value accordingly.


    - **Timing & Pipeline Depth** (0-5):
        - Does the generated code introduce unnecessary delays?
        - Does it increase the critical path?
        - Are pipeline stages maintained or improved?
        - Assign an integer **score (0-5)** value.


    Return the scores exactly in the following **JSON format**:
    ```
    {{
      "logical_equivalence": X,
      "signal_behavior": X,
      "edge_case_handling": X,
      "code_modularity": X,
      "resource_efficiency": X,
      "timing_pipeline_depth": X
    }}
    ```
    For the scores, only mention the integer score value (0-5) and nothing else.
    Only return the score in JSON format. No need to explain anythings else. Do not include any other text or information in the response.
    """

    try:
        # Query the judge model with the assembled prompt.
        result = openai_response(prompt)

        # Pull the JSON score object out of the raw reply text.
        scores = extract_json(result)
        
        # Convert JSON response into Python dictionary
        # scores = json.loads(parsed_result)
        # print(scores)

    except Exception as e:
        # Any failure above is reported as an ("Error", message) tuple.
        return "Error", str(e)
    
    # result = {}
    return scores

In [341]:
# def evaluate_logical_equivalence(ground_truth, generated_code):
#     """Uses an LLM to evaluate logical equivalence between ground truth and generated Verilog code."""
    
#     prompt = f"""
#     Compare the logical equivalence of the following Verilog/SystemVerilog codes. 
    
#     **Ground Truth Code:**
#     ```verilog
#     {ground_truth}
#     ```

#     **Generated Code:**
#     ```verilog
#     {generated_code}
#     ```

#     - Do both implementations produce identical outputs for all inputs?
#     - Identify any functional differences.
#     - Assign a **score (0-5)** based on how closely the generated code matches the ground truth.

#     Format the response as:
#     ```
#     Score: X
#     ```
#     """

#     try:
         
#         result = openai_response(prompt)
        
#         # Extract score and explanation
#         lines = result.split("\n")
#         score = next((line.split(":")[1].strip() for line in lines if "Score:" in line), "N/A")        
#         return int(score) if score.isdigit() else "N/A"

#     except Exception as e:
#         return "Error", str(e)


In [342]:
# def evaluate_signal_behavior(ground_truth, generated_code):
#     """Uses an LLM to evaluate signal behavior differences between ground truth and generated Verilog code."""
    
#     prompt = f"""
#     Analyze the signal behavior of the following Verilog/SystemVerilog codes. 
    
#     **Ground Truth Code:**
#     ```verilog
#     {ground_truth}
#     ```

#     **Generated Code:**
#     ```verilog
#     {generated_code}
#     ```

#     - Do the registers, wires, and combinational logic behave the same way in both versions?
#     - Are state transitions in FSMs identical?
#     - Are signal updates happening at the correct time?
#     - Assign a **score (0-5)** based on how accurately the generated code preserves signal behavior.

#     Format the response as:
#     ```
#     Score: X
#     ```
#     """


#     try:
        
#         result = openai_response(prompt)
        
#         # Extract score and explanation
#         lines = result.split("\n")
#         score = next((line.split(":")[1].strip() for line in lines if "Score:" in line), "N/A")        
#         return int(score) if score.isdigit() else "N/A"

#     except Exception as e:
#         return "Error", str(e)

In [343]:
# def evaluate_edge_case_handling(ground_truth, generated_code):
#     """Uses an LLM to evaluate edge case handling in Verilog code."""
    
#     prompt = f"""
#     Evaluate whether the generated Verilog code correctly handles reset conditions, clock domain crossings, 
#     and asynchronous behavior compared to the ground truth.

#     **Ground Truth Code:**
#     ```verilog
#     {ground_truth}
#     ```

#     **Generated Code:**
#     ```verilog
#     {generated_code}
#     ```

#     - Does the generated code correctly implement synchronous and asynchronous resets?
#     - Are there any potential issues with metastability, clock domain transfers, or data integrity?
#     - Are unexpected behaviors prevented under corner cases?
#     - Assign a **score (0-5)**.

#     Format the response as:
#     ```
#     Score: X
#     ```
#     """

#     try:
#         result = openai_response(prompt)
        
#         # Extract score and explanation
#         lines = result.split("\n")
#         score = next((line.split(":")[1].strip() for line in lines if "Score:" in line), "N/A")        
#         return int(score) if score.isdigit() else "N/A"

#     except Exception as e:
#         return "Error", str(e)

In [344]:
# def evaluate_code_modularity(ground_truth, generated_code):
#     """Evaluates code modularity between ground truth and generated Verilog code."""
    
#     prompt = f"""
#     Compare the modularity of the following Verilog/SystemVerilog codes. 
    
#     **Ground Truth Code:**
#     ```verilog
#     {ground_truth}
#     ```

#     **Generated Code:**
#     ```verilog
#     {generated_code}
#     ```

#     - Does the generated code follow modular design principles?
#     - Are parameters, functions, and submodules effectively used?
#     - Are unnecessary dependencies avoided?
#     - Assign a **score (0-5)** .

#     Format the response as:
#     ```
#     Score: X
#     ```
#     """

#     try:
#         result = openai_response(prompt)
        
#         # Extract score and explanation
#         lines = result.split("\n")
#         score = next((line.split(":")[1].strip() for line in lines if "Score:" in line), "N/A")        
#         return int(score) if score.isdigit() else "N/A"

#     except Exception as e:
#         return "Error", str(e)

In [345]:
# def evaluate_resource_efficiency(ground_truth, generated_code):
#     """Evaluates resource efficiency between ground truth and generated Verilog code."""
    
#     prompt = f"""
#     Compare the resource efficiency of the following Verilog/SystemVerilog codes. 
    
#     **Ground Truth Code:**
#     ```verilog
#     {ground_truth}
#     ```

#     **Generated Code:**
#     ```verilog
#     {generated_code}
#     ```

#     - Does the generated code use an excessive number of registers, combinational logic, or memory?
#     - Are there inefficient resource utilization patterns?
#     - Are logic and memory components minimized while maintaining functionality?
#     - Assign a **score (0-5)** .

#     Format the response as:
#     ```
#     Score: X
#     ```
#     """

#     try:
#         result = openai_response(prompt)
        
#         # Extract score and explanation
#         lines = result.split("\n")
#         score = next((line.split(":")[1].strip() for line in lines if "Score:" in line), "N/A")
        
#         return int(score) if score.isdigit() else "N/A"

#     except Exception as e:
#         return "Error", str(e)

In [346]:
# def evaluate_timing_pipeline_depth(ground_truth, generated_code):
#     """Evaluates timing and pipeline depth between ground truth and generated Verilog code."""
    
#     prompt = f"""
#     Compare the timing and pipeline depth of the following Verilog/SystemVerilog codes. 
    
#     **Ground Truth Code:**
#     ```verilog
#     {ground_truth}
#     ```

#     **Generated Code:**
#     ```verilog
#     {generated_code}
#     ```

#     - Does the generated code introduce unnecessary delays?
#     - Does it increase the critical path?
#     - Are pipeline stages maintained or improved?
#     - Assign a **score (0-5)** .

#     Format the response as:
#     ```
#     Score: X
#     ```
#     """

#     try:
#         result = openai_response(prompt)
        
#         # Extract score and explanation
#         lines = result.split("\n")
#         score = next((line.split(":")[1].strip() for line in lines if "Score:" in line), "N/A")
        
#         return int(score) if score.isdigit() else "N/A"

#     except Exception as e:
#         return "Error", str(e)

In [347]:
def compute_scaled_score(logical_equivalence, signal_behavior, edge_case_handling,
                         code_modularity, resource_efficiency, timing_pipeline_depth):
    """Combine the six 0-5 metric scores into one weighted score in [0.0, 1.0].

    Logical equivalence carries a weight of 0.50 and each of the other five
    metrics carries 0.10. The weighted sum is divided by the maximum
    achievable weighted sum (every metric at 5) and the result is rounded
    *up* to two decimal places.

    Args:
        logical_equivalence: Score for logical equivalence (0-5).
        signal_behavior: Score for signal behavior (0-5).
        edge_case_handling: Score for edge case handling (0-5).
        code_modularity: Score for code modularity (0-5).
        resource_efficiency: Score for resource efficiency (0-5).
        timing_pipeline_depth: Score for timing and pipeline depth (0-5).

    Returns:
        float: The scaled score, ceiled to two decimal places.
    """
    weights = {
        "Logical_Score": 0.50,
        "Signal_Score": 0.10,
        "EdgeCase_Score": 0.10,
        "Modularity_Score": 0.10,
        "ResourceEfficiency_Score": 0.10,
        "Timing_Score": 0.10
    }

    # Compute weighted sum
    weighted_sum = (
        logical_equivalence * weights["Logical_Score"] +
        signal_behavior * weights["Signal_Score"] +
        edge_case_handling * weights["EdgeCase_Score"] +
        code_modularity * weights["Modularity_Score"] +
        resource_efficiency * weights["ResourceEfficiency_Score"] +
        timing_pipeline_depth * weights["Timing_Score"]
    )

    # Maximum possible score with given weights
    max_score = 5.0  # Each variable has a max value of 5
    max_weighted_sum = sum(weight * max_score for weight in weights.values())

    # Scale the weighted sum to be between 0.0 and 1.0
    scaled_score = weighted_sum / max_weighted_sum

    # Binary floating point can leave the value a hair above an exact
    # hundredth (e.g. 3 * 0.1 -> 0.30000000000000004, giving
    # 6.000000000000001 here), which a bare ceil() would push up by a full
    # 0.01. Round the noise away first, then apply the intended ceiling.
    hundredths = round(scaled_score * 100, 9)
    scaled_score = math.ceil(hundredths) / 100.0

    return scaled_score


In [348]:
def make_json(ground_truth, generated_code):
    """Score one generated snippet and return the result as a JSON string.

    Calls ``evaluate_verilog_code`` to obtain the six metric scores, derives
    the overall ``scaled_score`` with ``compute_scaled_score`` and serializes
    everything into one JSON object with the keys ``scaled_score``,
    ``logical_equivalence``, ``signal_behavior``, ``edge_case_handling``,
    ``code_modularity``, ``resource_efficiency`` and ``timing_pipeline_depth``.

    If the evaluation did not produce a dict holding a numeric value for
    every metric (for example the request failed or the reply contained no
    parsable JSON), every field in the returned record is ``null``. This
    keeps one bad row from aborting a whole batch run.

    Args:
        ground_truth: Reference code.
        generated_code: Code under evaluation.

    Returns:
        str: JSON-encoded score record.
    """
    metric_names = (
        "logical_equivalence",
        "signal_behavior",
        "edge_case_handling",
        "code_modularity",
        "resource_efficiency",
        "timing_pipeline_depth",
    )

    scores = evaluate_verilog_code(ground_truth, generated_code)
    print(scores)

    def _is_number(value):
        # bool is a subclass of int but is not a meaningful score.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    usable = isinstance(scores, dict) and all(
        _is_number(scores.get(name)) for name in metric_names
    )

    if not usable:
        print("Warning: evaluation returned no usable scores; recording nulls for this row.")
        empty_record = {"scaled_score": None}
        empty_record.update({name: None for name in metric_names})
        return json.dumps(empty_record)

    metric_values = [scores[name] for name in metric_names]

    # metric_names is ordered to match compute_scaled_score's parameters.
    scaled_score = compute_scaled_score(*metric_values)

    score_json = {"scaled_score": scaled_score}
    score_json.update(zip(metric_names, metric_values))

    return json.dumps(score_json)

In [349]:
def add_score_column(df_source, df_target, col1, col2, new_col_name):
    """
    Score every row of df_source with make_json and store the results as a
    new column of df_target.

    Args:
        df_source (pd.DataFrame): Frame holding the input columns (benchmark dataset).
        df_target (pd.DataFrame): Frame that receives the new column (LaaJ scores dataset).
            It is modified in place.
        col1 (str): Column of df_source passed as make_json's first argument (ground truth).
        col2 (str): Column of df_source passed as make_json's second argument
            (generated code for a specific LLM).
        new_col_name (str): Name of the column created in df_target (specific LLM name).

    Returns:
        pd.DataFrame: The same df_target object, now carrying the new column.
    """

    def _score_row(row):
        # One make_json call per source row.
        return make_json(row[col1], row[col2])

    row_scores = df_source.apply(_score_row, axis=1)

    # Assign as a plain list so values are placed by position, not by index label.
    df_target[new_col_name] = row_scores.to_list()

    return df_target


In [350]:
# Load the benchmark results; the scoring cell below reads the "ground_truth"
# column and one generated-code column per model from this frame.
df = pd.read_csv("data/result.csv")
# Preview the first four rows.
df.head(4)

Unnamed: 0,ground_truth,aisuite_Qwen-2.5-72B-It,aisuite_Claude-3.5-Sonnet,together_Llama-3.3-70B-It
0,\n assign enabled = enable; \n dmac_data_move...,```verilog\nreg [ID_WIDTH-1:0] response_id_reg...,reg [2:0] state;\nreg [ID_WIDTH-1:0] active_id...,always @ (posedge s_axis_aclk) \nbegin \n i...
1,"\n wire or_1, or_2; \n wire [W-2:0] zero_comp...","wire or_1, or_2;\nwire [W-2:0] zero_comp;\nwir...","wire or_1, or_2;\nwire [W-2:0] zero_comp;\nwir...","wire or_1, or_2; \nwire [W-2:0] zero_comp; \nw..."
2,"\n input clk,rst; \n input rd_en; \n input wr...",```verilog\nparameter MAX_COUNT = 15;\nreg [3:...,"reg full, emptyp;\nreg [3:0] count;\nparameter...","reg [3:0] count;\nreg fullp, emptyp;\nalways @..."
3,\n reg rxd_buf; \n reg clk_buf; \n reg [7:...,```verilog\nreg rxd_buf;\nreg clk_buf;\nreg [7...,reg rxd_buf;\nreg clk_buf;\nreg [7:0] data_buf...,reg [7:0] rxd_buf;\nreg clk_buf;\nreg [7:0] da...


In [351]:
# Judge each model's generations against the ground truth; each model gets
# one column in output_df, named after its source column in df.
output_df = pd.DataFrame()
for model_column in (
    "together_Llama-3.3-70B-It",
    "aisuite_Qwen-2.5-72B-It",
    "aisuite_Claude-3.5-Sonnet",
):
    output_df = add_score_column(df, output_df, "ground_truth", model_column, model_column)

{'logical_equivalence': 1, 'signal_behavior': 2, 'edge_case_handling': 3, 'code_modularity': 2, 'resource_efficiency': 3, 'timing_pipeline_depth': 2}
{'logical_equivalence': 5, 'signal_behavior': 5, 'edge_case_handling': 5, 'code_modularity': 5, 'resource_efficiency': 5, 'timing_pipeline_depth': 5}
{'logical_equivalence': 4, 'signal_behavior': 4, 'edge_case_handling': 5, 'code_modularity': 4, 'resource_efficiency': 4, 'timing_pipeline_depth': 5}
{'logical_equivalence': 2, 'signal_behavior': 3, 'edge_case_handling': 3, 'code_modularity': 4, 'resource_efficiency': 4, 'timing_pipeline_depth': 3}
{'logical_equivalence': 1, 'signal_behavior': 2, 'edge_case_handling': 3, 'code_modularity': 2, 'resource_efficiency': 2, 'timing_pipeline_depth': 2}
{'logical_equivalence': 5, 'signal_behavior': 5, 'edge_case_handling': 5, 'code_modularity': 5, 'resource_efficiency': 5, 'timing_pipeline_depth': 5}
{'logical_equivalence': 3, 'signal_behavior': 3, 'edge_case_handling': 4, 'code_modularity': 4, 'res

In [353]:
# Preview the first four rows of per-model score records (JSON strings from make_json).
output_df.head(4)

Unnamed: 0,together_Llama-3.3-70B-It,aisuite_Qwen-2.5-72B-It,aisuite_Claude-3.5-Sonnet
0,"{""scaled_score"": 0.34, ""logical_equivalence"": ...","{""scaled_score"": 0.32, ""logical_equivalence"": ...","{""scaled_score"": 0.62, ""logical_equivalence"": ..."
1,"{""scaled_score"": 1.0, ""logical_equivalence"": 5...","{""scaled_score"": 1.0, ""logical_equivalence"": 5...","{""scaled_score"": 1.0, ""logical_equivalence"": 5..."
2,"{""scaled_score"": 0.84, ""logical_equivalence"": ...","{""scaled_score"": 0.68, ""logical_equivalence"": ...","{""scaled_score"": 0.9, ""logical_equivalence"": 4..."
3,"{""scaled_score"": 0.54, ""logical_equivalence"": ...","{""scaled_score"": 0.45, ""logical_equivalence"": ...","{""scaled_score"": 0.6, ""logical_equivalence"": 3..."
