# Turbulence Benchmark Testing

In [None]:
import os
import sys
from dotenv import load_dotenv

In [None]:
# Resolve the project root, two directory levels above the notebook's working
# directory, so the project packages imported in the next cell can be found.
curr_dir = os.getcwd()
parent_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(parent_dir)

# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

# Load variables from the .env file; later cells read them with os.getenv().
load_dotenv()

In [None]:
from utility.constants import LexicalMutations, PromptTypes, CodeGeneration, ReasoningModels, NonReasoningModels, SamplingMethods, InputPrediction, OutputPrediction, SyntacticMutations, LogicalMutations, Turbulence
from baseline.turbulence_benchmark.turbulence_tester import TurbulenceTester

In [None]:
# Short module-level aliases for the framework constants imported above, so
# the experiment cells below can refer to them by bare name.

## Declaring Prompt Type Constants
ZERO_SHOT = PromptTypes.ZERO_SHOT
ONE_SHOT = PromptTypes.ONE_SHOT
FEW_SHOT = PromptTypes.FEW_SHOT

## Declaring Mutation Constants
# Syntactic mutations
FOR2WHILE = SyntacticMutations.FOR2WHILE
FOR2ENUMERATE = SyntacticMutations.FOR2ENUMERATE

# Lexical mutations
RANDOM_MUTATION = LexicalMutations.RANDOM
SEQUENTIAL_MUTATION = LexicalMutations.SEQUENTIAL
LITERAL_FORMAT = LexicalMutations.LITERAL_FORMAT

# Logical mutations
BOOLEAN_LITERAL = LogicalMutations.BOOLEAN_LITERAL
DEMORGAN = LogicalMutations.DEMORGAN
COMMUTATIVE_REORDER = LogicalMutations.COMMUTATIVE_REORDER
CONSTANT_UNFOLD = LogicalMutations.CONSTANT_UNFOLD
CONSTANT_UNFOLD_ADD = LogicalMutations.CONSTANT_UNFOLD_ADD
CONSTANT_UNFOLD_MULT = LogicalMutations.CONSTANT_UNFOLD_MULT


## Declaring Benchmark Name Constants
TURBULENCE = Turbulence.NAME

## Declaring Reasoning Model Name Constants
# Each model entry is a mapping; only its 'name' value is kept here.
GPT5 = ReasoningModels.GPT5['name']

## Declaring Non-Reasoning Model Name Constants
CODESTRAL = NonReasoningModels.CODESTRAL['name']
GPT4O = NonReasoningModels.GPT4O['name']
DEEPSEEK = NonReasoningModels.DEEPSEEK_CHAT['name']

## Declaring Sampling Methods for Turbulence Dataset
SYSTEMATIC = SamplingMethods.SYSTEMATIC
# NOTE: RANDOM is the sampling method; the lexical mutation is RANDOM_MUTATION.
RANDOM = SamplingMethods.RANDOM

## Declaring Task Types
INPUT_PREDICTION = InputPrediction.NAME
OUTPUT_PREDICTION = OutputPrediction.NAME
CODE_GENERATION = CodeGeneration.NAME

In [None]:
# Gather every public (non-underscore) attribute of the two model registries.
reasoning_models = [
    getattr(ReasoningModels, attr_name)
    for attr_name in dir(ReasoningModels)
    if not attr_name.startswith("_")
]
non_reasoning_models = [
    getattr(NonReasoningModels, attr_name)
    for attr_name in dir(NonReasoningModels)
    if not attr_name.startswith("_")
]

# Print both registries as 1-based numbered lists of model names.
print('Reasoning models supported by this framework are:')
for position, entry in enumerate(reasoning_models, start=1):
    print(f"{position}: '{entry['name']}'")

print('=' * 50)

print('Non-reasoning models supported by this framework are:')
for position, entry in enumerate(non_reasoning_models, start=1):
    print(f"{position}: '{entry['name']}'")

In [None]:
# Benchmark this notebook runs against; also used in the result filenames below.
task_set = TURBULENCE

# Build the tester from the collection and database names read from the
# environment (populated by load_dotenv()).
# NOTE(review): the meaning of `n` is defined by TurbulenceTester - confirm.
try:
    llmtester = TurbulenceTester(
        qn_database=os.getenv('MONGODB_TURBULENCE_COLLECTION'),
        base_db=os.getenv('MONGODB_TURBULENCE_DATABASE'),
        n=5,
    )
except Exception as e:
    print(f'llmtester could not launch due to the following error: {e}')
    # Re-raise: every later cell needs `llmtester`, so carrying on would only
    # surface a confusing NameError or silently reuse a stale instance left
    # over from an earlier run of this cell.
    raise

In [None]:
# Count of all documents in the tester's question database (empty filter).
# Passed as `num_tests` in the experiment cells below to run every question.
# NOTE(review): presumably a MongoDB collection - confirm in TurbulenceTester.
num_tests = llmtester.question_database.count_documents({})

In [None]:
# Show the mutation names that can be passed to the code-generation experiment.
valid_mutations = CodeGeneration.MUTATIONS
print("These are the valid mutation names for code generation:")

# Drop LITERAL_FORMAT *before* numbering so the printed indices are contiguous;
# numbering first and skipping afterwards left a gap in the list.
listed_mutations = [
    mutation for mutation in valid_mutations if mutation != LITERAL_FORMAT
]
for idx, mutation in enumerate(listed_mutations, start=1):
    print(idx, mutation)

# Run your experiments

Two experiments on the Turbulence benchmark can be run from this notebook
- Code Generation
- Output/Input Prediction

### Turbulence Output and Input Prediction with MuCoCo

Use ```run_prediction_inconsistency_test```. The following table outlines the valid parameters for this function.

| Parameter              | Type        | Description                                                                                                              |
| ---------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------ |
| `output_file_path`     | `str`       | Full path to the CSV where predictions and metrics are saved. Filename is built from model, task type, and mutation tag. |
| `num_tests`            | `int`       | Number of test turbulence questions to evaluate. Set to `num_tests` to run all Turbulence question templates.                                                   |
| `mutations`            | `List[str]` | Mutation operators to apply (e.g., `["FOR2WHILE"]`, `["CONSTANT_UNFOLD"]`). Empty list means **no_mutation**.            |
| `model_name`           | `str`       | Identifier of the LLM under test (e.g., `GPT4O`). Used for routing and naming.                                           |
| `sampling_method`      | `str`       | Optional. Method used to pick the `num_samples_per_task` test cases (e.g., `RANDOM` or `SYSTEMATIC`). Must be supplied together with `num_samples_per_task`; remove both if running all templates. |
| `num_samples_per_task` | `int`       | Optional. Number of samples to run per task to assess consistency. Each Turbulence question template has up to 100 tasks; use this parameter to specify the sample of tasks to run. Use in conjunction with the `sampling_method` parameter, or remove both if running all templates. |
| `task_type`            | `str`       | Task to test on (e.g., `OUTPUT_PREDICTION` or `INPUT_PREDICTION`).                                |
| `continue_from_task`   | `str`       | Optional parameter for starting evaluation from a specified task ID (e.g., `"TurbulenceQ27"`)                                                 |


### Output Prediction Example

The following code sample runs a sample of Turbulence benchmark questions for all question templates (Q1 to Q60). The task type is output prediction and model used is GPT4O. A `random` sampling method is used to select `5` random samples to test on per question template.

Do fill in the `.env` with the GPT4O API key as specified in `.env.example` and change the seed value in `.env` as desired. The default seed value is `1234`.

To run the full Turbulence Output Prediction experiment, remove the `num_samples_per_task` and `sampling_method` parameters.

In [None]:
# %%script false --no-raise-error
# Experiment configuration: no mutations, zero-shot prompt, GPT4O.
mutations = []
prompt_type = ZERO_SHOT
model_name = GPT4O
task_type = OUTPUT_PREDICTION

# Forming the results directory. It is keyed by the task type so that output
# prediction results do not land in the code-generation directory, where they
# would share a file path with the other experiments in this notebook.
results_dir = os.path.join(proj_dir, 'results', task_type, model_name)
os.makedirs(results_dir, exist_ok=True)

# An empty mutation list is tagged "no_mutation" in the filename.
mutation_str = "_".join(mutations) if mutations else "no_mutation"
# modify the output file path here as desired.
output_file_path = os.path.join(
    results_dir, f"{task_set}_{prompt_type}_{mutation_str}.csv"
)

# Run all question templates, sampling 5 random tasks per template.
pass_count = llmtester.run_prediction_inconsistency_test(
    output_file_path=output_file_path,
    num_tests=num_tests,
    mutations=mutations,
    model_name=model_name,
    sampling_method=RANDOM,
    num_samples_per_task=5,
    task_type=task_type,
)

print(f"Results saved in {output_file_path}")


### Input Prediction Example

The following code sample runs a sample of Turbulence benchmark questions for all question templates (Q1 to Q60). The task type is input prediction and model used is GPT4O. A `random` sampling method is used to select `5` random samples to test on per question template.

Do fill in the `.env` with the GPT4O API key as specified in `.env.example` and change the seed value in `.env` as desired. The default seed value is `1234`.

To run the full Turbulence Input Prediction experiment, remove the `num_samples_per_task` and `sampling_method` parameters.

In [None]:
# %%script false --no-raise-error
# Experiment configuration: no mutations, zero-shot prompt, GPT4O.
mutations = []
prompt_type = ZERO_SHOT
model_name = GPT4O
task_type = INPUT_PREDICTION

# Forming the results directory. It is keyed by the task type so that input
# prediction results do not land in the code-generation directory, where they
# would share a file path with the other experiments in this notebook.
results_dir = os.path.join(proj_dir, 'results', task_type, model_name)
os.makedirs(results_dir, exist_ok=True)

# An empty mutation list is tagged "no_mutation" in the filename.
mutation_str = "_".join(mutations) if mutations else "no_mutation"
# modify the output file path here as desired.
output_file_path = os.path.join(
    results_dir, f"{task_set}_{prompt_type}_{mutation_str}.csv"
)

# Run all question templates (num_tests), matching the description above and
# the other example cells; lower num_tests for a quick smoke run.
pass_count = llmtester.run_prediction_inconsistency_test(
    output_file_path=output_file_path,
    num_tests=num_tests,
    mutations=mutations,
    model_name=model_name,
    sampling_method=RANDOM,
    num_samples_per_task=5,
    task_type=task_type,
)

print(f"Results saved in {output_file_path}")


### Turbulence Code Generation with MuCoCo

Use ```run_code_generation_test```. The following table outlines the valid parameters for this function.

| Parameter              | Type        | Description                                                                                                              |
| ---------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------ |
| `output_file_path`     | `str`       | Full path to the CSV where predictions and metrics are saved. Filename is built from model, task type, and mutation tag. |
| `num_tests`            | `int`       | Number of test turbulence questions to evaluate. Set to `num_tests` to run all question templates.                                                                               |
| `mutations`            | `List[str]` | Mutation operators to apply (e.g., `["FOR2WHILE"]`, `["CONSTANT_UNFOLD"]`). Empty list means **no_mutation**.            |
| `model_name`           | `str`       | Identifier of the LLM under test (e.g., `GPT4O`). Used for routing and naming.                                           |
| `sampling_method`      | `str`       | How to sample test cases (e.g., `RANDOM` or `SYSTEMATIC`).                                                                               |
| `num_samples_per_task` | `int`       | Number of samples to run per task to assess consistency. Each Turbulence question template has up to 100 tasks; use this parameter to specify the sample of tasks to run. |
| `continue_from_task`   | `str`       | Optional parameter for starting evaluation from a specified task ID (e.g., `"TurbulenceQ27"`)                                                 |


### Code Generation Example

The following code sample runs a sample of Turbulence benchmark questions for all question templates (Q1 to Q60). The task type is code generation and the model used is GPT4O. A `random` sampling method is used to select `5` random samples to test on per question template.

Do fill in the `.env` with the GPT4O API key as specified in `.env.example` and change the seed value in `.env` as desired. The default seed value is `1234`.

To run the full Turbulence Code Generation experiment, remove the `num_samples_per_task` and `sampling_method` parameters.

In [None]:
# %%script false --no-raise-error
# Experiment configuration: GPT4O, zero-shot prompt, no mutations.
model_name = GPT4O
prompt_type = ZERO_SHOT
mutations = []

# Results go under <project>/results/code_generation/<model>; create on demand.
results_dir = os.path.join(proj_dir, f'results/code_generation/{model_name}')
os.makedirs(results_dir, exist_ok=True)

# Filename tag: the mutation names joined by "_", or "no_mutation" if none.
if mutations:
    mutation_str = "_".join(mutations)
else:
    mutation_str = "no_mutation"
output_file_path = f"{results_dir}/{task_set}_{prompt_type}_{mutation_str}.csv"

# Run all question templates, sampling 5 random tasks per template.
pass_count = llmtester.run_code_generation_test(
    output_file_path=output_file_path,
    num_tests=num_tests,
    mutations=mutations,
    model_name=model_name,
    sampling_method=RANDOM,
    num_samples_per_task=5,
)

print(fr"Results saved in {output_file_path}")
