## LLM Consistency Testing with Mistral LLM

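This notebook contains code for testing prediction inconsistency of LLMs on code tasks. Although the title mentions Mistral, the model under test is chosen via `model_name` in the run cell (set to `GPT4O` in the example below).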

In [None]:
import os
import sys

In [None]:
# Resolve the directory two levels above the current working directory and
# put it on the module search path (presumably so the project-local imports
# in the following cell can be resolved — confirm against the repo layout).
curr_dir = os.getcwd()
par_dir = os.path.dirname(curr_dir)
proj_dir = os.path.dirname(par_dir)

# Guarded so that re-running this cell does not append duplicate entries.
if proj_dir not in sys.path:
    sys.path.append(proj_dir)

In [None]:
from prediction_inconsistency.prediction_inconsistency_tester import LLMConsistencyTester
from prediction_inconsistency.prompt_templates.prompt_template import PredictionInconsistencyPromptTemplate
from utility.constants import Tasks, PromptTypes, LexicalMutations, SyntacticMutations, LogicalMutations, ReasoningModels, NonReasoningModels, CruxEval, HumanEval

# Declaring constants

In [None]:

# Short aliases for the constants exposed by `utility.constants`, grouped by
# the class they are read from. The names on the left are used by later cells.

# Task types
OUTPUT_PREDICTION, INPUT_PREDICTION = (
    Tasks.OutputPrediction.NAME,
    Tasks.InputPrediction.NAME,
)

# Benchmark names
CRUXEVAL, HUMANEVAL = CruxEval.NAME, HumanEval.NAME

# Prompt types
ZERO_SHOT, ONE_SHOT, FEW_SHOT = (
    PromptTypes.ZERO_SHOT,
    PromptTypes.ONE_SHOT,
    PromptTypes.FEW_SHOT,
)

# Syntactic mutations
FOR2WHILE, FOR2ENUMERATE = (
    SyntacticMutations.FOR2WHILE,
    SyntacticMutations.FOR2ENUMERATE,
)

# Lexical mutations
RANDOM_MUTATION, SEQUENTIAL_MUTATION, LITERAL_FORMAT = (
    LexicalMutations.RANDOM,
    LexicalMutations.SEQUENTIAL,
    LexicalMutations.LITERAL_FORMAT,
)

# Logical mutations
BOOLEAN_LITERAL, DEMORGAN, COMMUTATIVE_REORDER = (
    LogicalMutations.BOOLEAN_LITERAL,
    LogicalMutations.DEMORGAN,
    LogicalMutations.COMMUTATIVE_REORDER,
)
CONSTANT_UNFOLD, CONSTANT_UNFOLD_ADD, CONSTANT_UNFOLD_MULT = (
    LogicalMutations.CONSTANT_UNFOLD,
    LogicalMutations.CONSTANT_UNFOLD_ADD,
    LogicalMutations.CONSTANT_UNFOLD_MULT,
)

# Reasoning model names (each model entry is indexed by its 'name' key)
GPT5 = ReasoningModels.GPT5['name']

# Non-reasoning model names
CODESTRAL, GPT4O, DEEPSEEK = (
    NonReasoningModels.CODESTRAL['name'],
    NonReasoningModels.GPT4O['name'],
    NonReasoningModels.DEEPSEEK_CHAT['name'],
)

In [None]:
# Choose the benchmark under test and derive the database name handed to the tester.
task_set = HUMANEVAL
database_name = "{}_Input_Output".format(task_set)

# NOTE(review): the meaning of `n` is defined by LLMConsistencyTester — confirm before changing it.
llmtester = LLMConsistencyTester(database_name, n=5)

In [None]:
def _public_attribute_values(namespace):
    """Return the value of every attribute of `namespace` whose name does not start with '_'."""
    return [getattr(namespace, attr) for attr in dir(namespace) if not attr.startswith("_")]


# Collect both model lists first, then print them as 1-based numbered lists.
reasoning_models = _public_attribute_values(ReasoningModels)
non_reasoning_models = _public_attribute_values(NonReasoningModels)

print('Reasoning models supported by this framework are:')
for position, entry in enumerate(reasoning_models, start=1):
    print(f"{position}: '{entry['name']}'")

print('=' * 50)

print('Non-reasoning models supported by this framework are:')
for position, entry in enumerate(non_reasoning_models, start=1):
    print(f"{position}: '{entry['name']}'")

In [None]:
# Number of documents in the tester's question database; `{}` is an empty filter
# (presumably matching every document, as in a MongoDB-style collection — confirm).
question_db = llmtester.question_database
num_tests = question_db.count_documents({})

The following example runs an output-prediction inconsistency test on every task in the HumanEval benchmark. To apply mutations, add the corresponding constants to the `mutations` list; for example, `mutations = [RANDOM_MUTATION]` applies the random mutation. The mutations available for prediction inconsistency testing are declared as constants above.

Kindly refer to `input_prediction_consistency_test_notebook_humaneval.ipynb` under the same parent directory for more details on the function parameters for HumanEval prediction inconsistency testing.

In [None]:

# %%script false --no-raise-error
# (Uncomment the cell magic above to skip this cell, e.g. during "Run All".)

# --- Run configuration ---
mutations = []
prompt_type = FEW_SHOT
model_name = GPT4O
task_type = OUTPUT_PREDICTION

# Label used in the result file name: the mutation names joined by "_",
# or a fixed marker when no mutation is applied.
mutation_str = "_".join(mutations) if mutations else "no_mutation"

# Results are written to <proj_dir>/results/<task_type>/<model_name>/.
results_dir = os.path.join(proj_dir, "results", task_type, model_name)
os.makedirs(results_dir, exist_ok=True)

# Build the file path with os.path.join so separators match `results_dir`.
output_file_path = os.path.join(
    results_dir, f"{task_set}_{prompt_type}_{mutation_str}.csv"
)

pass_count = llmtester.run_code_consistency_test(
    prompt_helper=PredictionInconsistencyPromptTemplate.return_appropriate_prompt(task_type, prompt_type),
    num_tests=num_tests,
    prompt_type=prompt_type,
    mutations=mutations,
    output_file_path=output_file_path,
    task_set=task_set,
    task_type=task_type,
    model_name=model_name,
)