# Mistral 7B Finetune CodeMath

After training with the Python script, we run the evaluations in this notebook.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# --- Evaluation configuration -------------------------------------------------
max_seq_length = 2048
test_dataset_path = "test.jsonl"

# Checkpoint directories for the fine-tuned model and its tokenizer
# (presumably written by the training script — confirm the paths before running).
# To evaluate the base model instead, load "mistralai/Mistral-7B-v0.1".
model_save_path = "model_save_path/mistral_7b_finetuned_trace_python"
tokenizer_save_path = "tokenizer_save_path/mistral_7b_finetuned_trace_python"

# --- Load the fine-tuned checkpoint -------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)
model = AutoModelForCausalLM.from_pretrained(model_save_path)

# Reuse EOS as the padding token and pad on the left side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"


### Testing on Traces

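Use BLEU to score the model's outputs against the reference traces. The BLEU cell below is currently commented out; the active cell prints the prompt, the expected output, and the generated output side by side for manual comparison. **TODO:** add a custom, task-specific evaluation here later.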

In [None]:
from datasets import load_dataset

# JSON file holding the trace records that are loaded for evaluation below.
json_file_path = "./python_states_singleline.json"

# Full template: instruction + state/code (first {}) + reference answer
# (second {}), terminated by </s>. Not referenced by any visible cell of this
# notebook; kept for reference.
trace_prompt = """<s>[INST] Below is an input which contains the state of variables and code that acts upon these variables or not. Given the state and the code give the state after the code executes for each variable. Be very careful. You should clearly outline your intermediate steps and your final answer should be a newline with exactly the variables and their values. Here is the State and Code. {}
Now generate the final state for each variable. Generate intermediate outputs.[/INST] {}</s>"""

# Inference template: only the instruction, left open after [/INST] so the
# model writes the answer itself. It must NOT end with </s>: a trailing EOS
# marks the sequence as already finished before generation starts.
trace_prompt2 = """<s>[INST] Input: {}
Now generate the final state for each variable. Generate intermediate outputs.[/INST]"""

# End-of-sequence token string taken from the loaded tokenizer. No visible cell
# of this notebook reads EOS_TOKEN; presumably carried over from the training
# script, where EOS is appended to each example — TODO confirm or remove.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    """Render one inference prompt per record of a batched mapping.

    Args:
        examples: Column-oriented batch that provides the "input" and
            "output" columns (assumed to be equal-length sequences).

    Returns:
        A dict with a single "text" key holding the list of prompts built
        from ``trace_prompt2``, one per (input, output) pair.
    """
    sources = examples["input"]
    targets = examples["output"]
    # The targets are only paired with the sources (zip stops at the shorter
    # column); they never enter the prompt text.
    prompts = [trace_prompt2.format(source) for source, _target in zip(sources, targets)]
    return {"text": prompts}

# Load every record of the JSON trace file as a single split.
dataset = load_dataset(
    "json",
    data_files=json_file_path,
    split="train",
)

# Keep only the last `num_examples_to_select` records (or all of them when the
# file is smaller than that) as the evaluation subset.
num_examples_to_select = 10
subset_start = max(len(dataset) - num_examples_to_select, 0)
test_dataset = dataset.select(range(subset_start, len(dataset)))

# Add the "text" column containing the rendered inference prompt.
test_dataset_formatted = test_dataset.map(
    formatting_prompts_func,
    batched=True,
)


In [None]:
# from datasets import load_metric
# from transformers import pipeline
# import numpy as np
# from tqdm import tqdm



# gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)#, device=0)  # Assuming using GPU 0

# # Function to generate predictions for the test dataset
# def generate_predictions(dataset, gen_pipeline):
#     predictions = []
#     # Use tqdm to add a progress bar
#     for example in tqdm(dataset, desc="Generating predictions"):
#         # Generate text based on the input
#         input_text = example['text']  # Ensure this matches your dataset structure
#         generated_text = gen_pipeline(input_text, max_length=2048, num_return_sequences=1)[0]['generated_text']
#         predictions.append(generated_text)
#     return predictions

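# # Load the BLEU metric
# # NOTE: `datasets.load_metric` is deprecated (and removed in recent `datasets` releases);
# # switch to `evaluate.load("bleu")` before re-enabling this cell.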
# bleu = load_metric("bleu")

# # Generate predictions for the formatted test dataset
# predictions = generate_predictions(test_dataset_formatted, gen_pipeline)

# # Prepare references in the format expected by the BLEU metric (a list of lists)
# references = [[example['output'].split()] for example in test_dataset_formatted]  # Adjust based on your dataset structure

# # Prepare predictions in the format expected by the BLEU metric
# predictions_processed = [pred.split() for pred in predictions]

# # Calculate BLEU score
# results = bleu.compute(predictions=predictions_processed, references=references)

# print(f"BLEU score: {results['bleu'] * 100:.2f}")

In [None]:
def generate_predictions_and_compare(dataset, gen_pipeline, max_length=2048):
    """Generate a completion for every example and print it beside the reference.

    Args:
        dataset: Iterable of mapping-like records, each with a "text" entry
            (the prompt) and an "output" entry (the expected answer).
        gen_pipeline: Callable invoked as
            ``gen_pipeline(prompt, max_length=..., num_return_sequences=1)``
            that returns a sequence whose first item has a "generated_text" key.
        max_length: Forwarded to ``gen_pipeline`` as ``max_length``.
            Defaults to 2048, the value that used to be hard-coded.

    Returns:
        None. The comparison is written to stdout.
    """
    # `tqdm` is not imported by any live cell, so import it here. The progress
    # bar is purely cosmetic: fall back to plain iteration when it is missing.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    for example in tqdm(dataset, desc="Generating predictions and comparing"):
        # Generate text based on the input prompt.
        input_text = example['text']
        candidates = gen_pipeline(input_text, max_length=max_length, num_return_sequences=1)
        generated_text = candidates[0]['generated_text']

        # Print input, expected output, and generated output for comparison.
        print("\nInput Text:\n", input_text)
        print("\nExpected Output:\n", example['output'])
        print("\nGenerated Output:\n", generated_text)
        print("-" * 80)

# The only construction of `gen_pipeline` sits in a commented-out cell earlier
# in the notebook, so on a fresh kernel this call failed with NameError.
# Build the text-generation pipeline here from the already loaded model and
# tokenizer so the notebook survives Restart & Run All.
from transformers import pipeline

gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

generate_predictions_and_compare(test_dataset_formatted, gen_pipeline)
