In [None]:
from magistral_benchmark import MagistralBenchmarkConfig, MagistralReasoningBenchmark

In [None]:
# Benchmark configuration for a fine-tuned model that is loaded through a QLoRA adapter.
# NOTE(review): the exact meaning of every field is defined by MagistralBenchmarkConfig,
# which is not visible in this notebook; notes marked "presumably" need confirming there.
config = MagistralBenchmarkConfig(
    tokenizer_name="mistralai/Mistral-Nemo-Instruct-2407",  # presumably a Hugging Face Hub model id — confirm
    batch_size=6,
    min_vram_gb=8,
    test_file="./italic.jsonl",  # relative path to the evaluation set (JSON Lines, judging by the extension)
    max_new_tokens=500,
    max_eval_samples=1000,
    system_message="Sei un assistente utile e intelligente.",  # Italian: "You are a helpful and intelligent assistant."
    output_prefix="magistral_small_finetuned_reasoning",  # echoed later as the prefix of the saved results
    # QLoRA adapter settings
    qlora_adapter_path="./path/to/your/qlora/adapter",  # placeholder — point this at a real adapter before running
    merged_model_save_path=None,  # presumably None means the merged model is not written to disk — confirm
    # Quantization settings
    use_quantization=True,
    quantization_type="nf4",  # presumably 4-bit NormalFloat quantization — confirm against the config class
    quantization_compute_dtype="float16",
    # Optimizations
    use_flash_attention=True,
    use_torch_compile=True,
)

In [None]:
# Echo the settings chosen above so they can be checked before the benchmark is built and run.
config.print_config()

In [None]:
# Build the benchmark runner from the configuration; later cells call its methods.
# NOTE(review): presumably model/tokenizer loading happens here or in run_benchmark() — confirm.
benchmark = MagistralReasoningBenchmark(config)

In [None]:
# Run the evaluation. The call yields three values that the remaining cells consume:
#   results        - iterated below as dict-like entries with a 'reasoning_length' key
#   accuracy       - formatted below as a float
#   category_stats - handed to benchmark.analyse_results_by_category()
results, accuracy, category_stats = benchmark.run_benchmark()

In [None]:
# Report the overall accuracy returned by run_benchmark(), shown with four decimal places.
print(f"\nFine-tuned reasoning benchmark completed with {accuracy:.4f} accuracy")

In [None]:
# Check reasoning usage
# Summarise how often a reasoning trace was produced and how long those traces were.
# Each entry of `results` is read through its 'reasoning_length' key; an entry counts
# as "with reasoning" when that length is greater than zero.
total_questions = len(results)
valid_reasoning = [r for r in results if r['reasoning_length'] > 0]
# Average only over entries that actually contain reasoning; 0 when there are none.
avg_reasoning_length = (
    sum(r['reasoning_length'] for r in valid_reasoning) / len(valid_reasoning)
    if valid_reasoning
    else 0
)
# Guard the percentage too: an empty `results` would otherwise raise ZeroDivisionError.
reasoning_share = len(valid_reasoning) / total_questions * 100 if total_questions else 0.0
print(f"Questions with reasoning: {len(valid_reasoning)}/{total_questions} ({reasoning_share:.1f}%)")
print(f"Average reasoning length: {avg_reasoning_length:.0f} characters")

In [None]:
# Remind the reader which prefix the result files carry.
# NOTE(review): this cell only prints the prefix; the files themselves are presumably
# written by run_benchmark() — confirm.
print(f"Results saved with prefix: {config.output_prefix}")

In [None]:
# Per-category breakdown, driven by the category_stats value returned by run_benchmark().
benchmark.analyse_results_by_category(category_stats)

In [None]:
# Inspect individual predictions from the run.
# NOTE(review): the second argument (10) is presumably the number of samples to show — confirm.
benchmark.show_sample_predictions(results, 10)