In [None]:
from magistral_benchmark import MagistralBenchmarkConfig, MagistralBenchmark

In [None]:
config = MagistralBenchmarkConfig(
    # --- Model and tokenizer ---
    model_name="mistralai/Magistral-Small-2506",
    # NOTE(review): the tokenizer comes from a different repository than the
    # model weights; presumably intentional — confirm.
    tokenizer_name="mistralai/Mistral-Nemo-Instruct-2407",
    # --- Evaluation data ---
    test_file="./data/test.jsonl",
    max_eval_samples=None,  # None -> evaluate every question ("Max samples: All")
    # --- Prompting and generation ---
    system_message="Sei un assistente utile e intelligente.",
    max_new_tokens=350,
    batch_size=8,
    # --- Hardware requirements and quantization ---
    min_vram_gb=8,
    use_quantization=True,
    quantization_type="nf4",
    quantization_compute_dtype="float16",
    # --- Speed-ups ---
    # In the recorded run Flash Attention 2 was not available and the
    # benchmark fell back to standard attention despite this flag.
    use_flash_attention=True,
    use_torch_compile=True,
    # --- Output files ---
    output_prefix="magistral_small",
)

In [None]:
# Build the benchmark runner from the configuration above. In the recorded
# run, constructing it reports the CUDA status, the GPU name and its VRAM
# (see the cell output below).
benchmark = MagistralBenchmark(config)

✅ CUDA optimizations enabled (TF32, cuDNN benchmark)
✅ CUDA available: True
GPU: NVIDIA GeForce RTX 3090
VRAM: 23.6 GB
✅ GPU has sufficient VRAM for the model


In [None]:
# Run the full evaluation. Per the recorded output this prints the
# configuration, loads the dataset and the model, runs a one-question smoke
# test, and then evaluates every question.
# Long-running: the recorded run took about 32 minutes for 10000 questions
# on an RTX 3090.
# Returns the per-question results, the overall accuracy (a fraction, e.g.
# 0.7597) and the per-category statistics used by the cells below.
results, accuracy, category_stats = benchmark.run_benchmark()


MAGISTRAL BENCHMARK

MAGISTRAL BENCHMARK CONFIGURATION
✓ Model: mistralai/Magistral-Small-2506
✓ Tokenizer: mistralai/Mistral-Nemo-Instruct-2407
✓ Test file: ./data/test.jsonl
✓ Max samples: All
✓ Batch size: 8
✓ Max new tokens: 350
✓ Min VRAM required: 8GB
✓ Quantization: nf4 (float16)
✓ Flash Attention 2: Enabled
✓ torch.compile: Enabled
✓ Output prefix: magistral_small
✓ System message: Sei un assistente utile e intelligente.

📚 Loading ITALIC dataset...
Loaded 10000 questions

Using 10000 questions for evaluation
Categories: {'art_history': 980, 'civic_education': 973, 'current_events': 92, 'geography': 979, 'history': 978, 'lexicon': 979, 'literature': 984, 'morphology': 140, 'orthography': 971, 'synonyms_and_antonyms': 971, 'syntax': 973, 'tourism': 980}
🔄 Loading mistralai/Magistral-Small-2506...
Loading tokenizer: mistralai/Mistral-Nemo-Instruct-2407


⚠️ Flash Attention 2 not available - using standard attention
Loading model with optimizations...


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Compiling model with torch.compile for optimization...
✅ Model compiled successfully!
✅ Model loaded successfully!
Model device: cuda:0
Model dtype: torch.float16
GPU Memory - Allocated: 13.19 GB, Available: 8.19 GB
🚀 Optimal batch size determined: 8

🧪 Testing inference...
Question: La frase "Secondo recenti sondaggi il candidato repubblicano gode di scarsissima popolarità: la poss...
Test response: 'C'
Expected answer: 'D'
Extracted answer: 'C'
Correct: False

STARTING EVALUATION

🔍 Evaluating mistralai/Magistral-Small-2506 on 10000 questions...


Evaluating: 100%|██████████| 1250/1250 [31:44<00:00,  1.52s/it]



📊 FINAL RESULTS:
Total questions: 10000
Correct answers: 7597
Accuracy: 0.7597 (75.97%)

📈 RESULTS BY CATEGORY:
------------------------------------------------------------
Category                      Accuracy  Correct    Total
------------------------------------------------------------
art_history                     71.53%      701      980
civic_education                 75.64%      736      973
current_events                  80.43%       74       92
geography                       82.02%      803      979
history                         82.00%      802      978
lexicon                         86.21%      844      979
literature                      74.80%      736      984
morphology                      47.14%       66      140
orthography                     62.31%      605      971
synonyms_and_antonyms           89.60%      870      971
syntax                          65.06%      633      973
tourism                         74.18%      727      980
------------------------------------------------------------

In [None]:
# Report the overall accuracy returned by run_benchmark() as a fraction with
# four decimal places (0.7597 in the recorded run, i.e. 75.97%).
print(f"\nBenchmark completed with {accuracy:.4f} accuracy")


Benchmark completed with 0.7597 accuracy


In [None]:
# Echo the output prefix taken from the config.
# NOTE(review): this line only prints the configured prefix; it does not
# itself check that any result files were actually written.
print(f"Results saved with prefix: {config.output_prefix}")

Results saved with prefix: magistral_small


In [None]:
# Optional: call individual benchmark components directly for custom analysis.
# In the recorded run this re-prints the same per-category accuracy table that
# run_benchmark() already displayed, built from the returned category_stats.
benchmark.analyse_results_by_category(category_stats)


📈 RESULTS BY CATEGORY:
------------------------------------------------------------
Category                      Accuracy  Correct    Total
------------------------------------------------------------
art_history                     71.53%      701      980
civic_education                 75.64%      736      973
current_events                  80.43%       74       92
geography                       82.02%      803      979
history                         82.00%      802      978
lexicon                         86.21%      844      979
literature                      74.80%      736      984
morphology                      47.14%       66      140
orthography                     62.31%      605      971
synonyms_and_antonyms           89.60%      870      971
syntax                          65.06%      633      973
tourism                         74.18%      727      980
------------------------------------------------------------


In [None]:
# Print individual predictions (category, question, expected vs. predicted
# answer, raw model output) for manual inspection.
# NOTE(review): the second argument (10) is presumably the number of examples
# to show — confirm against the method signature.
benchmark.show_sample_predictions(results, 10)


🔍 SAMPLE PREDICTIONS:

Example 1 ✅ CORRECT:
  Category: syntax
  Question: Nella frase "Lungo la riva ha raccolto alcune erbe spontanee", "Lungo la riva" è un sintagma:...
  Expected: C
  Predicted: C
  Raw output: 'C...'

Example 2 ✅ CORRECT:
  Category: literature
  Question: Come si chiamava il periodico, pubblicato dal 1764 al 1766, principale strumento di diffusione del p...
  Expected: C
  Predicted: C
  Raw output: 'C...'

Example 3 ✅ CORRECT:
  Category: syntax
  Question: Nella frase "le ho scritto una lettera con il cuore" che funzione logica svolge l'espressione "con i...
  Expected: C
  Predicted: C
  Raw output: 'C...'

Example 4 ✅ CORRECT:
  Category: civic_education
  Question: Le fonti di produzione si dividono in:...
  Expected: C
  Predicted: C
  Raw output: 'C...'

Example 5 ✅ CORRECT:
  Category: art_history
  Question: L'arte, così come rappresentata nel libro sulla "Storia dell'Arte", datato 1842, veniva suddivisa in...
  Expected: C
  Predicted: C