# Load libraries

In [None]:
import json
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# Normalization function
def normalize_within_range(value, lower_bound=0, higher_bound=1):
    """Rescale *value* from ``[lower_bound, higher_bound]`` onto a 0-100 scale.

    Anything below ``lower_bound`` is floored to 0. There is no upper clip,
    so a value above ``higher_bound`` yields a result greater than 100.
    """
    span = higher_bound - lower_bound
    # Distance above the lower bound, never negative.
    above_floor = np.clip(value - lower_bound, 0, None)
    return above_floor / span * 100

In [None]:
# BBH subtask name -> number of answer choices for that subtask.
# The reciprocal (1 / choices) is used as the lower bound when a subtask's
# score is normalized.
bbh_subtasks = {
    "sports_understanding": 2,
    "tracking_shuffled_objects_three_objects": 3,
    "navigate": 2,
    "snarks": 2,
    "date_understanding": 6,
    "reasoning_about_colored_objects": 18,
    "object_counting": 19,
    "logical_deduction_seven_objects": 7,
    "geometric_shapes": 11,
    "web_of_lies": 2,
    "movie_recommendation": 6,
    "logical_deduction_five_objects": 5,
    "salient_translation_error_detection": 6,
    "disambiguation_qa": 3,
    "temporal_sequences": 4,
    "hyperbaton": 2,
    "logical_deduction_three_objects": 3,
    "causal_judgement": 2,
    "formal_fallacies": 2,
    "tracking_shuffled_objects_seven_objects": 7,
    "ruin_names": 6,
    "penguins_in_a_table": 5,
    "boolean_expressions": 2,
    "tracking_shuffled_objects_five_objects": 5,
}

In [None]:
# MUSR subtask name -> number of answer choices for that subtask.
# The reciprocal (1 / choices) is used as the lower bound when a subtask's
# score is normalized.
musr_subtasks = {
    "murder_mysteries": 2,
    "object_placements": 5,
    "team_allocation": 3,
}

# Load and process results data

In [None]:
# Load the evaluation results JSON file.
# Decode explicitly as UTF-8 (the standard JSON encoding) so the read does not
# depend on the platform's default locale encoding.
with open('/content/results_2024-09-23T09-45-37.626278.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
# Pull the model name, dtype (reported as "Precision") and revision
# out of the loaded results.
run_config = data['config']
model_name = data['model_name']
precision = run_config['model_dtype']
revision = run_config['model_revision']

In [None]:
# Normalize BBH subtask scores.
# Each subtask is rescaled so that 1 / num_choices maps to 0 and a perfect
# score maps to 100.
bbh_scores = []
for subtask, num_choices in bbh_subtasks.items():
    subtask_key = f'leaderboard_bbh_{subtask}'
    # Subtasks missing from the results file are skipped and do not count
    # towards the average.
    if subtask_key not in data['results']:
        continue
    bbh_raw_score = data['results'][subtask_key]['acc_norm,none']
    lower_bound = 1 / num_choices
    normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
    bbh_scores.append(normalized_score)

# Average BBH score.
# Fail with a clear message instead of a bare ZeroDivisionError when the
# results file contains none of the expected BBH subtasks.
if not bbh_scores:
    raise ValueError(
        "No 'leaderboard_bbh_*' subtask results found; cannot compute the BBH score."
    )
bbh_score = sum(bbh_scores) / len(bbh_scores)

In [None]:
# MATH: exact-match accuracy rescaled from [0, 1] to a 0-100 scale
# (lower bound is 0, so this is a plain percentage).
math_raw_score = data['results']['leaderboard_math_hard']['exact_match,none']
math_score = normalize_within_range(
    math_raw_score,
    lower_bound=0,
    higher_bound=1.0,
)

In [None]:
# GPQA: normalized accuracy rescaled so that 0.25 maps to 0 and 1.0 maps to 100.
# NOTE(review): 0.25 is presumably the random-guess baseline for 4-choice
# questions - confirm against the task definition.
gpqa_raw_score = data['results']['leaderboard_gpqa']['acc_norm,none']
gpqa_score = normalize_within_range(
    gpqa_raw_score,
    lower_bound=0.25,
    higher_bound=1.0,
)

In [None]:
# MMLU-PRO: accuracy rescaled so that 0.1 maps to 0 and 1.0 maps to 100.
# NOTE(review): 0.1 is presumably the random-guess baseline for 10-choice
# questions - confirm against the task definition.
mmlu_pro_raw_score = data['results']['leaderboard_mmlu_pro']['acc,none']
mmlu_pro_score = normalize_within_range(
    mmlu_pro_raw_score,
    lower_bound=0.1,
    higher_bound=1.0,
)

In [None]:
# IFEval: strict accuracy at instruction level and at prompt level,
# each converted to a percentage.
ifeval_results = data['results']['leaderboard_ifeval']
ifeval_inst_score = ifeval_results['inst_level_strict_acc,none'] * 100
ifeval_prompt_score = ifeval_results['prompt_level_strict_acc,none'] * 100

# The reported IFEval score is the mean of the two percentages.
ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2

In [None]:
# Raw (unscaled) IFEval strict accuracies, as stored in the results file.
ifeval_raw = data['results']['leaderboard_ifeval']
raw_inst_score = ifeval_raw['inst_level_strict_acc,none']
raw_prompt_score = ifeval_raw['prompt_level_strict_acc,none']

In [None]:
# Normalize MUSR subtask scores.
# Each subtask is rescaled so that 1 / choice_count maps to 0 and a perfect
# score maps to 100; every subtask listed in musr_subtasks must be present.
musr_scores = [
    normalize_within_range(
        data['results'][f'leaderboard_musr_{task}']['acc_norm,none'],
        1 / choice_count,
        1.0,
    )
    for task, choice_count in musr_subtasks.items()
]

# MUSR score is the plain mean over the subtasks.
musr_score = sum(musr_scores) / len(musr_scores)

In [None]:
# Overall score: unweighted mean of the six benchmark scores
# (computed from the unrounded values).
benchmark_total = (
    bbh_score
    + math_score
    + gpqa_score
    + mmlu_pro_score
    + musr_score
    + ifeval_score
)
overall_score = benchmark_total / 6

In [None]:
# Round every reported score to two decimal places, in one pass.
(
    bbh_score,
    math_score,
    gpqa_score,
    mmlu_pro_score,
    musr_score,
    ifeval_score,
    overall_score,
) = (
    round(score, 2)
    for score in (
        bbh_score,
        math_score,
        gpqa_score,
        mmlu_pro_score,
        musr_score,
        ifeval_score,
        overall_score,
    )
)

## Normalized results

In [None]:
# Collect the model metadata and the rounded scores into one summary dict.
results = {
    "Model name": model_name,
    "Precision": precision,
    "Revision": revision,
    "Average": overall_score,
    "IFEval": ifeval_score,
    "BBH": bbh_score,
    "MATH Lvl 5": math_score,
    "GPQA": gpqa_score,
    "MUSR": musr_score,
    "MMLU-PRO": mmlu_pro_score,
}

# Bare expression so the notebook displays the summary as the cell output.
results

{'Model name': 'meta-llama/Llama-3.2-1B',
 'Precision': 'torch.bfloat16',
 'Revision': 'a7c18587d7f473bfea02aa5639aa349403307b54',
 'Average': 4.03,
 'IFEval': 14.78,
 'BBH': 4.37,
 'MATH Lvl 5': 0.23,
 'GPQA': 0.0,
 'MUSR': 2.56,
 'MMLU-PRO': 2.26}