In [3]:
import os, json
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F

# Load the pre-computed generations and their scores. JSON text is UTF-8, so the encoding
# is pinned here instead of being left to the platform's locale default.
with open("results/all_results_processed.json", encoding="utf-8") as f:
    all_results = json.load(f)

In [4]:
# Run-level metadata: generator model, evaluation model, sampling parameters, and dataset size.
all_results['metadata']

{'model': 'Qwen/Qwen3-4B',
 'eval_model': 'gpt-5-nano-2025-08-07',
 'sampling_params': {'n': 50, 'temperature': 0.2, 'max_tokens': 512},
 'total_questions': 100,
 'total_candidates': 5000,
 'dataset': 'infobench'}

In [5]:
# Number of question records (should equal metadata['total_questions']).
len(all_results['results'])

100

In [6]:
# Fields stored for each question record.
all_results['results'][0].keys()

dict_keys(['question_id', 'prompt', 'question_data', 'candidates'])

In [7]:
# Candidates stored per question (should equal metadata['sampling_params']['n']).
len(all_results['results'][0]['candidates'])

50

In [8]:
# Fields stored for each candidate generation.
all_results['results'][0]['candidates'][0].keys()

dict_keys(['candidate_id', 'generated_text', 'finish_reason', 'token_ids', 'generation_len', 'full_chat', 'scores'])

In [None]:
# TIP: use these "generated_text" fields to count unique generations for each infobench question
# (i.e. the number of distinct strings among one question's candidates).
all_results['results'][0]['candidates'][0]['generated_text']

'Sure! Below is a Python code snippet that constructs a two-hidden layer feedforward neural network using PyTorch\'s `torch.nn` module. The network includes an input layer, two hidden layers with ReLU activation, and an output layer. The number of neurons in the hidden layers is chosen between 32 and 128 to maintain a reasonably sized network. Comments are included to clarify each step.\n\n```python\nimport torch\nimport torch.nn as nn\n\n# Define the neural network class\nclass TwoHiddenLayerNetwork(nn.Module):\n    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):\n        super(TwoHiddenLayerNetwork, self).__init__()\n        \n        # Input layer: input_size -> hidden_size1\n        self.layer1 = nn.Linear(input_size, hidden_size1)\n        \n        # Hidden layer 1: hidden_size1 -> hidden_size2\n        self.layer2 = nn.Linear(hidden_size1, hidden_size2)\n        \n        # Hidden layer 2: hidden_size2 -> output_size\n        self.layer3 = nn.Linear(hidd

In [None]:
# TIP: Use half precision when loading the model for efficiency. Providing one leading example
model_name = "Qwen/Qwen3-4B"

# The tokenizer and the model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in bfloat16, move them to the first GPU, and switch to eval mode
# (the model is only used for scoring here, not training).
model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16)
model = model.to('cuda:0').eval()

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 56.06it/s]


In [None]:
all_results['results'][42]['candidates'][0].keys() 
# TIP: Remember that each of the 100 questions has 50 candidates (5000 candidates in total).
# You will put your calculated scores in each candidate's 'scores' dict.

dict_keys(['candidate_id', 'generated_text', 'finish_reason', 'token_ids', 'generation_len', 'full_chat', 'scores'])

In [13]:
# TIP: We are providing a strict superset of the information you need to complete this problem.
# The JSON contains multiple different forms of each example and candidate. Here is the code we
# originally used to make these forms. It is wrapped in a string literal, so it is shown for
# reference only and is not executed.

"""
for each in all_results['results']:
    og_input = each['question_data']['input']
    og_instruction = each['question_data']['instruction']
    intermediate_prompt = each['prompt']
    chat_form = [{"role": "user", "content": intermediate_prompt}]
    processed_prompt = tokenizer.apply_chat_template(chat_form, tokenize=False, add_generation_prompt=True, enable_thinking=False)
    prompt_tokens = tokenizer(processed_prompt)['input_ids']

    d = {'og_input': og_input,
         'og_instruction': og_instruction,
         'intermediate_prompt': intermediate_prompt,
         'chat_form': chat_form,
         'processed_prompt': processed_prompt,
         'prompt_tokens': prompt_tokens}
    
    each['prompt'] = d

    for candidate in each['candidates']:
       full_chat_form = [{"role": "user", "content": intermediate_prompt}, {"role": "assistant", "content": candidate['generated_text']}]
       full_chat_templated = tokenizer.apply_chat_template(full_chat_form, tokenize=False, add_generation_prompt=False)
       full_chat_tokens = tokenizer.apply_chat_template(full_chat_form)
       dd = {
           'full_chat_form': full_chat_form,
           'processed_full_chat': full_chat_templated,
           'full_chat_tokens': full_chat_tokens, 
       }
       candidate['full_chat'] = dd
       candidate['scores'] = {'infobench': candidate.pop('infobench_score'), # (just bringing this down into the scores dict)
                              'qwen3_4b': None,
                              'qwen3_13b': None,
                              'r_scalar': None,
                              'r_pairwise': None,
                              'mbr_bleu': None,
                              'mbr_bert': None,
                              }
"""

'\nfor each in all_results[\'results\']:\n    og_input = each[\'question_data\'][\'input\']\n    og_instruction = each[\'question_data\'][\'instruction\']\n    intermediate_prompt = each[\'prompt\']\n    chat_form = [{"role": "user", "content": intermediate_prompt}]\n    processed_prompt = tokenizer.apply_chat_template(chat_form, tokenize=False, add_generation_prompt=True, enable_thinking=False)\n    prompt_tokens = tokenizer(processed_prompt)[\'input_ids\']\n\n    d = {\'og_input\': og_input,\n         \'og_instruction\': og_instruction,\n         \'intermediate_prompt\': intermediate_prompt,\n         \'chat_form\': chat_form,\n         \'processed_prompt\': processed_prompt,\n         \'prompt_tokens\': prompt_tokens}\n    \n    each[\'prompt\'] = d\n\n    for candidate in each[\'candidates\']:\n       full_chat_form = [{"role": "user", "content": intermediate_prompt}, {"role": "assistant", "content": candidate[\'generated_text\']}]\n       full_chat_templated = tokenizer.apply_chat

In [None]:
all_results['results'][0]['candidates'][20]['scores']
# TIP: this is where you'll want to fill in the different scores you calculate.
# The "oracle" infobench score has been provided for you; the other entries start as None.

{'infobench': 1.0,
 'qwen3_4b': None,
 'qwen3_13b': None,
 'r_scalar': None,
 'r_pairwise': None,
 'mbr_bleu': None,
 'mbr_bert': None}

In [14]:
# Worked example: question 0, candidate 20.
example_question = all_results['results'][0]
example_candidate = example_question['candidates'][20]

# Token ids of the chat-templated prompt (including the generation prompt).
in_toks = example_question['prompt']['prompt_tokens']
# Token ids stored with the candidate (presumably the sampled completion - confirm).
out_toks = example_candidate['token_ids']
# Token ids of the full chat: user prompt plus this candidate as the assistant turn.
all_toks = example_candidate['full_chat']['full_chat_tokens']

# Batch of one sequence, placed on the same device as the model.
model_input = torch.tensor([all_toks]).to('cuda:0')

In [None]:
with torch.no_grad():
    # One forward pass over the whole chat; logits has shape (1, seq_len, vocab_size).
    logits = model(model_input).logits
    # Normalise in float32. The model was loaded in bfloat16, and keeping the
    # log-probabilities in that dtype (8 significant bits) adds rounding error to every
    # term of a sum that later runs over hundreds of tokens.
    log_probs = F.log_softmax(logits[0], dim=-1, dtype=torch.float32)

# TIP: we're giving you this much with the above example because the harder part is yet to come

In [17]:
# Sanity-check the lengths before slicing. The full chat (623) is two tokens longer than
# prompt + completion (109 + 512) - presumably the template's closing tokens; verify.
model_input.shape, len(in_toks), len(out_toks)

(torch.Size([1, 623]), 109, 512)

In [None]:
# TIP: calculate a logprob score by accumulating a sum over the relevant
# logits... For the exact example above, the value should be greater than -30
# (as in, less negative than -30)
def sum_completion_logprobs(log_probs, token_ids, prompt_len, completion_len):
    """Sum the log-probabilities of the completion tokens of one sequence.

    A causal LM's output at position t is its prediction for the token at position
    t + 1, so the token at index i is scored with row i - 1 of ``log_probs``.

    Args:
        log_probs: (seq_len, vocab_size) tensor of log-probabilities for the sequence.
        token_ids: 1-D integer tensor with the seq_len token ids fed to the model,
            on the same device as ``log_probs``.
        prompt_len: number of leading prompt tokens; these are conditioned on, not scored.
        completion_len: number of tokens to score, starting at index ``prompt_len``.

    Returns:
        The summed log-probability as a Python float (0.0 for an empty completion).

    Raises:
        ValueError: if the requested span does not fit in the sequence, or if there is
            no preceding token to predict the first completion token from.
    """
    end = prompt_len + completion_len
    if prompt_len < 1 or completion_len < 0 or end > token_ids.shape[0]:
        raise ValueError(
            f"cannot score tokens [{prompt_len}, {end}) of a sequence of length {token_ids.shape[0]}"
        )
    targets = token_ids[prompt_len:end]
    # Shift by one: the row at index i - 1 is the distribution over the token at index i.
    predicted = log_probs[prompt_len - 1:end - 1]
    token_logprobs = predicted.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    # Accumulate in float32 so the sum is accurate even if log_probs is half precision.
    return token_logprobs.float().sum().item()


# Score only the completion span. Anything the chat template appends after it is skipped.
# NOTE(review): assumes all_toks starts with in_toks and that the next len(out_toks) ids
# are the completion - confirm for candidates whose text re-tokenizes differently.
cumulative_logprobs = sum_completion_logprobs(
    log_probs, model_input[0], len(in_toks), len(out_toks)
)
cumulative_logprobs

... and now you have the rest of the problem to do :) Just remember to put each score you compute
into the `scores` dict of the corresponding candidate.