In [2]:
import os 
os.environ["HF_HOME"] = "/scratch/ezq9qu/models/cache"
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from peft import get_peft_model, LoraConfig
from tqdm import tqdm
from lm_eval import evaluator
import evaluate
from sklearn.model_selection import train_test_split

In [3]:
import random

# 1. Load Data
all_data = pd.read_csv("training_data(1).csv")
all_data = all_data.rename(columns={"nws_forecast": "prompt_text", "human_forecast": "Response"})

# 2. SPLIT FIRST (Crucial Step)
# We split immediately so we can isolate the "Training Pool" for our few-shot examples
df_train, df_temp = train_test_split(all_data, train_size=0.8, random_state=126)
df_val, df_test = train_test_split(df_temp, train_size=0.5, random_state=126)

# 3. Create Example Pool (Strictly from Train)
example_pool = df_train[['prompt_text', 'Response']].to_dict('records')

# 4. Define Dynamic Formatting Function
def construct_dynamic_prompt(row, include_answer=True):
    """
    Constructs a prompt with 3 random examples from the training pool.
    """
    # Sample 3 random examples
    try:
        samples = random.sample(example_pool, 3)
    except ValueError:
        samples = example_pool 

    examples_text = ""
    for i, sample in enumerate(samples, 1):
        examples_text += (
            f"Example {i}:\n"
            f"Input: {sample['prompt_text']}\n"
            f"Response: {sample['Response']}\n\n"
        )

    instruction_text = (
        "Output a human-readable surf-forecast similar to that of a veteran surf-observer. "
        "The response should take into account the winds, sea-state, and wave period. "
        "The final output should be a few short sentences, with some surfing lingo and flair."
    )

    prompt = (
        f"Q: {instruction_text}\n\n"
        f"Here are some examples of how to respond:\n{examples_text}"
        f"Now, respond to the following forecast data:\n"
        f"Input: {row['prompt_text']}\n"
        f"A: " 
    )
    
    # For training, we append the answer. For generation, we might want just the prompt.
    if include_answer:
        return prompt + str(row['Response'])
    else:
        return prompt

# 5. Apply to Dataframes
# Use .copy() to avoid SettingWithCopy warnings
df_train = df_train.copy()
df_val = df_val.copy()
df_test = df_test.copy()

# Create 'Instruct' column (Full text for training)
df_train['Instruct'] = df_train.apply(lambda row: construct_dynamic_prompt(row, include_answer=True), axis=1)
df_val['Instruct'] = df_val.apply(lambda row: construct_dynamic_prompt(row, include_answer=True), axis=1)

# Create 'Eval_Prompt' column (Text ending at "A:" for inference generation)
df_val['Eval_Prompt'] = df_val.apply(lambda row: construct_dynamic_prompt(row, include_answer=False), axis=1)

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "validation": Dataset.from_pandas(df_val),
    "test": Dataset.from_pandas(df_test)
})

print("Data loaded and formatted with Dynamic Few-Shot examples.")
print(f"Sample Prompt:\n{df_train.iloc[0]['Instruct'][:500]}...") # Print preview

Data loaded and formatted with Dynamic Few-Shot examples.
Sample Prompt:
Q: Output a human-readable surf-forecast similar to that of a veteran surf-observer. The response should take into account the winds, sea-state, and wave period. The final output should be a few short sentences, with some surfing lingo and flair.

Here are some examples of how to respond:
Example 1:
Input: Wind: E winds 10 to 15 kt, Seas: 3 to 4 ft, Wave Detail: E 4 ft at 12 seconds and E 2 ft at 4 seconds.
Response: Hey everyone! Looks like there is still a knee to waist high wave out back. Win...


In [3]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507", padding_side = 'left')
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B-Instruct-2507",
    torch_dtype="auto",
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
num_virtual_tokens = 10
num_epochs = 5
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_type_id

In [5]:
# Updated Cell 8
def tokenize_function(samples):
    return tokenizer(
        samples['Instruct'], 
        truncation=True, 
        padding="max_length", 
        max_length=2048 # Increased to accommodate few-shot examples
    )

train = dataset["train"].map(tokenize_function, batched=True)
val = dataset["validation"].map(tokenize_function, batched=True)

Map:   0%|          | 0/716 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

In [6]:
text_gen = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    dtype = torch.bfloat16,
    device_map = "auto",
    do_sample = False
)

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [7]:
# Updated Cell 10
# We use the 'Eval_Prompt' column which strictly cuts off before the answer
outputs = text_gen(
    dataset["validation"]["Eval_Prompt"], 
    batch_size = 8, # Reduced batch size slightly as prompts are longer
    max_new_tokens = 128, # Limit generation length
    return_full_text = False # Only return the generated part
)

`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True}. If this is not desired, please set these values explicitly.


In [8]:

predictions = []
for output in outputs:
    # If return_full_text=False, the output is just the generated response
    # If return_full_text=True, we need to split
    text = output[0]['generated_text']
    
    # Safety split just in case the model repeats the prompt or "A:"
    if "A:" in text:
        text = text.rsplit("A:", 1)[-1]
        
    predictions.append(text.strip())

In [9]:
rouge_metric = evaluate.load('rouge')
bert_metric = evaluate.load('bertscore')
bleu_metric = evaluate.load('bleu')

Downloading builder script: 0.00B [00:00, ?B/s]

In [11]:
references = val["Response"]

In [12]:
rouge_scores = rouge_metric.compute(predictions=predictions,references=references)
bert_scores = bert_metric.compute(predictions=predictions,references=references, lang= 'en')
bleu_score = bleu_metric.compute(predictions=predictions,references=references)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def show_results(rouge_score, bert_score, bleu_metric):

    print(predictions[0])
    print("\n")
    print(references[0])


    print("\n--- ROUGE Scores ---")
    print(f"  ROUGE-1: {rouge_scores['rouge1']:.4f}")
    print(f"  ROUGE-2: {rouge_scores['rouge2']:.4f}")
    print(f"  **ROUGE-L: {rouge_scores['rougeL']:.4f}**")

    print("\n--- BLEU Score ---")
    print(f"  **BLEU: {bleu_score['bleu']:.4f}**")

    print("\n--- BERTScore ---")
    avg_f1 = np.mean(bert_scores['f1'])
    print(f"  **Average F1: {avg_f1:.4f}**")

In [14]:
show_results(rouge_score=rouge_scores,bert_score=bert_scores,bleu_metric=bleu_score)

Good morning! The surf is holding steady with a clean, punchy 2-foot swell coming from the southeast at 8 seconds—perfect for carving and catching the long, smooth rides. A light NE breeze at 5 knots keeps things manageable, with minimal wind interference. The E swell at 5 seconds is a bit shorter and more choppy, so stay on the right side for the best shape. Tide’s rising, so keep an eye on the window—mid-morning to early afternoon should be peak. Ride the clean sets, and don’t let the wind get to you—this one’s shaping up to be a solid


Hey guys! There is a little longboard wave out back, but it is pretty calm. The ocean surface is clean, knee high, with barely any wind. The tide is going out, with low tide at 6:30pm. If you have time this evening, try to paddle out! Keep an eye on the cam and check back […]

--- ROUGE Scores ---
  ROUGE-1: 0.2283
  ROUGE-2: 0.0309
  **ROUGE-L: 0.1324**

--- BLEU Score ---
  **BLEU: 0.0119**

--- BERTScore ---
  **Average F1: 0.8321**


In [15]:
model = AutoModelForCausalLM.from_pretrained('/scratch/ezq9qu/wandb-sweeps/glorious-sweep-8/best_model/',device_map="auto",dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
text_gen = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    dtype = torch.bfloat16,
    device_map = "auto",
    do_sample = False
)

Device set to use cuda:0


In [17]:
# Updated Cell 10
# We use the 'Eval_Prompt' column which strictly cuts off before the answer
outputs = text_gen(
    dataset["validation"]["Eval_Prompt"], 
    batch_size = 8, # Reduced batch size slightly as prompts are longer
    max_new_tokens = 128, # Limit generation length
    return_full_text = False # Only return the generated part
)

In [18]:
predictions = []
for output in outputs:
    # If return_full_text=False, the output is just the generated response
    # If return_full_text=True, we need to split
    text = output[0]['generated_text']
    
    # Safety split just in case the model repeats the prompt or "A:"
    if "A:" in text:
        text = text.rsplit("A:", 1)[-1]
        
    predictions.append(text.strip())

In [19]:
rouge_metric = evaluate.load('rouge')
bert_metric = evaluate.load('bertscore')
bleu_metric = evaluate.load('bleu')

In [20]:
rouge_scores = rouge_metric.compute(predictions=predictions,references=references)
bert_scores = bert_metric.compute(predictions=predictions,references=references, lang= 'en')
bleu_score = bleu_metric.compute(predictions=predictions,references=references)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
show_results(rouge_score=rouge_scores,bert_score=bert_scores,bleu_metric=bleu_score)

Hey everyone! There’s a little longboard wave out back, about shin to knee high. It’s pretty clean, with the buoy at 2ft @ 8 seconds and wind blowing ENE at 5mph. High tide is at 5:08pm, so keep an eye on the cam as the afternoon goes on. Check back in tomorrow for another […]


Hey guys! There is a little longboard wave out back, but it is pretty calm. The ocean surface is clean, knee high, with barely any wind. The tide is going out, with low tide at 6:30pm. If you have time this evening, try to paddle out! Keep an eye on the cam and check back […]

--- ROUGE Scores ---
  ROUGE-1: 0.3277
  ROUGE-2: 0.0830
  **ROUGE-L: 0.2074**

--- BLEU Score ---
  **BLEU: 0.0702**

--- BERTScore ---
  **Average F1: 0.8717**


In [22]:
SUCCCCCCCCCCCESSSSSSSSSSSSS

NameError: name 'SUCCCCCCCCCCCESSSSSSSSSSSSS' is not defined

In [9]:
from huggingface_hub import login

# Option 1: Interactive login (recommended)
login() 


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
model.push_to_hub("SurfMine")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...pfr3nzcy7/adapter_model.safetensors:  37%|###6      | 17.3MB / 47.2MB            

CommitInfo(commit_url='https://huggingface.co/pfost-bit/SurfMine/commit/c31472a4d760348e0ff8f9a2526b294f41010756', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='c31472a4d760348e0ff8f9a2526b294f41010756', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pfost-bit/SurfMine', endpoint='https://huggingface.co', repo_type='model', repo_id='pfost-bit/SurfMine'), pr_revision=None, pr_num=None)

In [26]:
tokenizer.push_to_hub("SurfMine")

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmp0_lg_mcw/tokenizer.json       : 100%|##########| 11.4MB / 11.4MB            

CommitInfo(commit_url='https://huggingface.co/pfost-bit/SurfMine/commit/20987b8d676d4ccbd2cbf00881727a70d0f5bb0d', commit_message='Upload tokenizer', commit_description='', oid='20987b8d676d4ccbd2cbf00881727a70d0f5bb0d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pfost-bit/SurfMine', endpoint='https://huggingface.co', repo_type='model', repo_id='pfost-bit/SurfMine'), pr_revision=None, pr_num=None)

In [10]:
dataset["train"].push_to_hub("surf_forecaster_dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  86%|########6 |  531kB /  616kB            

README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/pfost-bit/surf_forecaster_dataset/commit/058b8645e219fa1d08e3b8b09bcf076fcbb16a12', commit_message='Upload dataset', commit_description='', oid='058b8645e219fa1d08e3b8b09bcf076fcbb16a12', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/pfost-bit/surf_forecaster_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='pfost-bit/surf_forecaster_dataset'), pr_revision=None, pr_num=None)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("pfost-bit/SurfMine", padding_side = 'left')
model = AutoModelForCausalLM.from_pretrained(
    "pfost-bit/SurfMine",
    torch_dtype="auto",
    device_map="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

adapter_config.json:   0%|          | 0.00/862 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/47.2M [00:00<?, ?B/s]

In [5]:
text_gen = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    dtype = torch.bfloat16,
    device_map = "auto",
    do_sample = False
)

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [17]:
example = df_val.iloc[0]['prompt_text']

In [24]:
def dynamic_prompt(prompt_text):
    instruction_text = (
        "Output a human-readable surf-forecast similar to that of a veteran surf-observer. "
        "The response should take into account the winds, sea-state, and wave period. "
        "The final output should be a few short sentences, with some surfing lingo and flair."
    )

    prompt = (
        f"Q: {instruction_text}\n\n"
        f"Now, respond to the following forecast data:\n"
        f"Input: {prompt_text}\n"
        f"A: " 
    )

    return prompt

In [25]:
prompt_example = dynamic_prompt(example)

In [27]:
prompt_example

'Q: Output a human-readable surf-forecast similar to that of a veteran surf-observer. The response should take into account the winds, sea-state, and wave period. The final output should be a few short sentences, with some surfing lingo and flair.\n\nNow, respond to the following forecast data:\nInput: Wind: NE winds 5 kt, Seas: 2 ft, Wave Detail: SE 2 ft at 8 seconds and E 1 ft at 5 seconds.\nA: '

In [28]:
text_gen(
    prompt_example,
    max_new_tokens = 128, # Limit generation length
    return_full_text = False
)

`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True}. If this is not desired, please set these values explicitly.


[{'generated_text': '\xa0Hey everyone! There is not much going on behind the shop at the moment. Sets are still breaking close to shore in the knee high range but pretty much nothing bigger than that. Wind is blowing ENE at 13mph keeping some texture on the surface. We are approaching high tide, slotted for 3:06pm. Check back […]'}]

In [1]:
import os 
os.environ["HF_HOME"] = "/scratch/ezq9qu/models/cache"
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from tqdm import tqdm
from lm_eval import evaluator


In [1]:
del model
del tokenizer
torch.cuda.empty_cache()

NameError: name 'model' is not defined

In [2]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Instruct-2507", padding_side = 'left')
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B-Instruct-2507",
    torch_dtype="auto",
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
results = evaluator.simple_evaluate(
    model = "hf", #Specify huggingface model
    model_args = {"pretrained": model, "dtype": "bfloat16", "tokenizer": tokenizer}, #Define model arguments
    tasks = 'hellaswag', 
    log_samples = True, 
    batch_size = "1",
    limit = 20,
    random_seed = 126,
)


pretrained=Qwen3ForCausalLM(   (model): Qwen3Model(     (embed_tokens): Embedding(151936, 2560)     (layers): ModuleList(       (0-35): 36 x
        Qwen3DecoderLayer(         (self_attn): Qwen3Attention(           (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
        (k_proj): Linear(in_features=2560, out_features=1024, bias=False)           (v_proj): Linear(in_features=2560, out_features=1024,
        bias=False)           (o_proj): Linear(in_features=4096, out_features=2560, bias=False)           (q_norm): Qwen3RMSNorm((128,),
        eps=1e-06)           (k_norm): Qwen3RMSNorm((128,), eps=1e-06)         )         (mlp): Qwen3MLP(           (gate_proj):
        Linear(in_features=2560, out_features=9728, bias=False)           (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
        (down_proj): Linear(in_features=9728, out_features=2560, bias=False)           (act_fn): SiLUActivation()         )
        (input_layernorm): Qwen3RMSNorm((2560,), e

In [4]:
results['results']

{'hellaswag': {'alias': 'hellaswag',
  'acc,none': 0.4,
  'acc_stderr,none': 0.11239029738980327,
  'acc_norm,none': 0.45,
  'acc_norm_stderr,none': 0.11413288653790232}}

In [5]:
del model
del tokenizer
torch.cuda.empty_cache()

In [7]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct", padding_side = 'left')
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
results = evaluator.simple_evaluate(
    model = "hf", #Specify huggingface model
    model_args = {"pretrained": model, "dtype": "bfloat16", "tokenizer": tokenizer}, #Define model arguments
    tasks = 'hellaswag', 
    log_samples = True, 
    batch_size = "1",
    limit = 20,
    random_seed = 126,
)

pretrained=LlamaForCausalLM(   (model): LlamaModel(     (embed_tokens): Embedding(128256, 3072)     (layers): ModuleList(       (0-27): 28 x
        LlamaDecoderLayer(         (self_attn): LlamaAttention(           (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
        (k_proj): Linear(in_features=3072, out_features=1024, bias=False)           (v_proj): Linear(in_features=3072, out_features=1024,
        bias=False)           (o_proj): Linear(in_features=3072, out_features=3072, bias=False)         )         (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)           (up_proj): Linear(in_features=3072,
        out_features=8192, bias=False)           (down_proj): Linear(in_features=8192, out_features=3072, bias=False)           (act_fn):
        SiLUActivation()         )         (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)         (post_attention_layernorm):
        LlamaRMSNorm((3072,), eps=1e-05)       )     )    

In [9]:
results['results']

{'hellaswag': {'alias': 'hellaswag',
  'acc,none': 0.4,
  'acc_stderr,none': 0.11239029738980327,
  'acc_norm,none': 0.55,
  'acc_norm_stderr,none': 0.11413288653790232}}

In [10]:
del model
del tokenizer
torch.cuda.empty_cache()

In [11]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", padding_side = 'left')
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
results = evaluator.simple_evaluate(
    model = "hf", #Specify huggingface model
    model_args = {"pretrained": model, "dtype": "bfloat16", "tokenizer": tokenizer}, #Define model arguments
    tasks = 'hellaswag', 
    log_samples = True, 
    batch_size = "1",
    limit = 20,
    random_seed = 126,
)

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
100%|██████████| 20/20 [00:00<00:00, 3436.12it/s]
Running loglikelihood requests: 100%|██████████| 80/80 [00:03<00:00, 23.60it/s]


In [13]:
results['results']

{'hellaswag': {'alias': 'hellaswag',
  'acc,none': 0.4,
  'acc_stderr,none': 0.11239029738980327,
  'acc_norm,none': 0.6,
  'acc_norm_stderr,none': 0.11239029738980327}}

In [None]:
del model
del tokenizer
torch.cuda.empty_cache()

In [14]:
tokenizer = AutoTokenizer.from_pretrained("pfost-bit/SurfMine", padding_side = 'left')
model = AutoModelForCausalLM.from_pretrained(
    "pfost-bit/SurfMine",
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
results = evaluator.simple_evaluate(
    model = "hf", #Specify huggingface model
    model_args = {"pretrained": model, "dtype": "bfloat16", "tokenizer": tokenizer}, #Define model arguments
    tasks = 'hellaswag', 
    log_samples = True, 
    batch_size = "1",
    limit = 20,
    random_seed = 126,
)

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration
100%|██████████| 20/20 [00:00<00:00, 3398.95it/s]
Running loglikelihood requests: 100%|██████████| 80/80 [00:04<00:00, 19.12it/s]


In [16]:
results['results']

{'hellaswag': {'alias': 'hellaswag',
  'acc,none': 0.35,
  'acc_stderr,none': 0.1094243309804831,
  'acc_norm,none': 0.55,
  'acc_norm_stderr,none': 0.11413288653790232}}