## Core Functions

In [15]:
import os
import json
import pandas as pd

def crawl_ducttape_outputs(
    ducttape_output: str,
    results_task: str = "EvalLLM",
    results_json: str = "results.json",
):
    """Collect the results of every realization of a ducttape task.

    Each sub-directory of ``<ducttape_output>/<results_task>`` whose name is a
    ``+``-joined list of ``Name.value`` pairs (for example
    ``Size.small1+UseDeepSpeed.True``) is treated as one realization. The
    ``results_json`` file inside it is loaded and merged with the parsed
    ``Name -> value`` pairs to form one row of the result.

    Args:
        ducttape_output: Root output directory of the ducttape run.
        results_task: Name of the task sub-directory to crawl.
        results_json: File name of the JSON metrics file inside each
            realization directory.

    Returns:
        A ``pandas.DataFrame`` with one row per kept realization, holding the
        branch values (as strings) and the metrics read from ``results_json``.
        Metrics override branch values on a key clash. The frame is empty when
        no realization directory is found.

    Raises:
        FileNotFoundError: If the task directory does not exist, or a kept
            realization directory has no ``results_json`` file.
    """
    results_dir = os.path.join(ducttape_output, results_task)

    branches_info = []
    # sorted() keeps the row order independent of the filesystem listing order.
    for dirname in sorted(os.listdir(results_dir)):
        dirpath = os.path.join(results_dir, dirname)
        # Only directories can be realizations; skip stray files.
        if not os.path.isdir(dirpath):
            continue

        kwargs = {}
        for branch in dirname.split("+"):
            # Split on the first dot only, so values that themselves contain
            # dots (e.g. "LR.0.001") are kept whole.
            name, dot, value = branch.partition(".")
            if not dot:
                # Not a "Name.value" pair: this directory does not follow the
                # realization naming scheme, so ignore it entirely.
                kwargs = None
                break
            kwargs[name] = value
        if kwargs is None:
            continue

        branches_info.append((dirpath, kwargs))

    # default=0 so an empty task directory yields an empty frame, not a crash.
    num_branches = max((len(kwargs) for _, kwargs in branches_info), default=0)

    results = []
    for dirpath, kwargs in branches_info:
        # Skip folders with fewer branches (so we don't repeat on symlinks).
        if len(kwargs) < num_branches:
            continue

        with open(os.path.join(dirpath, results_json)) as f:
            metrics = json.load(f)
        results.append({**kwargs, **metrics})

    return pd.DataFrame.from_records(results)

In [16]:
# Root of the ducttape run to analyse. The original hard-coded location stays
# the default, but it can be redirected through the TOWERLLM_OUTS environment
# variable so the notebook also runs on machines with a different layout.
TOWERLLM_OUTS = os.environ.get(
    "TOWERLLM_OUTS", "/mnt/data/patrick/towerllm-outs/llama2_c4small"
)
# Gather the results.json contents found under the "Train" task directory.
output_df = crawl_ducttape_outputs(
    TOWERLLM_OUTS,
    results_task="Train",
)

In [17]:
import yaml

# Folder (relative path) that holds the model config files.
CONFIGS_FOLDER = "../configs/models"
# Maps each size name to the path of its config file inside CONFIGS_FOLDER.
SIZE_CONFIGS = {
    size_name: os.path.join(CONFIGS_FOLDER, config_file)
    for size_name, config_file in (
        ("base", "llama2_1b.yml"),
        ("small1", "llama2_50m.yml"),
        ("small2", "llama2_150m.yml"),
        ("small3", "llama2_300m.yml"),
    )
}

def get_size(
    hidden_size,
    ffn_hidden_size,
    num_layers,
    num_attention_heads,
    num_kv_heads,
    *args,
    vocab_size=32000,
    **kwargs,
):
    """Estimate a model's parameter count from its architecture settings.

    The total is the sum of an embedding term, ``num_layers`` copies of a
    per-layer term (attention + feed-forward + layer-norm) and a decoder term.
    Extra positional and keyword arguments are accepted and ignored, so a full
    config mapping can be splatted into the call.

    Args:
        hidden_size: Model hidden dimension.
        ffn_hidden_size: Feed-forward inner dimension.
        num_layers: Number of layers the per-layer term is multiplied by.
        num_attention_heads: Number of attention heads.
        num_kv_heads: Number of key/value heads.
        vocab_size: Vocabulary size used for the embedding and decoder terms.

    Returns:
        The estimated number of parameters.
    """
    # NOTE(review): the K/V term squares the reduced width, the feed-forward
    # term counts two matrices and the decoder term is doubled -- confirm this
    # matches the architecture whose size is being estimated.
    heads_per_kv_head = num_attention_heads // num_kv_heads
    kv_width = hidden_size // heads_per_kv_head

    # Per-layer contributions.
    attention_term = hidden_size * hidden_size * 2 + kv_width * kv_width * 2
    feed_forward_term = hidden_size * ffn_hidden_size * 2
    layer_norm_term = hidden_size * 2
    per_layer = attention_term + feed_forward_term + layer_norm_term

    # Vocabulary-dependent contributions.
    embedding_term = vocab_size * hidden_size
    decoder_term = hidden_size * vocab_size * 2

    total = embedding_term + num_layers * per_layer + decoder_term
    return total

def get_scaling_df(output_df):
    """Return a copy of ``output_df`` with size names replaced by sizes.

    Every value in the ``Size`` column is looked up in ``SIZE_CONFIGS`` to find
    a config file, the file is parsed as YAML, and its contents are passed as
    keyword arguments to ``get_size``. The input frame is not modified.

    Args:
        output_df: DataFrame with a ``Size`` column whose values are keys of
            ``SIZE_CONFIGS``.

    Returns:
        A new DataFrame identical to ``output_df`` except that ``Size`` holds
        the values returned by ``get_size``.

    Raises:
        KeyError: If ``output_df`` has no ``Size`` column or a size name is not
            a key of ``SIZE_CONFIGS``.
    """
    def _get_size_from_name(name):
        config_f = SIZE_CONFIGS[name]
        with open(config_f) as f:
            # NOTE(review): FullLoader can build more than plain data types; if
            # these configs are plain mappings, yaml.safe_load would suffice.
            config = yaml.load(f, Loader=yaml.FullLoader)
        return get_size(**config)

    df = output_df.copy()
    # Read and parse each config once per distinct size name rather than once
    # per row; many rows typically share the same handful of names.
    size_by_name = {
        name: _get_size_from_name(name) for name in df["Size"].unique()
    }
    df["Size"] = df["Size"].map(size_by_name)
    return df

In [18]:
# Build a copy of the crawled results whose "Size" column is rewritten by
# get_scaling_df.
scaling_df = get_scaling_df(output_df)
# Bare trailing expression so the notebook displays the resulting frame.
scaling_df

Unnamed: 0,DSAutotuneConfig,Size,UseDeepSpeed,ppl
0,False,65673216,True,10
