In [18]:
import neptune as neptune
from datetime import datetime, timedelta
import pandas as pd
import pytz


In [39]:
def get_neptune_table(tags, negative_tags=None, active=False):
    project = neptune.init_project(
        project="pmtest/llm-random",
        mode="read-only",
        api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIyMDY0ZDI5Ni05YWU3LTQyNGYtYmY4My1hZTFkY2EzYmUwMjgifQ=="
    )
    if active:
        runs_table = project.fetch_runs_table(tag=tags, state='active').to_pandas()
    else:
        runs_table = project.fetch_runs_table(tag=tags).to_pandas()

    # If negative tags are provided, filter them out
    if negative_tags is not None:
        for neg_tag in negative_tags:
            runs_table = runs_table[~runs_table['sys/tags'].apply(lambda x: neg_tag in x)]

    print(f'table downloaded\nshape: {runs_table.shape}')
    return runs_table

def get_training_progress_with_eta(runs_table, run_specs, active=False):
    # Filter runs based on the provided specifications
    for column, value in run_specs.items():
        if value is not None:
            runs_table = runs_table[runs_table[column] == value]

    if runs_table.empty:
        print("No runs found with the specified criteria.")
        return None
    results = []
    
    for idx, run in runs_table.iterrows():        
        # Fetch necessary information
        exp_id = run['sys/id']
        current_step = run["step"]
        n_steps = run["args/n_steps"]
        time_running = run['sys/running_time']
        n_gpus = run['args/n_gpus']

        # Calculate the percentage of training completed
        if n_steps > 0:
            percent_finished = (current_step / n_steps) * 100
        else:
            percent_finished = 0.0
        
        # Estimate ETA (remaining time)
        if current_step > 0:
            time_per_step = time_running / current_step
            remaining_steps = n_steps - current_step
            time_left = time_per_step * remaining_steps
            # time_left_hours = time_left.total_seconds() / 3600  # Convert to hours
        else:
            time_left = None
        
        # Collect the data into a dictionary
        result = {
            # "args/name": run.get("args/name", None),
            "ID": exp_id,
            "time_running": time_running / 3600,
            "n_gpus": n_gpus,
            "gpu_hours": time_running * n_gpus / 3600,
            "time_left (hours)": time_left,
            "%finished": percent_finished,
            "step": current_step,
            "host": run['sys/hostname']
        }
        
        # Include the run specifications in the result
        for column in run_specs.keys():
            result[column] = run.get(column, None)
        
        results.append(result)
    
    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

In [47]:
model_sizes = {
    # 'none': None,
    '4M': 3145728,
    # '9M': 7077888,
    '33M': 25165824,
    '65M': 49152000,
    '113M': 84934656,
    '260M': 201326592,
    '520M': 393216000,
}

training_lengths = {
    # 'none': None,
    '500M': 1907,
    '1B': 3814,
    '2B': 7628,
    '4B': 15256,
    '8B': 30512,
    '16B': 61024,
    '32B': 122048,
    '64B': 244150,
    '128B': 488300,
}

In [48]:
project = neptune.init_project(
    project="pmtest/llm-random",
    mode="read-only",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIyMDY0ZDI5Ni05YWU3LTQyNGYtYmY4My1hZTFkY2EzYmUwMjgifQ=="
)

runs_table = project.fetch_runs_table(tag=["constrained_scaling_grid_final"]).to_pandas()
print(runs_table.shape)

runs_table = get_neptune_table(tags=['constrained_scaling_grid_final'], negative_tags=['remove', 'remove_constrained_scaling_laws'])

https://app.neptune.ai/pmtest/llm-random/
(320, 4342)
https://app.neptune.ai/pmtest/llm-random/
table downloaded
shape: (276, 4342)


In [49]:
total_gpu_hours = 0
total_completed_hours = 0

for model_size, model_value in model_sizes.items():
    for training_length, training_value in training_lengths.items():
        run_specifications = {
            'args/model_n_active': model_value,
            'args/n_steps': training_value,
        }
        df = get_training_progress_with_eta(runs_table=runs_table,
                                            run_specs=run_specifications)
        if df is not None:
            gpu_hours = df['gpu_hours'].sum()
            total_gpu_hours += gpu_hours
            completed_runs_hours = df[df['%finished'] == 100]['gpu_hours'].sum()
            total_completed_hours += completed_runs_hours
            print(f"gpu_hours (completed/all): {round(completed_runs_hours, 1)}/{round(gpu_hours, 1)}\tfinished: {df[df['%finished'] == 100].shape[0]}/{df.shape[0]}\tmodel: {model_size}\tsteps: {training_length}")
        else:
            print(f"gpu_hours: no-runs\tmodel: {model_size}\tsteps: {training_length}")

print()
print(f'Used total {round(total_gpu_hours, 2)} GPU (A100) hours')
print(f'Used for completed runs {round(total_completed_hours, 2)} GPU (A100) hours')


gpu_hours (completed/all): 23.5/23.5	finished: 6/6	model: 4M	steps: 500M
gpu_hours (completed/all): 41.7/41.7	finished: 6/6	model: 4M	steps: 1B
gpu_hours (completed/all): 94.4/94.4	finished: 6/6	model: 4M	steps: 2B
gpu_hours (completed/all): 110.2/110.2	finished: 6/6	model: 4M	steps: 4B
gpu_hours (completed/all): 35.7/35.7	finished: 6/6	model: 4M	steps: 8B
gpu_hours (completed/all): 80.4/80.4	finished: 6/6	model: 4M	steps: 16B
gpu_hours (completed/all): 156.9/156.9	finished: 6/6	model: 4M	steps: 32B
No runs found with the specified criteria.
gpu_hours: no-runs	model: 4M	steps: 64B
No runs found with the specified criteria.
gpu_hours: no-runs	model: 4M	steps: 128B
gpu_hours (completed/all): 34.8/34.8	finished: 6/6	model: 33M	steps: 500M
gpu_hours (completed/all): 70.5/70.5	finished: 6/6	model: 33M	steps: 1B
gpu_hours (completed/all): 134.9/134.9	finished: 6/6	model: 33M	steps: 2B
gpu_hours (completed/all): 353.3/353.3	finished: 17/18	model: 33M	steps: 4B
gpu_hours (completed/all): 87.6/

In [16]:
def exp_status_by_tags(tags, negative_tags=None, active=False):

    runs_table = get_neptune_table(tags=tags, negative_tags=negative_tags, active=active)
    print(runs_table.shape)

    run_specs = {"args/n_steps": training_lengths['none'], "args/model_n_active": model_sizes['none'], "args/expansion_rate": None, "sys/state": None}
    active = True

    result = get_training_progress_with_eta(runs_table, run_specs, active=active)
    result["args/model_n_active"] = (result["args/model_n_active"] / 1000000).round().astype(int)
    result["time_left (d)"] = (result["time_left (hours)"] / 24).round(2)
    # result = result[result["%finished"] != 100]

    return result


In [17]:
component_names = ['embedding_layer', 'head', 'gating', 'expert_inner_function', 'projection']
start_ends = ['start', 'end']
negative_tags = ['remove']

uplot_medium_tags = ['relativity_paper']
active=True

# for start_end in start_ends:
#     for component in component_names:
#         uplot_medium_tags = ['relativity_paper', 'medium_model', 'uplot', start_end, component]

#         df = get_neptune_table(tags=uplot_medium_tags, negative_tags=negative_tags)
#         print(f'{start_end}, component: {component}, shape: {df.shape}')

not_finished = exp_status_by_tags(tags=uplot_medium_tags, negative_tags=negative_tags, active=active)
# print(not_finished.shape)
# not_finished = not_finished[not_finished['sys/state'] != 'Active']
print(not_finished)
not_finished.to_csv('not_finished_relativity.csv')

https://app.neptune.ai/pmtest/llm-random/
table downloaded
shape: (3, 733)
(3, 733)
(3, 733)
                ID  time_running  time_left (hours)  %finished     step  \
0  LLMRANDOM-16074     79905.495       82874.604006     49.088  24544.0   
1  LLMRANDOM-16073    132897.652       26131.703734     83.568  41784.0   
2  LLMRANDOM-16028    206926.890        8129.117067     96.220  48110.0   

             host  args/n_steps  args/model_n_active  args/expansion_rate  \
0  164-152-24-115         50000                  679                    8   
1         login01         50000                  679                    8   
2         4124gs0         50000                  679                    8   

  sys/state  time_left (d)  
0    Active        3453.11  
1    Active        1088.82  
2    Active         338.71  
