In [1]:
import os
from datetime import datetime, timedelta

import neptune as neptune
import pandas as pd
import pytz


In [2]:
def get_neptune_table(tags, negative_tags=None, active=False):
    """Download the runs table of the ``pmtest/llm-random`` Neptune project.

    Args:
        tags: Tag (or list of tags) forwarded to ``fetch_runs_table(tag=...)``.
        negative_tags: Optional tag or iterable of tags; a run carrying any
            of them is removed from the result. Tags are compared as whole
            tags, not as substrings.
        active: When True, only runs whose state is 'active' are fetched.

    Returns:
        pandas.DataFrame with one row per remaining run.

    The API token is deliberately not stored in the notebook: no ``api_token``
    is passed, so the Neptune client falls back to the ``NEPTUNE_API_TOKEN``
    environment variable.
    """
    # Initialize the Neptune project in read-only mode (credentials come from
    # the environment, never from source).
    project = neptune.init_project(
        project="pmtest/llm-random",
        mode="read-only",
    )

    # Fetch runs for the requested tags, optionally restricted to active runs
    fetch_kwargs = {"tag": tags}
    if active:
        fetch_kwargs["state"] = "active"
    runs_table = project.fetch_runs_table(**fetch_kwargs).to_pandas()

    # Filter out runs carrying any negative tag
    if negative_tags and 'sys/tags' in runs_table.columns:
        if isinstance(negative_tags, str):
            negative_tags = [negative_tags]
        excluded = set(negative_tags)

        def _has_excluded_tag(run_tags):
            # NOTE(review): to_pandas() appears to render the tag set as a
            # comma-joined string — confirm. Both that form and a real
            # collection of tags are handled here, so 'remove' no longer
            # matches e.g. 'remove_constrained_scaling_laws' by substring.
            if isinstance(run_tags, str):
                run_tags = run_tags.split(",")
            elif run_tags is None or isinstance(run_tags, float):
                # Missing value (None / NaN): the run has no tags at all
                return False
            return not excluded.isdisjoint(tag.strip() for tag in run_tags)

        drop_mask = runs_table['sys/tags'].apply(_has_excluded_tag).astype(bool)
        runs_table = runs_table[~drop_mask]

    print(f'Table downloaded\nShape: {runs_table.shape}')
    return runs_table


def get_training_progress_with_eta(runs_table, run_specs=None, active=False):
    """Summarise progress, GPU usage and remaining time for each run.

    Args:
        runs_table: DataFrame of runs; must provide the columns 'sys/id',
            'step', 'args/n_steps', 'sys/running_time', 'args/n_gpus' and
            'sys/hostname'.
        run_specs: Optional mapping ``column -> value``. Entries with a
            non-None value keep only rows where the column equals the value;
            every key (None or not) is also copied into the result.
        active: Currently unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns 'ID', 'GPU Hours', 'running_time', 'n_gpus',
        'step', '% Finished', 'Host', 'Time Left (h)', 'Time Left (d)' plus
        one column per ``run_specs`` key, or None (after printing a message)
        when no run matches.
    """
    # Apply specifications to filter runs
    if run_specs is not None:
        for column, value in run_specs.items():
            if value is not None:
                runs_table = runs_table[runs_table[column] == value]

    # If no matching runs are found, return None
    if runs_table.empty:
        print("No runs found with the specified criteria.")
        return None

    # Known hostnames and the cluster label they are reported under
    host_mapping = {
        'login01': 'IDEAS',
        'gpu01': 'IDEAS',
        '164-152-24-115': 'writer',
        '4124gs0': 'entropy'
    }

    def _cluster_name(hostname):
        """Map a raw hostname to a cluster label; 't*' hosts default to athena."""
        if hostname in host_mapping:
            return host_mapping[hostname]
        # A missing hostname (NaN) is not a string; never call .startswith on it
        if isinstance(hostname, str) and hostname.startswith('t'):
            return 'athena'
        return 'unknown'

    # Gather results into a list of dictionaries
    results = []
    for _, run in runs_table.iterrows():
        current_step = run["step"]
        n_steps = run["args/n_steps"]
        time_running_hours = run['sys/running_time'] / 3600  # seconds -> hours
        n_gpus = run['args/n_gpus']

        # Completion percentage (0.0 when the target step count is not positive)
        percent_finished = (current_step / n_steps * 100) if n_steps > 0 else 0.0

        # Linear ETA extrapolation; undefined until at least one step is done
        time_left = None
        if current_step > 0:
            time_per_step = time_running_hours / current_step
            time_left = time_per_step * (n_steps - current_step)

        result = {
            "ID": run['sys/id'],
            "GPU Hours": time_running_hours * n_gpus,
            "running_time": time_running_hours,
            "n_gpus": n_gpus,
            "step": current_step,
            "% Finished": percent_finished,
            "Host": run['sys/hostname'],
            "Time Left (h)": time_left,
            # Guarded: dividing None by 24 raised TypeError for runs at step 0
            "Time Left (d)": time_left / 24 if time_left is not None else None,
        }

        # Include specified run parameters
        if run_specs is not None:
            for spec_column in run_specs.keys():
                result[spec_column] = run.get(spec_column, None)

        results.append(result)

    # Convert results to a DataFrame and translate hostnames to cluster labels
    results_df = pd.DataFrame(results)
    results_df['Host'] = results_df['Host'].apply(_cluster_name)

    return results_df


def exp_status_by_tags(tags, negative_tags=None, active=False):
    """Fetch runs by tag and return their training progress with ETA.

    Args:
        tags: Tags forwarded to ``get_neptune_table``.
        negative_tags: Optional tags whose runs are excluded.
        active: Forwarded to ``get_neptune_table`` and
            ``get_training_progress_with_eta``.

    Returns:
        The progress DataFrame with 'args/model_n_active' expressed in whole
        millions and an extra 'Time Left (days)' column, or None when no run
        matches.
    """
    # Retrieve runs table with specified tags and exclusions
    runs_table = get_neptune_table(tags=tags, negative_tags=negative_tags, active=active)
    print("Runs table shape:", runs_table.shape)

    # All values are None: nothing is filtered, the keys only select extra
    # columns to carry over into the progress table.
    run_specs = {
        "args/n_steps": None,
        "args/model_n_active": None,
        "args/expansion_rate": None,
        "sys/state": None
    }

    # Get training progress with estimated time remaining (ETA)
    training_progress_df = get_training_progress_with_eta(runs_table, run_specs, active=active)

    # Convert model size to millions, round, and calculate time left in days
    if training_progress_df is not None:
        training_progress_df["args/model_n_active"] = (training_progress_df["args/model_n_active"] / 1_000_000).round().astype(int)
        # The progress table names its hours column "Time Left (h)"; the
        # previous "Time Left (hours)" lookup always raised KeyError.
        training_progress_df["Time Left (days)"] = (training_progress_df["Time Left (h)"] / 24).round(2)

    return training_progress_df


## Constrained scaling grid

In [89]:
# Grid axes for the GPU-hours summary.
# model_sizes: display label -> value matched against 'args/model_n_active'.
model_sizes = {
    # "none": None,
    "4M": 3_145_728,
    # "9M": 7_077_888,
    "33M": 25_165_824,
    "65M": 49_152_000,
    "113M": 84_934_656,
    "260M": 201_326_592,
    "520M": 393_216_000,
}

# training_lengths: token-budget label -> value matched against 'args/n_steps'.
# NOTE(review): the last two entries are not exact doublings of their
# predecessor (244150 vs 244096, 488300 vs 488192) — confirm they match the
# n_steps the runs were actually launched with.
training_lengths = {
    # "none": None,
    "500M": 1_907,
    "1B": 3_814,
    "2B": 7_628,
    "4B": 15_256,
    "8B": 30_512,
    "16B": 61_024,
    "32B": 122_048,
    "64B": 244_150,
    "128B": 488_300,
}

In [64]:
# Download every run tagged for the final constrained-scaling grid, leaving
# out runs that carry either of the two removal tags.
runs_table = get_neptune_table(
    tags=['constrained_scaling_grid_final'],
    negative_tags=[
        'remove',
        'remove_constrained_scaling_laws',
    ],
)

https://app.neptune.ai/pmtest/llm-random/
Table downloaded
Shape: (276, 4342)


In [90]:
# Accumulate GPU usage over the (model size x training length) grid.
total_gpu_hours = 0
total_completed_hours = 0
# One progress frame per grid cell that has runs; concatenated once after the
# loop instead of re-copying a growing DataFrame on every iteration.
per_config_frames = []

for model_size, active_units in model_sizes.items():
    for training_length, step_count in training_lengths.items():

        # Select the runs of this grid cell
        run_specs = {
            'args/model_n_active': active_units,
            'args/n_steps': step_count,
        }
        df = get_training_progress_with_eta(runs_table=runs_table, run_specs=run_specs)

        # Nothing was launched for this configuration
        if df is None:
            print(f"GPU Hours: No runs available\tModel Size: {model_size}\tSteps: {training_length}")
            continue

        # A run counts as completed only at exactly 100 %
        finished_df = df[df['% Finished'] == 100]
        gpu_hours = df['GPU Hours'].sum()
        completed_gpu_hours = finished_df['GPU Hours'].sum()

        # Add descriptive columns so rows stay identifiable after concatenation
        df['model'] = model_size
        df['tokens'] = training_length

        # Update total GPU usage counters
        total_gpu_hours += gpu_hours
        total_completed_hours += completed_gpu_hours

        # Print summary for the current model and step count
        print(f"GPU Hours (Completed/Total): {round(completed_gpu_hours, 1)}/{round(gpu_hours, 1)}\t"
              f"Finished Runs: {finished_df.shape[0]}/{df.shape[0]}\t"
              f"Model Size: {model_size}\tSteps: {training_length}")

        per_config_frames.append(df)

# Stays None when no configuration had any runs, as before
all_runs_df = pd.concat(per_config_frames, ignore_index=True) if per_config_frames else None

# Final summary of accumulated GPU hours
print("\nSummary:")
print(f"Total GPU (A100) Hours Used: {round(total_gpu_hours, 2)}")
print(f"GPU Hours for Completed Runs: {round(total_completed_hours, 2)}")


GPU Hours (Completed/Total): 23.5/23.5	Finished Runs: 6/6	Model Size: 4M	Steps: 500M
GPU Hours (Completed/Total): 41.7/41.7	Finished Runs: 6/6	Model Size: 4M	Steps: 1B
GPU Hours (Completed/Total): 94.4/94.4	Finished Runs: 6/6	Model Size: 4M	Steps: 2B
GPU Hours (Completed/Total): 110.2/110.2	Finished Runs: 6/6	Model Size: 4M	Steps: 4B
GPU Hours (Completed/Total): 35.7/35.7	Finished Runs: 6/6	Model Size: 4M	Steps: 8B
GPU Hours (Completed/Total): 80.4/80.4	Finished Runs: 6/6	Model Size: 4M	Steps: 16B
GPU Hours (Completed/Total): 156.9/156.9	Finished Runs: 6/6	Model Size: 4M	Steps: 32B
No runs found with the specified criteria.
GPU Hours: No runs available	Model Size: 4M	Steps: 64B
No runs found with the specified criteria.
GPU Hours: No runs available	Model Size: 4M	Steps: 128B
GPU Hours (Completed/Total): 34.8/34.8	Finished Runs: 6/6	Model Size: 33M	Steps: 500M
GPU Hours (Completed/Total): 70.5/70.5	Finished Runs: 6/6	Model Size: 33M	Steps: 1B
GPU Hours (Completed/Total): 134.9/134.9	Fin

In [91]:
# Keep only fully completed runs. The explicit copy makes the result an
# independent frame, so later cells can add columns to it without pandas
# raising SettingWithCopyWarning.
all_runs_df = all_runs_df[all_runs_df['% Finished'] == 100].copy()

# to_csv does not create missing directories, so make sure 'plots/' exists
os.makedirs('plots', exist_ok=True)
all_runs_df.to_csv('plots/CSL_completed_runs.csv')

In [92]:
# 'Host' already holds cluster labels (raw hostnames starting with 't' were
# mapped to 'athena' when the progress table was built), so the old
# .str.startswith('t') test could never match and always gave an empty frame.
filtered_df = all_runs_df[all_runs_df['Host'] == 'athena']

# Print all unique hosts
unique_hosts = all_runs_df['Host'].unique()
print("Unique hosts:", unique_hosts)

# If you want to display the filtered DataFrame, use:
# print(filtered_df)

Unique hosts: ['athena' 'IDEAS' 'entropy' 'writer']


In [95]:
# The filter below needs an 'E' (expansion rate) column, but the progress
# table does not create one (it is commented out in
# get_training_progress_with_eta), so on a fresh kernel this cell raised
# KeyError. Rebuild the column from the raw runs table when it is missing.
if 'E' not in all_runs_df.columns:
    # NOTE(review): assumes runs_table exposes 'args/expansion_rate' — confirm.
    expansion_rate_by_id = runs_table.set_index('sys/id')['args/expansion_rate']
    all_runs_df = all_runs_df.assign(E=all_runs_df['ID'].map(expansion_rate_by_id))

# Filter rows where host is 'athena' and E is 32
# (the variable names say "non_athena", but these ARE the Athena runs)
non_athena_df = all_runs_df[(all_runs_df['Host'] == 'athena') & (all_runs_df['E'] == 32)]

# Sort by 'args/model_n_active' and 'args/n_steps' in descending order
sorted_non_athena_df = non_athena_df.sort_values(by=['args/model_n_active', 'args/n_steps'], ascending=[False, False])

# Fail with a readable message instead of an IndexError from .iloc[0]
if sorted_non_athena_df.empty:
    raise ValueError("No Athena runs with E == 32 found in all_runs_df.")

# Find the row with the biggest model on Athena
biggest_model_on_athena = sorted_non_athena_df.iloc[0]
print("Biggest model on Athena:")
print(biggest_model_on_athena[['GPU Hours', 'model', 'tokens', 'E', 'n_gpus', 'running_time']])


Biggest model on Athena:
GPU Hours       291.295551
model                 260M
tokens                 16B
E                       32
n_gpus                   8
running_time     36.411944
Name: 217, dtype: object


In [117]:
# Active-parameter count expressed in millions (informational column; the
# comparisons below still use the raw 'args/model_n_active' values).
all_runs_df['args/model_n_active_millions'] = all_runs_df['args/model_n_active'].div(1_000_000)

# Runs strictly smaller than the biggest Athena model that also ran for less
# than 48 hours.
# NOTE(review): no 'Host' condition is applied and the size comparison is
# strict (<), so this is neither "Athena only" nor "260M" as the printed label
# suggests — confirm the intended selection.
athena_filtered_df = all_runs_df.loc[
    all_runs_df['args/model_n_active'].lt(biggest_model_on_athena['args/model_n_active'])
    & all_runs_df['running_time'].lt(48)
]

# GPU hours of the selected runs
total_gpu_hours_athena = athena_filtered_df['GPU Hours'].sum()
print(f"Total GPU Hours for Athena (260M, < 2 days): {total_gpu_hours_athena}")

# Everything that was not selected above
remaining_runs_df = all_runs_df.drop(index=athena_filtered_df.index)
total_gpu_hours_other = remaining_runs_df['GPU Hours'].sum()
print(f"Total GPU Hours for Other: {total_gpu_hours_other}")
# NOTE(review): 24 * 8 * 3 presumably means hours/day x GPUs per node x 3
# clusters — confirm.
print(f"Total Days of other clusters: {total_gpu_hours_other/(24 * 8 * 3)}")

Total GPU Hours for Athena (260M, < 2 days): 4832.1458016666675
Total GPU Hours for Other: 3934.0049988888886
Total Days of other clusters: 6.8298697897376535


## Other experiments: large dense models

In [3]:
# Runs carrying all four tags below, minus anything tagged 'remove'.
large_dense_table = get_neptune_table(
    ['relativity_paper', 'large_model', 'std', 'dense'],
    negative_tags=['remove'],
)
# Last expression of the cell: the progress table is rendered as its output.
get_training_progress_with_eta(large_dense_table)

https://app.neptune.ai/pmtest/llm-random/
Table downloaded
Shape: (10, 704)


Unnamed: 0,ID,GPU Hours,running_time,n_gpus,step,% Finished,Host,Time Left (h),Time Left (d)
0,LLMRANDOM-19306,34.54958,4.318697,8,5879.0,11.758,athena,32.411167,1.350465
1,LLMRANDOM-19303,48.815318,6.101915,8,7108.0,14.216,entropy,36.820952,1.534206
2,LLMRANDOM-19238,8.298736,1.037342,8,1184.0,2.368,entropy,42.769328,1.782055
3,LLMRANDOM-19070,126.634329,31.658582,4,22409.0,44.818,IDEAS,38.979515,1.624146
4,LLMRANDOM-18919,214.727124,53.681781,4,37507.0,75.014,writer,17.880569,0.745024
5,LLMRANDOM-18918,214.766664,53.691666,4,37757.0,75.514,writer,17.409939,0.725414
6,LLMRANDOM-18917,221.509526,55.377381,4,39031.0,78.062,IDEAS,15.562873,0.648453
7,LLMRANDOM-18911,228.338371,57.084593,4,38868.0,77.736,IDEAS,16.349328,0.681222
8,LLMRANDOM-18903,315.469949,78.867487,4,50000.0,100.0,entropy,0.0,0.0
9,LLMRANDOM-18902,309.505661,77.376415,4,49572.0,99.144,entropy,0.668061,0.027836
