In [1]:
from datetime import datetime, timedelta, timezone

import neptune as neptune
import pandas as pd
import pytz


In [2]:
def get_training_progress_with_eta(runs_table, run_specs, active=False):
    """Summarise progress and estimated remaining time for the selected runs.

    Args:
        runs_table: DataFrame of runs. Must contain the columns "step",
            "args/n_steps", "sys/creation_time" (timezone-aware timestamps)
            and "sys/hostname", plus every column named in ``run_specs``.
        run_specs: Mapping of column name -> required value. Only rows whose
            column equals the value are kept; a value of ``None`` skips the
            filter for that column, but the column is still copied into the
            result.
        active: Currently unused; kept so existing callers keep working.

    Returns:
        ``None`` (after printing a message) when no run matches. Otherwise a
        DataFrame with one row per run and the columns "time_running" (hours,
        rounded to 2 decimals), "time_left (hours)" (``None`` when the run has
        not logged a positive step), "%finished", "step", "host", followed by
        one column per key of ``run_specs``.
    """
    # Keep only the runs that match every non-None specification.
    for column, value in run_specs.items():
        if value is not None:
            runs_table = runs_table[runs_table[column] == value]
    print(runs_table.shape)

    if runs_table.empty:
        print("No runs found with the specified criteria.")
        return None

    # One reference instant for every row. The difference between two aware
    # datetimes does not depend on the timezone used for "now", so UTC is fine.
    current_time = datetime.now(timezone.utc)

    results = []
    for _, run in runs_table.iterrows():
        current_step = run["step"]
        n_steps = run["args/n_steps"]
        hours_running = (current_time - run["sys/creation_time"]).total_seconds() / 3600

        # Percentage of the planned steps that are done.
        if n_steps > 0:
            percent_finished = (current_step / n_steps) * 100
        else:
            percent_finished = 0.0

        # Linear extrapolation of the remaining time. It uses the unrounded
        # elapsed hours: rounding first would turn the first seconds of a run
        # into 0.0 hours and therefore into an ETA of 0.
        if current_step > 0:
            hours_per_step = hours_running / current_step
            # A run that went past its planned length has nothing left to do.
            remaining_steps = max(n_steps - current_step, 0)
            time_left = hours_per_step * remaining_steps
        else:
            # Also reached when the step is NaN (nothing logged yet).
            time_left = None

        result = {
            "time_running": round(hours_running, 2),
            "time_left (hours)": time_left,
            "%finished": percent_finished,
            "step": current_step,
            "host": run['sys/hostname']
        }

        # Echo the columns that were used as specifications.
        for column in run_specs.keys():
            result[column] = run.get(column, None)

        results.append(result)

    return pd.DataFrame(results)

In [3]:
# Open the Neptune project read-only. The API token is deliberately not
# hard-coded in the notebook: with no `api_token` argument the Neptune client
# takes it from the NEPTUNE_API_TOKEN environment variable.
project = neptune.init_project(
    project="pmtest/llm-random",
    mode="read-only",
)

# Fetch the currently active runs carrying the scaling-grid tag as a DataFrame.
runs_table = project.fetch_runs_table(tag=["constrained_scaling_grid_final"], state='active').to_pandas()
print(runs_table.shape)

https://app.neptune.ai/pmtest/llm-random/
(6, 544)


In [4]:
# Human-readable model-size label -> numeric value to match against a run
# column; the "none" entry maps to None, i.e. no value to match.
model_sizes = dict([
    ("none", None),
    ("4M", 3_145_728),
    ("9M", 7_077_888),
    ("33M", 25_165_824),
    ("65M", 49_152_000),
    ("113M", 84_934_656),
    ("260M", 201_326_592),
    ("520M", 393_216_000),
])

# Human-readable training-length label -> number of training steps to match
# against a run column; the "none" entry maps to None, i.e. no value to match.
training_lengths = dict([
    ("none", None),
    ("500M", 1_907),
    ("1B", 3_814),
    ("2B", 7_628),
    ("4B", 15_256),
    ("8B", 30_512),
    ("16B", 61_024),
    ("32B", 122_048),
    ("64B", 244_150),
    ("128B", 488_300),
])

In [5]:
# Columns to report on. A None value disables filtering on that column, but the
# column is still copied into the report.
run_specs = {
    "args/n_steps": training_lengths['128B'],
    "args/model_n_active": model_sizes['none'],
    "args/expansion_rate": None,
    "sys/state": None,
}
active = True

result = get_training_progress_with_eta(runs_table, run_specs, active=active)
# The helper returns None when no run matches, so only post-process a real table.
if result is not None:
    # Express "args/model_n_active" in millions, rounded to a whole number.
    result["args/model_n_active"] = (result["args/model_n_active"] / 1000000).round().astype(int)
    # Coerce first: when no run has made progress the column holds only None
    # (object dtype), which cannot be divided.
    result["time_left (d)"] = (pd.to_numeric(result["time_left (hours)"], errors="coerce") / 24).round(2)
    print(result)

(6, 544)
   time_running  time_left (hours)  %finished      step     host  \
0         67.96         190.406627  26.303707  128441.0  4124gs0   
1         92.19         686.869187  11.833504   57783.0    gpu01   
2        107.19                NaN        NaN       NaN  login01   
3        141.20                NaN        NaN       NaN  login01   
4        148.60                NaN        NaN       NaN  login01   
5        188.98                NaN        NaN       NaN  4124gs0   

   args/n_steps  args/model_n_active  args/expansion_rate sys/state  \
0        488300                   85                    1    Active   
1        488300                   25                   16    Active   
2        488300                   25                    1    Active   
3        488300                   49                   16    Active   
4        488300                   49                    1    Active   
5        488300                   85                   16    Active   

   time_left (d)