In [None]:
import matplotlib.pyplot as plt
import os
import re
import glob

: 

In [None]:
# Parse all subfolders and extract GPU counts and training times
base_dir = '/workspace/slime/experiments-elastic/gpu-hour-baselines/'


def parse_experiment_logs(log_paths):
    """Collect the GPU configuration and total training time of each experiment log.

    The GPU configuration is read from the name of the folder containing the
    log, which must start with ``<N>i_<M>t`` (e.g. '2i_1t' or '1i_1t_info'):
    N inference GPUs and M training GPUs. The training time is read from the
    first ``Total training time: <number>`` occurrence in the log.

    Args:
        log_paths: Iterable of log file paths.

    Returns:
        A list of dicts with keys 'folder', 'inference_gpus', 'training_gpus',
        'total_gpus' and 'training_time', sorted by 'total_gpus'. Entries with
        the same GPU count are ordered by log path.

    Logs whose folder name does not match the pattern are ignored; logs in a
    matching folder that contain no training-time line are reported and skipped.
    """
    records = []

    # glob returns paths in arbitrary order; sorting keeps the printed output
    # and the order of equal-GPU-count entries reproducible across runs.
    for log_path in sorted(log_paths):
        # Extract folder name to determine GPU configuration
        folder_name = os.path.basename(os.path.dirname(log_path))

        config_match = re.match(r'(\d+)i_(\d+)t', folder_name)
        if not config_match:
            continue
        inference_gpus = int(config_match.group(1))
        training_gpus = int(config_match.group(2))
        gpu_count = inference_gpus + training_gpus

        with open(log_path, 'r') as f:
            content = f.read()

        # Capture a plain decimal number only. A greedy [\d.]+ would also
        # swallow trailing punctuation ("120.5.") and make float() raise.
        time_match = re.search(r'Total training time: (\d+(?:\.\d*)?|\.\d+)', content)
        if not time_match:
            print(f"Skipping {log_path}: no 'Total training time' entry found")
            continue

        training_time = float(time_match.group(1))
        records.append({
            'folder': folder_name,
            'inference_gpus': inference_gpus,
            'training_gpus': training_gpus,
            'total_gpus': gpu_count,
            'training_time': training_time
        })
        print(f"{folder_name}: {gpu_count} GPUs, {training_time:.2f}s")

    # Sort by total GPUs (stable, so ties keep path order)
    records.sort(key=lambda record: record['total_gpus'])
    return records


# Find all log files
log_files = glob.glob(os.path.join(base_dir, '*/*_log.txt'))

data = parse_experiment_logs(log_files)

# Extract arrays for plotting
total_gpus = [d['total_gpus'] for d in data]
training_times = [d['training_time'] for d in data]
gpu_hours = [d['training_time'] * d['total_gpus'] / 3600 for d in data]  # Convert to hours

print(f"\nParsed {len(data)} experiments")

In [None]:
def simulate_total_elastic_time(
    global_batch_size: int,
    total_gpus_used: int,
    number_of_dedicated_inference_gpus: float,
    number_of_elastic_gpus: int,
    gpu_inference_throughput: float,
    gpu_training_throughput: float,
    training_to_inference_cost: float,
    inference_to_training_cost: float,
    num_rollouts: int = 5,
) -> float:
    """Estimate the total time of an elastic run in which GPUs switch between inference and training.

    Model implemented here:
      * The first batch is generated with every GPU doing inference.
      * Each rollout then pays the inference->training switch cost and trains
        on the elastic GPUs.
      * After every training step except the last, the elastic GPUs pay the
        training->inference switch cost. While they were switching and
        training, the dedicated inference GPUs kept generating; any samples
        still missing for the next batch are generated with all GPUs.

    Args:
        global_batch_size: Number of samples per rollout / training step.
        total_gpus_used: Total number of GPUs (all of them serve inference
            when not training).
        number_of_dedicated_inference_gpus: GPUs that only ever do inference.
        number_of_elastic_gpus: GPUs that perform the training step.
        gpu_inference_throughput: Samples generated per unit time per GPU.
        gpu_training_throughput: Samples trained per unit time per GPU.
        training_to_inference_cost: Time to switch from training to inference.
        inference_to_training_cost: Time to switch from inference to training.
        num_rollouts: Number of rollout/training iterations to simulate.

    Returns:
        The simulated total time, in the time unit of the throughputs/costs.
    """
    dedicated_inference_throughput = number_of_dedicated_inference_gpus * gpu_inference_throughput
    total_inference_throughput = total_gpus_used * gpu_inference_throughput

    # Every iteration is identical, so compute the per-iteration quantities once.
    training_time = global_batch_size / (gpu_training_throughput * number_of_elastic_gpus)
    switch_and_train_time = inference_to_training_cost + training_time + training_to_inference_cost
    # Samples of the next batch not yet produced by the dedicated inference
    # GPUs while the elastic GPUs were switching and training.
    remaining_samples = global_batch_size - dedicated_inference_throughput * switch_and_train_time
    if remaining_samples > 0:
        catch_up_time = remaining_samples / total_inference_throughput
    else:
        catch_up_time = 0.0

    # The first rollout runs with every GPU doing inference.
    elapsed = global_batch_size / total_inference_throughput
    for rollout in range(num_rollouts):
        elapsed += inference_to_training_cost
        elapsed += training_time

        if rollout == num_rollouts - 1:
            break  # Last training done, no need to switch back

        elapsed += training_to_inference_cost
        elapsed += catch_up_time
    return elapsed

In [None]:
def simulate_sync_total_time(global_batch_size, total_gpus_used, gpu_inference_throughput, gpu_training_throughput, num_rollouts: int = 5):
    """Estimate the total time of a fully synchronous run.

    Every rollout first generates a batch with all GPUs doing inference and
    then trains on that batch with all GPUs; the two phases never overlap.

    Args:
        global_batch_size: Number of samples per rollout / training step.
        total_gpus_used: Number of GPUs used for both phases.
        gpu_inference_throughput: Samples generated per unit time per GPU.
        gpu_training_throughput: Samples trained per unit time per GPU.
        num_rollouts: Number of rollout/training iterations.

    Returns:
        num_rollouts * (inference time + training time) for one batch.
    """
    total_inference_throughput = total_gpus_used * gpu_inference_throughput
    # Use the function's own GPU count. The notebook-level `total_gpus` is a
    # list of measured configurations and must not leak into this calculation.
    total_training_throughput = total_gpus_used * gpu_training_throughput

    inference_time = global_batch_size / total_inference_throughput
    training_time = global_batch_size / total_training_throughput
    return num_rollouts * (inference_time + training_time)

In [None]:
def simulate_one_step_overlap_total_time(global_batch_size, total_gpus_used, num_inference_gpus, num_training_gpus, gpu_inference_throughput, gpu_training_throughput, num_rollouts: int = 5):
    """Estimate the total time when inference and training run on separate GPU pools.

    One batch takes `global_batch_size / (gpus * per-GPU throughput)` on each
    pool. The total is one inference pass plus one training pass, plus
    `num_rollouts - 1` further steps that each cost the slower of the two.

    `total_gpus_used` is accepted for signature parity with the other
    simulators but is not used in the calculation.
    """
    inference_capacity = num_inference_gpus * gpu_inference_throughput
    training_capacity = num_training_gpus * gpu_training_throughput

    rollout_duration = global_batch_size / inference_capacity
    train_duration = global_batch_size / training_capacity

    # Steps after the first are bounded by whichever stage is slower.
    bottleneck_duration = max(rollout_duration, train_duration)
    overlapped_steps = num_rollouts - 1
    return rollout_duration + train_duration + overlapped_steps * bottleneck_duration

In [None]:
# Experimental: calculating the elastic's theoretical best
total_gpus_elastic = list(range(2, 8))
elastic_theoretical_time = []
# These constants are measured
GPU_INFERENCE_THROUGHPUT = 1
GPU_TRAINING_THROUGHPUT = 2.5
TRAINING_TO_INFERENCE = 1.8
INFERENCE_TO_TRAINING = 3.01
# test the value of a setting
# The simulator defined above is `simulate_total_elastic_time`; the previous
# call to `simulate_total_time` referenced a name that does not exist.
# Keyword arguments make the mapping of the eight inputs explicit.
time = simulate_total_elastic_time(
    global_batch_size=256,
    total_gpus_used=2,
    number_of_dedicated_inference_gpus=1,
    number_of_elastic_gpus=1,
    gpu_inference_throughput=GPU_INFERENCE_THROUGHPUT,
    gpu_training_throughput=GPU_TRAINING_THROUGHPUT,
    training_to_inference_cost=TRAINING_TO_INFERENCE,
    inference_to_training_cost=INFERENCE_TO_TRAINING,
)
time

In [None]:
# Dual-axis figure: training time on the left axis, GPU-hours on the right axis
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left y-axis: total training time per GPU count
time_color = 'tab:blue'
time_line = ax1.plot(
    total_gpus, training_times, 'o-',
    color=time_color, linewidth=2, markersize=8, label='Training Time',
)
ax1.set_xlabel('Total GPUs', fontsize=12)
ax1.set_ylabel('Training Time (seconds)', color=time_color, fontsize=12)
ax1.tick_params(axis='y', labelcolor=time_color)
ax1.set_xticks(total_gpus)

# Right y-axis: GPU-hours, sharing the x-axis with the time curve
ax2 = ax1.twinx()
cost_color = 'tab:orange'
cost_line = ax2.plot(
    total_gpus, gpu_hours, 's--',
    color=cost_color, linewidth=2, markersize=8, label='GPU-Hours',
)
ax2.set_ylabel('GPU-Hours', color=cost_color, fontsize=12)
ax2.tick_params(axis='y', labelcolor=cost_color)

# One legend covering the curves from both axes
lines = [*time_line, *cost_line]
labels = [handle.get_label() for handle in lines]
ax1.legend(lines, labels, loc='upper right', fontsize=10)

plt.title('Elastic Training: Time and GPU-Hours vs Total GPUs', fontsize=14)
fig.tight_layout()
plt.grid(True, alpha=0.3)
plt.show()

# Summary table: header, separator, then one row per parsed experiment
summary_rows = [
    "\nSummary:",
    f"{'GPUs':<6} {'Time (s)':<12} {'GPU-Hours':<12}",
    "-" * 30,
]
for record in data:
    record_gpu_hours = record['training_time'] * record['total_gpus'] / 3600
    summary_rows.append(
        f"{record['total_gpus']:<6} {record['training_time']:<12.2f} {record_gpu_hours:<12.4f}"
    )
print("\n".join(summary_rows))