In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta

In [None]:
# Relative path to the runs CSV (file name suggests a Weights & Biases export — confirm);
# it is resolved against the notebook's current working directory.
csv_path_jobs = "wandb_runs_time-to-train.csv"
df_jobs = pd.read_csv(csv_path_jobs)
# Quick sanity check: print (rows, columns), then display two randomly chosen rows.
# No random_state is passed, so the sampled rows differ between runs.
print(df_jobs.shape)
df_jobs.sample(2)

In [None]:
def get_tpu_type(run_name: str) -> str:
    """Build a ``"<tpu type>(<zone>)"`` label from a run name.

    Args:
        run_name: Name of the run; only substring checks are performed on it.

    Returns:
        ``"v4-256"`` when the name contains ``"v4"`` and ``"v5lite-256"``
        otherwise, followed by ``"(eu)"`` when the name contains
        ``"europe-west4-b"`` and ``"(us)"`` otherwise.
    """
    # "v4" is a substring of "v4-256", so a single membership test covers both spellings.
    accelerator = "v4-256" if "v4" in run_name else "v5lite-256"
    # Only the europe-west4-b zone maps to "eu"; every other name falls back to "us".
    region = "eu" if "europe-west4-b" in run_name else "us"
    return f"{accelerator}({region})"

# Label every run with the "<tpu type>(<zone>)" string derived from its run name
# (adds a new "tpu_type" column to df_jobs), then show how many runs carry each label.
df_jobs["tpu_type"] = df_jobs["run_name"].apply(get_tpu_type)
df_jobs["tpu_type"].value_counts()

In [None]:
# List the columns available in the runs table.
df_jobs.columns

In [None]:
# Peek at the first create_time values; get_tpu_hours parses this column with pd.to_datetime.
df_jobs["create_time"].head()

In [None]:
# Peek at the first heartbeat_time values; get_tpu_hours uses this column as the end of each run.
df_jobs["heartbeat_time"].head()

In [None]:
# Peek at the first runtime values. The unit is not evident from this notebook — TODO confirm
# (get_tpu_hours spreads this value over clock hours without converting it).
df_jobs["runtime"].head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _hourly_runtime_records(df):
    """Spread each run's ``runtime`` over the clock hours between its create and heartbeat times.

    Args:
        df: DataFrame with datetime ``create_time`` / ``heartbeat_time`` columns plus
            ``runtime`` and ``tpu_type`` columns.

    Returns:
        A list of ``{'hour', 'tpu_type', 'active_hours'}`` dicts, one per (run, clock hour).
        Runs whose create or heartbeat time is missing (NaT) contribute no records.
    """
    one_hour = pd.Timedelta(hours=1)
    records = []
    run_fields = zip(df['create_time'], df['heartbeat_time'], df['runtime'], df['tpu_type'])
    for created, last_seen, runtime, tpu_type in run_fields:
        # pd.date_range rejects NaT endpoints, so a run without both timestamps is skipped.
        if pd.isna(created) or pd.isna(last_seen):
            continue

        # Runtime is assumed to accrue uniformly between creation and the last heartbeat.
        span_hours = (last_seen - created).total_seconds() / 3600
        runtime_per_hour = runtime / span_hours if span_hours > 0 else 0

        # Left edges of every clock hour touched by the run; the final edge only closes
        # the last bucket, hence the [:-1] below. 'h' replaces the deprecated 'H' alias.
        bucket_edges = pd.date_range(start=created.floor('h'), end=last_seen.ceil('h'), freq='h')
        for hour_start in bucket_edges[:-1]:
            overlap = min(hour_start + one_hour, last_seen) - max(hour_start, created)
            records.append({
                'hour': hour_start,
                'tpu_type': tpu_type,
                'active_hours': runtime_per_hour * overlap.total_seconds() / 3600,
            })
    return records


def get_tpu_hours(df):
    """Plot and summarise per-hour active time for each TPU type.

    Each run's ``runtime`` is distributed uniformly over the interval from ``create_time``
    to ``heartbeat_time`` and bucketed into clock hours; the buckets are summed per
    ``tpu_type``, drawn as one line per type, and total / peak / mean values are printed.

    NOTE(review): ``runtime`` is used as-is, so the plotted "Active Hours" are in whatever
    unit ``runtime`` is stored in — confirm it is hours.

    Hours in which no run was active are absent from the table rather than zero-filled,
    so the printed mean is taken over hours that have at least one run.

    Args:
        df: DataFrame with ``create_time``, ``heartbeat_time``, ``runtime`` and ``tpu_type``
            columns. Side effect: the two time columns are converted to datetime in place.

    Returns:
        None. Output is the figure and the printed statistics. If no run has both
        timestamps, a message is printed and nothing is plotted.
    """
    # Convert time columns to datetime if they aren't already (kept in place on the
    # caller's frame, as before, so later cells still see datetime columns).
    df['create_time'] = pd.to_datetime(df['create_time'])
    df['heartbeat_time'] = pd.to_datetime(df['heartbeat_time'])

    records = _hourly_runtime_records(df)
    if not records:
        # Nothing to aggregate: avoids the concat/groupby failure on an empty input.
        print("No runs with valid create/heartbeat times; nothing to plot.")
        return

    # One DataFrame built once from all records, then hour x tpu_type totals.
    hourly_data = pd.DataFrame(records)
    active_hours = hourly_data.groupby(['hour', 'tpu_type'])['active_hours'].sum().unstack(fill_value=0)

    # Plotting
    fig, ax = plt.subplots(figsize=(15, 8))
    for tpu in active_hours.columns:
        ax.plot(active_hours.index, active_hours[tpu], label=tpu)

    ax.set_xlabel('Time')
    ax.set_ylabel('Active Hours')
    ax.set_title('Active Hours by TPU Type Over Time (Hourly Windows)')
    ax.legend(title='TPU Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, linestyle='--', alpha=0.7)

    # Rotate and align the date tick labels
    fig.autofmt_xdate()

    fig.tight_layout()
    plt.show()

    # Print some statistics
    print("Total active hours by TPU type:")
    print(active_hours.sum())

    print("\nPeak active hours by TPU type:")
    print(active_hours.max())

    print("\nAverage active hours by TPU type:")
    print(active_hours.mean())

# Plot and summarise hourly active time per TPU type for all loaded runs.
# Note: this call converts df_jobs' create_time / heartbeat_time columns to datetime in place.
get_tpu_hours(df_jobs)
