# Energy consumption of training Stable Diffusion
Author: Mathilde Jay    
Date: 01/08/2023

### Summary
- Analysis of energy consumed
- Analysis of power timeseries
    - by components
    - in total
- Analysis of ML stats

The results were first processed using the script "process_results.py" : 
```
python utils/process_results.py --analysis_git_dir "/home/mjay/ai-energy-consumption-framework" --result_folder "/home/mjay/laion/pokemon/training_13_09_sirius"
```

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import seaborn as sns

## Description of results

In [2]:
# Result folders to analyse. Keep the trailing "/": file names are appended
# to these strings as-is.
files = [
    # "/home/mjay/laion/pokemon/training_6_09_sirius/",
    # "/home/mjay/laion/pokemon/training_11_09_sirius/",
    "/home/mjay/laion/pokemon/training_13_09_sirius/",
]
# Read every table first and concatenate once: growing a DataFrame inside the
# loop copies all the rows gathered so far at each iteration.
tables = [pd.read_csv(f + "processed_table.csv") for f in files]
# NOTE: each table keeps its own row index, so index values repeat when
# several folders are listed.
exp_table = pd.concat(tables) if tables else pd.DataFrame()
# Overview of the experiments, one column per experiment, leaving out every
# column whose name contains "tool", "id" or "dir".
cols = [
    x for x in exp_table.columns
    if "tool" not in x and "id" not in x and "dir" not in x
]
exp_table[cols].T

Unnamed: 0,0,1,2,3,4,5,6,7
experiment_start,1694625636.262681,1694628187.636724,1694631642.836078,1694636053.891289,1694643599.442096,1694654351.129999,1694668379.697234,1694670915.720232
experiment_end,1694628183.3256,1694631638.365082,1694636048.658032,1694643595.090788,1694654345.745601,1694668372.671182,1694670911.678811,1694674351.623191
period,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
results,,,,,,,,
execution_script_args.sleep_before,90,90,90,90,90,90,90,90
execution_script_args.sleep_after,90,90,90,90,90,90,90,90
execution_script_args.benchmark_execution,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...,accelerate launch /home/mjay/ai-energy-consump...
execution_script_template,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...,/home/mjay/ai-energy-consumption-framework//ut...
execution_script_path,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...,/home/mjay/laion/pokemon/training_13_09_sirius...
host_name,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n,sirius-1.lyon.grid5000.fr\n


In [3]:
# Hyper-parameter values an experiment must have to be kept for the plots.
parameter_selection = {
    "model_version": "v1-1",
    "train_batch_size": 4,
    "gradient_accumulation_steps": 2,
    "max_train_samples": 400,
    "num_train_epochs": 1,
    "resolution": 512,
}

# Narrow a copy of exp_table one parameter at a time: only the rows matching
# every requested value remain.
selected_df = exp_table.copy()
for column, wanted in parameter_selection.items():
    matches = selected_df[column] == wanted
    selected_df = selected_df[matches]
selected_df  # [["model_version"]]

Unnamed: 0,experiment_start,experiment_end,tool_csv_file_nvml,tool_csv_file_rapl,tool_csv_file_sysinfo,tool_energy_consumption(kWh),tool_GPU_energy_consumption(kWh),tool_CPU_energy_consumption(kWh),tool_RAM_energy_consumption(kWh),tool_GPU_utilization(percent),...,seed,dataloader_num_workers,output_dir,max_train_samples,validation_prompts,model_version,sensor_exp_energy_consumption(kWh),sensor_bench_energy_consumption(kWh),wattmetre_exp_energy_consumption(kWh),wattmetre_bench_energy_consumption(kWh)


In [4]:
# Load the timeseries of every result folder and tag each row with the path of
# the experiment table it comes from. The frames are concatenated once instead
# of growing energy_df inside a loop.
frames = [
    pd.read_csv(f + "timeseries.csv").assign(exp_table_path=f + "processed_table.csv")
    for f in files
]
energy_df = pd.concat(frames) if frames else pd.DataFrame()
# Drop the per-file frames: their rows now live in energy_df.
del frames

: 

#### Code for components

In [None]:

def plot_energy_components(ax, energy_df, index, energy_col, x_col):
    """Draw one curve per measured component for a single experiment.

    The rows of ``energy_df`` whose ``exp_table_index`` equals ``index`` are
    split by ``device_index`` (curves labelled "GPU energy <device_index>")
    and by ``(domain, socket)`` pair (curves labelled with the pair itself).

    Args:
        ax: Matplotlib axes to draw on.
        energy_df: Timeseries table with at least the columns
            ``exp_table_index``, ``device_index``, ``domain``, ``socket``,
            ``x_col`` and ``energy_col``.
        index: Value of ``exp_table_index`` selecting the experiment.
        energy_col: Column plotted on the y-axis.
        x_col: Column plotted on the x-axis.

    Returns:
        The axes passed in, with a legend below the plot and ``index`` as title.
    """
    plot_df = energy_df[energy_df["exp_table_index"] == index]
    # Group by the bare column name, not a one-element list: with a list,
    # pandas >= 2 yields 1-tuples as keys and the legend would read
    # "GPU energy (0.0,)".
    for device, df in plot_df.groupby("device_index"):
        df.plot(
            x=x_col,
            y=energy_col,
            ax=ax,
            label="GPU energy " + str(device),
        )
    for label, df in plot_df.groupby(["domain", "socket"]):
        df.plot(
            x=x_col,
            y=energy_col,
            ax=ax,
            label=label,
        )
    ax.legend(
        bbox_to_anchor=(0.5, -0.1),
        loc='upper center',
        fontsize=20,
        ncol=2,
    )
    ax.set_title(index)
    return ax

#### Code for total energy

In [None]:
def plot_energy_total(ax, energy_df, index, energy_col, x_col):
    """Draw the summed GPU, RAM and CPU curves of a single experiment.

    For the rows of ``energy_df`` whose ``exp_table_index`` equals ``index``,
    ``energy_col`` is summed per value of ``x_col`` over three selections:
    rows with a ``device_index`` ("GPU energy"), rows whose ``domain`` is
    "Dram" ("RAM energy") and rows whose ``domain`` is "Package"
    ("CPU energy"). A selection without any row is skipped.

    Args:
        ax: Matplotlib axes to draw on.
        energy_df: Timeseries table with at least the columns
            ``exp_table_index``, ``device_index``, ``domain``, ``x_col`` and
            ``energy_col``.
        index: Value of ``exp_table_index`` selecting the experiment.
        energy_col: Column summed and plotted on the y-axis.
        x_col: Column used to group the rows; becomes the x-axis.

    Returns:
        The axes passed in, with a legend below the plot and the title
        "Energy of components".
    """
    plot_df = energy_df[energy_df["exp_table_index"] == index]
    components = (
        ("GPU energy", plot_df["device_index"].notna()),
        ("RAM energy", plot_df["domain"] == "Dram"),
        ("CPU energy", plot_df["domain"] == "Package"),
    )
    for label, mask in components:
        # Sum only the plotted column: summing the whole frame also
        # aggregates the text columns, which is wasted work at best and an
        # error on columns that cannot be summed.
        total = plot_df[mask].groupby(x_col)[energy_col].sum()
        if total.empty:
            # Component absent from the measurements: nothing to draw.
            continue
        total.plot(ax=ax, label=label)
    ax.legend(
        bbox_to_anchor=(0.5, -0.1),
        loc='upper center',
        fontsize=20,
        ncol=2,
    )
    ax.set_title("Energy of components")
    return ax
    

#### Code for usage

In [None]:
def plot_utilization(ax, energy_df, index, x_col):
    """Draw the utilization curves of a single experiment.

    For the rows of ``energy_df`` whose ``exp_table_index`` equals ``index``:
    rows with a ``device_index`` give the "GPU utilization"
    (``global_utilization_percent``) and "GPU mem utilization"
    (``global_memory_percent``) curves; rows with a ``utilization_percent``
    value are averaged per value of ``x_col`` and give the
    "CPU avg utilization" curve. A selection without any row is skipped.

    Args:
        ax: Matplotlib axes to draw on.
        energy_df: Timeseries table with at least the columns
            ``exp_table_index``, ``device_index``, ``utilization_percent``,
            ``global_utilization_percent``, ``global_memory_percent`` and
            ``x_col``.
        index: Value of ``exp_table_index`` selecting the experiment.
        x_col: Column plotted on the x-axis.

    Returns:
        The axes passed in, with a legend below the plot.
    """
    plot_df = energy_df[energy_df["exp_table_index"] == index]

    gpu_rows = plot_df[plot_df["device_index"].notna()]
    if not gpu_rows.empty:
        gpu_rows.plot(
            x=x_col, y="global_utilization_percent", ax=ax, label="GPU utilization"
        )
        gpu_rows.plot(
            x=x_col, y="global_memory_percent", ax=ax, label="GPU mem utilization"
        )

    cpu_rows = plot_df[plot_df["utilization_percent"].notna()]
    # Average only the plotted column: taking the mean of the whole frame
    # fails on its non-numeric columns with pandas >= 2.
    cpu_avg = cpu_rows.groupby(x_col)["utilization_percent"].mean()
    if not cpu_avg.empty:
        cpu_avg.plot(ax=ax, label="CPU avg utilization")

    ax.legend(
        bbox_to_anchor=(0.5, -0.1),
        loc='upper center',
        fontsize=20,
        ncol=2,
    )
    return ax

### Let's plot all this

In [None]:
# Column of the timeseries plotted on the y-axis. Both candidates are kept:
# the second assignment overrides the first, so power is what gets plotted.
# Swap the two lines to plot the per-measurement energy instead.
energy_col = "energy_consumption_since_previous_measurement_milliJ"
energy_col = "power(W)"
# Column of the timeseries plotted on the x-axis.
timestamp_col="timestamp"

In [None]:
figsize = (50, 10)
n_experiments = len(selected_df)
if n_experiments == 0:
    # plt.subplots rejects a grid with zero columns.
    print("No experiment matches parameter_selection: nothing to plot.")
else:
    # One figure per kind of plot, one column per selected experiment.
    # squeeze=False + ravel() always gives a 1-D array of axes; by default
    # plt.subplots returns a bare Axes when there is a single column, and
    # indexing it with [i] fails.
    # NOTE: despite their names, ax_t holds the utilization plots and ax_u
    # the total-energy plots.
    fig, ax_c = plt.subplots(1, n_experiments, figsize=figsize, squeeze=False)
    ax_c = ax_c.ravel()
    fig, ax_t = plt.subplots(1, n_experiments, figsize=figsize, squeeze=False)
    ax_t = ax_t.ravel()
    fig, ax_u = plt.subplots(1, n_experiments, figsize=figsize, squeeze=False)
    ax_u = ax_u.ravel()
    for i, index in enumerate(selected_df.index):
        plot_energy_components(ax_c[i], energy_df, index, energy_col, timestamp_col)
        plot_utilization(ax_t[i], energy_df, index, timestamp_col)
        ax = plot_energy_total(ax_u[i], energy_df, index, energy_col, timestamp_col)

        # Red vertical markers at the start and end of the benchmark, drawn on
        # the total-energy plot only.
        bench_start = selected_df.loc[index, 'bench_start(msec)']
        bench_end = selected_df.loc[index, 'bench_end(msec)']
        for marker in (bench_start, bench_end):
            ax.vlines(
                x=marker,
                ymin=0,
                ymax=1000,
                color="red",
            )
    

We can see that there are drops in the power consumed by the GPU - probably at the end of each epoch, since there are 15 of them. The drop is always longer at the end of the first epoch; maybe an evaluation is done?

#### Stats from the tensorboard
I wanted to add the data from the tensorboard, which is supposed to record the time at which each epoch/step starts. But as you can see below, the timestamps are identical for the first 5 epochs, which makes me think that the GPUs don't synchronize very often. I was not able to fix this issue.
I also had a timestamp conversion issue that prevented me from adding tags to the power timeseries.

The loss doesn't seem to decrease much during the first 15 epochs - but our dataset is very small.

In [None]:
# Colour of the vertical marker drawn for each tensorboard tag. Built once,
# outside the loop, with a single entry per tag.
color_dict = {
    'epoch_start': "blue",
    'epoch_end': "darkblue",
    'step_end': "darkred",
    'step_start': "red",
    'train_loss': "lightgrey",
    'validation_timestamp': "purple",
    'validation_start': "purple",
    'validation_end': "purple",
    'backpropagate_start': "pink",
    'backpropagate_end': "pink",
    'get_losses_end': "blue",
    'get_losses_start': "blue",
    'saving_results': "black",
    'sync_gradients': "grey",
}
# Tags are selected by substring below, so a tag may be longer than any key of
# color_dict; such tags get the colour of the validation markers.
default_color = color_dict['validation_start']

for index in selected_df.index:
    print(index)
    fig, ax = plt.subplots(figsize=(15, 10))

    # Per-experiment values are read from selected_df, the table the loop
    # iterates over.
    ml_df = pd.read_csv(selected_df.loc[index, 'result_dir'] + "logs/tb.csv")
    # Keep the events whose tag contains "validation_start". .copy() makes
    # line_df independent of ml_df, so adding a column below does not write
    # into a slice (SettingWithCopyWarning).
    is_marker = ml_df["tag"].str.contains("validation_start", regex=False, na=False)
    line_df = ml_df[is_marker].copy()
    # NOTE(review): "value" presumably holds a time in seconds, converted here
    # to the milliseconds of the power timeseries - confirm.
    line_df["timestamp"] = line_df["value"] * 1000

    ax = plot_energy_total(ax, energy_df, index, energy_col, timestamp_col)

    # Red vertical markers at the start and end of the benchmark.
    bench_start = selected_df.loc[index, 'bench_start(msec)']
    bench_end = selected_df.loc[index, 'bench_end(msec)']
    for marker in (bench_start, bench_end):
        ax.vlines(
            x=marker,
            ymin=0,
            ymax=1000,
            color="red",
        )
    # One vertical marker per selected tensorboard event.
    for timestamp, tag in zip(line_df["timestamp"].values, line_df["tag"].values):
        ax.vlines(
            x=timestamp,
            ymin=0,
            ymax=1000,
            color=color_dict.get(tag, default_color),
        )
    ax.set_title(f"Index = {index}")
    ax.legend(
        bbox_to_anchor=(0.5, -0.1),
        loc='upper center',
        fontsize=20,
        ncol=2,
    )