In [1]:
! pip install pandas \
    matplotlib



In [2]:
import glob as gb
from pathlib import Path

import pandas as pd

In [3]:
DIR = 'airflow/logs/dag_id=app_template/*/*/*'
# Number of leading lines dropped from every log file before parsing.
# NOTE(review): presumably the Airflow task-log preamble - confirm it is always 6 lines.
LOG_SKIP_ROWS = 6
# Only rows whose second space-separated field equals this tag are kept.
# NOTE(review): the tag appears to embed a line number of helpers.py, so it would
# need updating whenever that file is edited - confirm.
PROGRESS_TAG = '{helpers.py:110}'
# Where the parsed logs are persisted for later use.
LOGS_CSV = Path('tmp/logs.csv')
# Positional field index in a raw log line -> column name in the parsed frame.
col_names = {
    0: 'timestamp',
    4: 'tasks',
    5: 'batch_id',
    6: 'processed_%',
    7: 'seconds',
}


def read_task_log(path):
    """Parse one task log file into a frame of its ``PROGRESS_TAG`` rows.

    Args:
        path: Path of a log file matched by ``DIR``. Its parent directory must
            be named ``<key>=<task_type>[.<rest>]`` and its grandparent
            ``<key>=<prefix>__<run_id>``.

    Returns:
        DataFrame with the columns listed in ``col_names`` plus ``task_type``
        and ``run_id``; empty when the file has no ``PROGRESS_TAG`` rows.

    Raises:
        IndexError: If a directory name does not follow the layout above.
        KeyError: If the file has fewer fields per line than ``col_names`` needs.
    """
    task_dir = Path(path).parent
    # Names are taken relative to the file itself, so parsing does not depend
    # on how many components precede the run directory in DIR or on the OS
    # path separator.
    task_type = task_dir.name.split('=', 1)[1].split('.')[0]
    run_id = task_dir.parent.name.split('=', 1)[1].split('__')[1]

    raw = pd.read_csv(path, sep=' ', skiprows=LOG_SKIP_ROWS, header=None)
    # One chained expression yields a fresh frame: no in-place edits on a
    # filtered slice, hence no SettingWithCopyWarning.
    return (
        raw.loc[raw[1] == PROGRESS_TAG, list(col_names)]
        .rename(columns=col_names)
        .assign(task_type=task_type, run_id=run_id)
    )


# glob returns matches in arbitrary order; sorting keeps the row order (and the
# CSV written below) stable across runs.
logs_paths = sorted(gb.glob(DIR))
if not logs_paths:
    raise FileNotFoundError(f'No log files match {DIR!r}')

# Concatenate once instead of growing the frame inside the loop.
df_logs = pd.concat(
    [read_task_log(path) for path in logs_paths],
    ignore_index=True,
)

# Unparseable values become NaN / NaT rather than aborting the whole load.
df_logs['seconds'] = pd.to_numeric(df_logs['seconds'], errors='coerce')
df_logs['processed_%'] = pd.to_numeric(df_logs['processed_%'], errors='coerce')
df_logs['timestamp'] = pd.to_datetime(df_logs['timestamp'].str.strip('[]'), errors='coerce')

LOGS_CSV.parent.mkdir(parents=True, exist_ok=True)
df_logs.to_csv(LOGS_CSV, index=False)

display(df_logs.shape)
# Cap the sample size so the preview also works with fewer than 10 parsed rows.
df_logs.sample(min(10, len(df_logs)))

(2190, 7)

Unnamed: 0,timestamp,tasks,batch_id,processed_%,seconds,task_type,run_id
849,2025-06-08 15:59:07.481000+00:00,1-1-1,101,80.8,2.344,loaders,46e7fd0e-0f28-4e4f-813c-3e444d476c7b
1188,2025-06-08 16:37:26.085000+00:00,1-2-1,65,52.0,2.665,loaders,72d90dd1-eaab-4415-990e-40269e13c5c6
1140,2025-06-08 16:35:42.962000+00:00,1-2-1,17,13.6,2.119,loaders,72d90dd1-eaab-4415-990e-40269e13c5c6
1782,2025-06-08 16:14:39.907000+00:00,1-1-3,92,73.6,2.322,extractors,dc3dd13e-933a-4fd1-918b-75e13ff1cb1b
249,2025-06-08 16:19:23.733000+00:00,1-1-4,1,0.8,2.029,extractors,9290d4f5-5dad-4e54-b870-69ef3bff7d12
869,2025-06-08 15:59:49.950000+00:00,1-1-1,121,96.8,2.504,loaders,46e7fd0e-0f28-4e4f-813c-3e444d476c7b
300,2025-06-08 16:21:13.240000+00:00,1-1-4,52,41.6,1.804,extractors,9290d4f5-5dad-4e54-b870-69ef3bff7d12
145,2025-06-08 16:22:38.513000+00:00,1-1-4,53,42.4,3.0,transformers,9290d4f5-5dad-4e54-b870-69ef3bff7d12
2060,2025-06-08 16:06:01.418000+00:00,1-1-2,58,93.55,2.853,loaders,a70bf2db-3afb-4197-b0c9-72bf6d2a7c27
360,2025-06-08 16:23:22.234000+00:00,1-1-4,112,89.6,1.617,extractors,9290d4f5-5dad-4e54-b870-69ef3bff7d12


In [4]:
# Total seconds per run, one column per task type (0 where a run has no rows
# for a given type), plus the combined time of the three pipeline stages.
seconds_by_stage = df_logs.groupby(['run_id', 'task_type'])['seconds'].sum()
stage_columns = ['extractors', 'transformers', 'loaders']
df_grouped = (
    seconds_by_stage
    .unstack(fill_value=0)
    .reset_index()
    .assign(total_pipeline_time=lambda wide: wide[stage_columns].sum(axis=1))
)

# Distinct (run_id, tasks) pairs, with `tasks` exposed as `config`.
run_to_tasks = df_logs[['run_id', 'tasks']].drop_duplicates()
config_map = run_to_tasks.rename(columns={'tasks': 'config'})

# Attach the config to each run's timings (inner join on run_id).
df_summary = pd.merge(df_grouped, config_map, on='run_id')
df_summary


Unnamed: 0,run_id,extractors,loaders,transformers,total_pipeline_time,config
0,46e7fd0e-0f28-4e4f-813c-3e444d476c7b,270.937,260.908,467.145,998.99,1-1-1
1,72d90dd1-eaab-4415-990e-40269e13c5c6,217.346,218.058,413.817,849.221,1-2-1
2,9290d4f5-5dad-4e54-b870-69ef3bff7d12,264.009,263.656,461.06,988.725,1-1-4
3,a70bf2db-3afb-4197-b0c9-72bf6d2a7c27,265.507,261.419,462.148,989.074,1-1-2
4,c2e453ee-5c19-4a7f-8dd7-2020810bd5f8,261.719,265.516,463.598,990.833,1-1-5
5,dc3dd13e-933a-4fd1-918b-75e13ff1cb1b,255.253,256.137,463.215,974.605,1-1-3
