In [5]:
! pip install pandas \
    matplotlib



In [6]:
import glob as gb
import os

import pandas as pd

In [7]:
# Gather the progress lines written by helpers.py from every task log matched by
# DIR into a single frame, convert the columns to proper dtypes and cache the
# result as CSV.
DIR = 'airflow/logs/dag_id=app_template/*/*/*'
# Only rows whose second space-separated field equals this marker are kept.
# NOTE(review): the marker embeds a source line number, so it has to be updated
# whenever the logging call in helpers.py moves.
LOG_SOURCE = '{helpers.py:110}'
# Positional field of the space-split log line -> column name in the result.
col_names = {
    0: 'timestamp',
    4: 'tasks',
    5: 'batch_id',
    6: 'processed_%',
    7: 'seconds'
}

logs_paths = gb.glob(DIR)
if not logs_paths:
    # Fail with a clear message instead of a confusing KeyError further down.
    raise FileNotFoundError(f'No log files match {DIR!r}')

# Collect the per-file frames and concatenate once after the loop; growing a
# DataFrame with concat inside the loop re-copies all previous rows each time.
frames = []
for path in logs_paths:
    df = pd.read_csv(path, sep=' ', skiprows=6, header=None)
    # rename() returns a new frame, so the column assignments below do not
    # write into a slice of the parsed file (no SettingWithCopyWarning).
    df = (df.loc[df[1] == LOG_SOURCE, list(col_names)]
          .rename(columns=col_names))
    # Path components 3 and 4 are parsed as '<key>=<prefix>__<run id>' and
    # '<key>=<task type>.<rest>' respectively.
    # NOTE(review): presumably the Airflow run_id=/task_id= directories - confirm.
    parts = path.split('/')
    df['task_type'] = parts[4].split('=')[1].split('.')[0]
    df['run_id'] = parts[3].split('=')[1].split('__')[1]
    frames.append(df)

# ignore_index=True already yields a fresh 0..n-1 index.
df_logs = pd.concat(frames, ignore_index=True)

# Values that cannot be parsed become NaN / NaT instead of raising.
df_logs['seconds'] = pd.to_numeric(df_logs['seconds'], errors='coerce')
df_logs['processed_%'] = pd.to_numeric(df_logs['processed_%'], errors='coerce')
df_logs['timestamp'] = \
    pd.to_datetime(df_logs['timestamp'].str.strip('[]'), errors='coerce')

# to_csv does not create missing directories.
os.makedirs('tmp', exist_ok=True)
df_logs.to_csv('tmp/logs.csv', index=False)

display(df_logs.shape)
# Seeded so that re-running the cell shows the same rows; capped so that the
# preview also works when fewer than 10 rows were collected.
df_logs.sample(min(10, len(df_logs)), random_state=42)

(4004, 7)

Unnamed: 0,timestamp,tasks,batch_id,processed_%,seconds,task_type,run_id
2263,2025-06-08 16:45:03.132000+00:00,1-2-3,20,48.78,1.741,loaders,6729361f-0b1c-499a-97b7-c418ed90e7c8
1258,2025-06-08 15:56:19.098000+00:00,1-1-1,12,9.6,4.038,transformers,46e7fd0e-0f28-4e4f-813c-3e444d476c7b
229,2025-06-08 16:19:47.456000+00:00,1-1-4,12,38.71,1.675,loaders,9290d4f5-5dad-4e54-b870-69ef3bff7d12
3758,2025-06-08 16:59:10.249000+00:00,1-3-1,22,17.6,1.893,loaders,2233ce67-b6b7-4dd7-a8ba-33c2c5bf4afc
3779,2025-06-08 16:59:53.667000+00:00,1-3-1,43,34.4,1.457,loaders,2233ce67-b6b7-4dd7-a8ba-33c2c5bf4afc
1157,2025-06-08 15:56:47.826000+00:00,1-1-1,36,28.8,2.36,loaders,46e7fd0e-0f28-4e4f-813c-3e444d476c7b
3212,2025-06-08 16:05:15.892000+00:00,1-1-2,37,59.68,2.217,loaders,a70bf2db-3afb-4197-b0c9-72bf6d2a7c27
3027,2025-06-08 16:05:00.659000+00:00,1-1-2,39,62.9,2.724,loaders,a70bf2db-3afb-4197-b0c9-72bf6d2a7c27
3961,2025-06-08 16:59:25.517000+00:00,1-3-1,31,24.8,1.702,extractors,2233ce67-b6b7-4dd7-a8ba-33c2c5bf4afc
1857,2025-06-08 16:39:05.508000+00:00,1-2-1,112,89.6,1.972,extractors,72d90dd1-eaab-4415-990e-40269e13c5c6


In [8]:
# Per-run summary: how long each task type was logging, the longest of those
# durations, and the 'tasks' value of the run.
cols = ['extractors', 'transformers', 'loaders']

# First and last log timestamp per (run_id, task_type) and the seconds between them.
df_time_span = (
    df_logs
    .groupby(['run_id', 'task_type'])['timestamp']
    .agg(start_time='min', end_time='max')
    .reset_index()
    .assign(duration=lambda span: (span['end_time'] - span['start_time'])
            .dt.total_seconds())
)

# One row per run, one duration column per task type; missing cells become 0.
# total_pipeline_time is the largest of the three durations.
# NOTE(review): max (not sum) presumably reflects stages running concurrently - confirm.
df_grouped = (
    df_time_span
    .pivot(index='run_id', columns='task_type', values='duration')
    .fillna(0)
    .reset_index()
    .assign(total_pipeline_time=lambda wide: wide[cols].max(axis=1))
)

# Distinct (run_id, tasks) pairs, with 'tasks' exposed under the name 'config'.
config_map = (
    df_logs[['run_id', 'tasks']]
    .rename(columns={'tasks': 'config'})
    .drop_duplicates()
)

df_summary = pd.merge(df_grouped, config_map, on='run_id').sort_values(by='config')
df_summary[['run_id', *cols, 'total_pipeline_time', 'config']]

Unnamed: 0,run_id,extractors,transformers,loaders,total_pipeline_time,config
1,46e7fd0e-0f28-4e4f-813c-3e444d476c7b,272.7,467.352,263.234,467.352,1-1-1
7,a70bf2db-3afb-4197-b0c9-72bf6d2a7c27,267.874,462.175,153.097,462.175,1-1-2
10,dc3dd13e-933a-4fd1-918b-75e13ff1cb1b,257.064,463.726,93.363,463.726,1-1-3
5,9290d4f5-5dad-4e54-b870-69ef3bff7d12,265.908,460.567,88.989,460.567,1-1-4
9,c2e453ee-5c19-4a7f-8dd7-2020810bd5f8,263.331,464.022,59.778,464.022,1-1-5
3,72d90dd1-eaab-4415-990e-40269e13c5c6,264.287,250.912,266.522,266.522,1-2-1
8,adae183b-f26b-47af-b674-dc3354b18ae1,265.814,231.141,146.577,265.814,1-2-2
2,6729361f-0b1c-499a-97b7-c418ed90e7c8,264.512,228.892,102.216,264.512,1-2-3
6,97441767-9ed6-40e3-80be-679c20758fec,266.669,226.844,85.154,266.669,1-2-4
4,8a1afd80-e110-49bb-9a55-a031af4bbd88,267.961,229.134,67.236,267.961,1-2-5
