In [2]:
! pip install pandas



In [3]:
import pandas as pd
import glob as gb
import os
from io import StringIO
from datetime import datetime

In [4]:
DIR = '**'
paths = gb.glob(DIR, recursive=True)
logs_paths = [p for p in paths if ".log" in p]
logs_paths = [p for p in logs_paths if "dag_id=app_template" in p]
print(len(logs_paths))
logs_paths[:10]

18750


['tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=loaders.load_1/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=transformers.transform_3/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=extractors.extract_5/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=extractors.extract_2/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=transformers.transform_4/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=loaders.load_4/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=loaders.load_3/attempt=1.log',
 'tmp/dag_id=app_template_8/run_id=manual__d47d1448-6a7b-4de4-a85e-2c248759bd65/task_id=transformers.transform_2/attempt=1.log',
 'tmp/

In [5]:
def parse_path(path):
    parts = path.split('/')
    dag_id   = parts[1].split('=')[1]
    run_id   = parts[2].split('=')[1]
    task_id  = parts[3].split('=')[1]
    attempt  = int(os.path.basename(parts[4]).split('=')[1].split('.')[0])
    return dag_id, run_id, task_id, attempt

df_paths = pd.DataFrame([(*parse_path(p), p) for p in logs_paths],
                        columns=['dag_id','run_id','task_id','attempt','path'])

df_latest = df_paths.sort_values('attempt').groupby(
    ['dag_id','run_id','task_id'], as_index=False
).last()

latest_paths = df_latest['path'].tolist()
len(latest_paths)

18750

In [6]:
df_logs = pd.DataFrame()
errors = set()
for path in latest_paths:
    parts = path.split('/')
    dag_id = parts[1].split('_')[-1]
    task_type = parts[3].split('=')[1].split('.')[0]
    run_id = parts[2].split('=')[1].split('__')[1]
    task_id = parts[3][-1]

    with open(path, 'r') as f:
        lines = f.readlines()
        for l in lines:
            if 'failed' in l:
                errors.add(run_id)
        lines = [l for l in lines if '{helpers.py:' in l]
        lines = ''.join(lines)
    try:
        df = pd.read_csv(StringIO(lines), sep=' ', header=None)
    except:
        continue
    df = df[df[4].astype(str).str.contains(r"\b[1-5]-[1-5]-[1-5]\b", regex=True, na=False)]
    col_names = {
        0: 'timestamp',
        4: 'tasks',
        5: 'batch_id',
        6: 'processed_%',
        7: 'seconds'
    }
    df = df[col_names.keys()]
    df.rename(columns=col_names, inplace=True)
    df['dag_id'] = int(dag_id)
    df['task_type'] = task_type
    df['run_id'] = run_id
    df['task_id'] = int(task_id)
    df_logs = pd.concat([df_logs, df], ignore_index=True)
df_logs.reset_index(drop=True, inplace=True)

df_logs['seconds'] = pd.to_numeric(df_logs['seconds'], errors='coerce')
df_logs['processed_%'] = pd.to_numeric(df_logs['processed_%'], errors='coerce')
df_logs['timestamp'] = \
    pd.to_datetime(df_logs['timestamp'].str.strip('[]'), errors='coerce')

df_logs

Unnamed: 0,timestamp,tasks,batch_id,processed_%,seconds,dag_id,task_type,run_id,task_id
0,2025-09-04 12:51:30.350000+00:00,2-4-1,1,1.61,2.059,1,extractors,00b8bb56-e57e-4a4c-9d21-b7fa5ab6fdbb,1
1,2025-09-04 12:51:32.340000+00:00,2-4-1,2,3.23,1.982,1,extractors,00b8bb56-e57e-4a4c-9d21-b7fa5ab6fdbb,1
2,2025-09-04 12:51:34.549000+00:00,2-4-1,3,4.84,2.199,1,extractors,00b8bb56-e57e-4a4c-9d21-b7fa5ab6fdbb,1
3,2025-09-04 12:51:36.174000+00:00,2-4-1,4,6.45,1.616,1,extractors,00b8bb56-e57e-4a4c-9d21-b7fa5ab6fdbb,1
4,2025-09-04 12:51:38.087000+00:00,2-4-1,5,8.06,1.905,1,extractors,00b8bb56-e57e-4a4c-9d21-b7fa5ab6fdbb,1
...,...,...,...,...,...,...,...,...,...
465745,2025-09-04 15:45:26.631000+00:00,5-4-1,27,87.10,3.644,9,transformers,ff94d0bc-1e5e-4cba-8f9e-dfda9e7b9f9c,4
465746,2025-09-04 15:45:30.750000+00:00,5-4-1,28,90.32,4.111,9,transformers,ff94d0bc-1e5e-4cba-8f9e-dfda9e7b9f9c,4
465747,2025-09-04 15:45:34.355000+00:00,5-4-1,29,93.55,3.596,9,transformers,ff94d0bc-1e5e-4cba-8f9e-dfda9e7b9f9c,4
465748,2025-09-04 15:45:38.417000+00:00,5-4-1,30,96.77,4.054,9,transformers,ff94d0bc-1e5e-4cba-8f9e-dfda9e7b9f9c,4


In [7]:
all_tasks = set(df_logs['tasks'])
print(len(all_tasks))
print(all_tasks)

125
{'1-5-5', '4-2-4', '4-5-3', '3-3-1', '3-5-1', '3-1-2', '5-3-1', '3-1-3', '5-2-1', '1-3-3', '2-4-5', '4-5-4', '4-3-2', '4-1-4', '1-4-3', '2-2-1', '1-1-3', '2-3-5', '2-5-4', '3-1-5', '4-2-3', '2-5-5', '3-4-3', '1-1-1', '2-1-5', '4-1-2', '5-2-3', '2-4-2', '2-3-2', '4-3-1', '4-4-3', '5-1-2', '4-2-2', '5-1-4', '5-5-1', '1-3-5', '2-4-4', '2-2-4', '1-1-4', '5-3-3', '2-1-1', '2-1-3', '1-3-2', '2-4-1', '1-2-2', '2-3-4', '3-5-2', '5-2-4', '4-1-3', '2-2-3', '1-2-4', '5-5-5', '4-3-5', '4-2-1', '2-2-2', '1-2-3', '1-4-2', '2-3-1', '1-1-2', '5-4-3', '1-5-2', '3-2-3', '3-4-1', '1-1-5', '4-2-5', '5-1-1', '4-5-2', '3-3-5', '5-3-5', '1-3-1', '3-5-5', '1-2-5', '3-1-1', '3-2-2', '2-5-3', '3-3-3', '1-4-5', '3-5-4', '5-5-3', '4-5-5', '3-4-2', '5-4-4', '4-3-4', '1-4-4', '1-3-4', '1-5-4', '1-4-1', '4-4-5', '5-5-2', '3-2-1', '4-4-1', '3-4-4', '2-1-4', '4-3-3', '3-2-5', '5-2-5', '5-4-5', '2-5-2', '3-2-4', '4-4-2', '2-5-1', '2-3-3', '1-2-1', '3-3-4', '5-5-4', '2-4-3', '3-5-3', '2-2-5', '4-4-4', '4-1-5', '1-5-

In [8]:
all_dags = set(df_logs['dag_id'])
print(len(all_dags))
print(all_dags)

10
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}


In [9]:
all_run = set(df_logs['run_id'])
print(len(all_run))

1250


In [10]:
pd.DataFrame(df_logs[['tasks', 'run_id']].drop_duplicates()['tasks'].value_counts()).sample(20)

Unnamed: 0_level_0,count
tasks,Unnamed: 1_level_1
1-4-5,10
4-5-1,10
1-2-1,10
5-3-2,10
4-5-5,10
2-4-2,10
3-4-3,10
3-4-1,10
2-1-1,10
1-3-4,10


In [11]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
df_logs.to_csv(f"tmp/logs_{timestamp}.csv", index=False)