In [1]:
! pip install pandas



In [2]:
import pandas as pd
import glob as gb
import os
from pprint import pprint
from io import StringIO
from datetime import datetime

In [3]:
DIR = '**'
paths = gb.glob(DIR, recursive=True)
logs_paths = [p for p in paths if ".log" in p]
logs_paths = [p for p in logs_paths if "dag_id=app_template" in p]
print(len(logs_paths))
logs_paths[:10]

13400


['tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=loaders.load_1/attempt=1.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=transformers.transform_3/attempt=1.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=loaders.load_4/attempt=2.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=loaders.load_4/attempt=4.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=loaders.load_4/attempt=3.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=loaders.load_3/attempt=1.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=transformers.transform_2/attempt=1.log',
 'tmp/dag_id=app_template/run_id=manual__096dcecb-3f0c-48ce-b18a-750bd40f9488/task_id=transformers.transform_1/attempt=1.log',
 'tmp/dag_id=app_template/run_id=m

In [4]:
def parse_path(path):
    parts = path.split('/')
    dag_id   = parts[1].split('=')[1]
    run_id   = parts[2].split('=')[1]
    task_id  = parts[3].split('=')[1]
    attempt  = int(os.path.basename(parts[4]).split('=')[1].split('.')[0])
    return dag_id, run_id, task_id, attempt

df_paths = pd.DataFrame([(*parse_path(p), p) for p in logs_paths],
                        columns=['dag_id','run_id','task_id','attempt','path'])

df_latest = df_paths.sort_values('attempt').groupby(
    ['dag_id','run_id','task_id'], as_index=False
).last()

latest_paths = df_latest['path'].tolist()
len(latest_paths)

9602

In [5]:
df_logs = pd.DataFrame()
errors = set()
for path in latest_paths:

    task_type = path.split('/')[3].split('=')[1].split('.')[0]
    run_id = path.split('/')[2].split('=')[1].split('__')[1]
    task_id = path.split('/')[3][-1]

    with open(path, 'r') as f:
        lines = f.readlines()
        for l in lines:
            if 'failed' in l:
                errors.add(run_id)
        lines = [l for l in lines if '{helpers.py:' in l]
        lines = ''.join(lines)
    try:
        df = pd.read_csv(StringIO(lines), sep=' ', header=None)
    except:
        continue
    df = df[df[4].astype(str).str.contains(r"\b[1-5]-[1-5]-[1-5]\b", regex=True, na=False)]
    col_names = {
        0: 'timestamp',
        4: 'tasks',
        5: 'batch_id',
        6: 'processed_%',
        7: 'seconds'
    }
    df = df[col_names.keys()]
    df.rename(columns=col_names, inplace=True)
    df['task_type'] = task_type
    df['run_id'] = run_id
    df['task_id'] = task_id
    df_logs = pd.concat([df_logs, df], ignore_index=True)
df_logs.reset_index(drop=True, inplace=True)

df_logs['seconds'] = pd.to_numeric(df_logs['seconds'], errors='coerce')
df_logs['processed_%'] = pd.to_numeric(df_logs['processed_%'], errors='coerce')
df_logs['timestamp'] = \
    pd.to_datetime(df_logs['timestamp'].str.strip('[]'), errors='coerce')

df_logs

Unnamed: 0,timestamp,tasks,batch_id,processed_%,seconds,task_type,run_id,task_id
0,2025-09-02 20:47:18.223000+00:00,3-3-5,1,2.44,2.267,extractors,00006511-0400-4468-99a5-63bbdafa7678,1
1,2025-09-02 20:47:19.821000+00:00,3-3-5,2,4.88,1.590,extractors,00006511-0400-4468-99a5-63bbdafa7678,1
2,2025-09-02 20:47:21.908000+00:00,3-3-5,3,7.32,2.079,extractors,00006511-0400-4468-99a5-63bbdafa7678,1
3,2025-09-02 20:47:23.706000+00:00,3-3-5,4,9.76,1.789,extractors,00006511-0400-4468-99a5-63bbdafa7678,1
4,2025-09-02 20:47:25.898000+00:00,3-3-5,5,12.20,2.183,extractors,00006511-0400-4468-99a5-63bbdafa7678,1
...,...,...,...,...,...,...,...,...
395389,2025-08-27 11:09:11.140000+00:00,5-3-1,24,58.54,3.840,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,3
395390,2025-08-27 11:09:14.137000+00:00,5-3-1,25,60.98,2.988,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,3
395391,2025-08-27 11:09:17.588000+00:00,5-2-2,26,63.41,3.443,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,3
395392,2025-08-27 11:09:20.694000+00:00,5-2-2,27,65.85,3.098,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,3


In [6]:
df_complete_100 = df_logs.query("`processed_%` == 100")
df_complete_100 = df_complete_100[['run_id', 'task_type', 'tasks', 'task_id']]
df_complete_100.reset_index(drop=True, inplace=True)
df_complete_100

Unnamed: 0,run_id,task_type,tasks,task_id
0,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,1
1,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,2
2,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,3
3,00006511-0400-4468-99a5-63bbdafa7678,loaders,3-3-5,1
4,00006511-0400-4468-99a5-63bbdafa7678,loaders,3-3-5,2
...,...,...,...,...
8661,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,extractors,5-2-2,4
8662,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,extractors,5-2-2,5
8663,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,loaders,5-3-1,1
8664,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,transformers,5-2-2,1


In [7]:
df_complete_ref = df_complete_100.copy()
ref = {
    'extractors': 0,
    'transformers': 1,
    'loaders': 2
    }
def create_etl_reference(task_type):
    idx = ref.get(task_type)
    df_complete_ref[task_type] = df_complete_ref['tasks'].str.split('-').apply(lambda x: x[idx])
for task in ref:
    create_etl_reference(task)
df_complete_ref

Unnamed: 0,run_id,task_type,tasks,task_id,extractors,transformers,loaders
0,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,1,3,4,2
1,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,2,3,4,2
2,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,3,3,4,2
3,00006511-0400-4468-99a5-63bbdafa7678,loaders,3-3-5,1,3,3,5
4,00006511-0400-4468-99a5-63bbdafa7678,loaders,3-3-5,2,3,3,5
...,...,...,...,...,...,...,...
8661,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,extractors,5-2-2,4,5,2,2
8662,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,extractors,5-2-2,5,5,2,2
8663,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,loaders,5-3-1,1,5,3,1
8664,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,transformers,5-2-2,1,5,2,2


In [8]:
df_complete_max = df_complete_ref.groupby(
    ['run_id', 'task_type', 'tasks', 'extractors', 'transformers', 'loaders'], 
    as_index=False).agg({'task_id': 'max'})
df_complete_max

Unnamed: 0,run_id,task_type,tasks,extractors,transformers,loaders,task_id
0,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,3,4,2,3
1,00006511-0400-4468-99a5-63bbdafa7678,loaders,3-3-5,3,3,5,5
2,00006511-0400-4468-99a5-63bbdafa7678,loaders,3-4-2,3,4,2,4
3,00006511-0400-4468-99a5-63bbdafa7678,transformers,3-4-2,3,4,2,3
4,00e6ef5d-525f-439b-b9d3-c6e3eda0a357,extractors,4-3-5,4,3,5,4
...,...,...,...,...,...,...,...
3509,ffa23643-2105-4886-a3ee-053ed9168b4f,transformers,1-5-4,1,5,4,5
3510,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,extractors,5-2-2,5,2,2,5
3511,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,extractors,5-3-1,5,3,1,1
3512,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f,loaders,5-3-1,5,3,1,1


In [9]:
df_complete = pd.DataFrame()
for task_type in ref:
    df_task_type = df_complete_max.copy()[df_complete_max['task_type'] == task_type]
    df_task_type = df_task_type.copy()[df_task_type[task_type] == df_task_type['task_id']]
    df_complete = pd.concat([df_complete, df_task_type], ignore_index=True)
df_complete.reset_index(drop=True, inplace=True)
df_complete = df_complete[[ 'run_id', 'task_type', 'tasks', 'task_id']]
df_complete

Unnamed: 0,run_id,task_type,tasks,task_id
0,00006511-0400-4468-99a5-63bbdafa7678,extractors,3-4-2,3
1,00e6ef5d-525f-439b-b9d3-c6e3eda0a357,extractors,4-3-5,4
2,00f8ef9d-069c-4f14-862e-3ad6a340e023,extractors,3-4-5,3
3,010fb528-1c13-45bf-8952-0fecdba10e5d,extractors,1-3-1,1
4,01d97b38-19db-40d4-bfcb-2c6a6b7cb37f,extractors,2-4-4,2
...,...,...,...,...
2637,ff8ac8bb-8d81-4c5c-8374-ce88da8d07a2,loaders,1-1-2,2
2638,ff8ac8bb-8d81-4c5c-8374-ce88da8d07a2,loaders,1-1-3,3
2639,ff91b30f-7eec-4369-9d4b-dbef484386bf,loaders,2-2-2,2
2640,ffa23643-2105-4886-a3ee-053ed9168b4f,loaders,1-5-4,4


In [10]:
df_logs_filtered = df_logs.merge(
    df_complete, 
    on=df_complete.columns.tolist(), 
    how='outer', 
    indicator=True)
df_logs_filtered = df_logs_filtered[df_logs_filtered['_merge'] == 'both']
df_logs_filtered.drop(['task_id', '_merge'], axis=1, inplace=True)
df_logs_filtered

Unnamed: 0,timestamp,tasks,batch_id,processed_%,seconds,task_type,run_id
111,2025-09-02 20:48:20.495000+00:00,3-4-2,30,73.17,2.460,extractors,00006511-0400-4468-99a5-63bbdafa7678
112,2025-09-02 20:48:22.398000+00:00,3-4-2,31,75.61,1.895,extractors,00006511-0400-4468-99a5-63bbdafa7678
113,2025-09-02 20:48:24.271000+00:00,3-4-2,32,78.05,1.865,extractors,00006511-0400-4468-99a5-63bbdafa7678
114,2025-09-02 20:48:26.936000+00:00,3-4-2,33,80.49,2.656,extractors,00006511-0400-4468-99a5-63bbdafa7678
115,2025-09-02 20:48:29.450000+00:00,3-4-2,34,82.93,2.507,extractors,00006511-0400-4468-99a5-63bbdafa7678
...,...,...,...,...,...,...,...
395268,2025-08-27 11:06:09.724000+00:00,5-2-2,11,17.74,4.249,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f
395269,2025-08-27 11:06:13.648000+00:00,5-2-2,12,19.35,3.915,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f
395270,2025-08-27 11:06:17.147000+00:00,5-2-2,13,20.97,3.491,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f
395271,2025-08-27 11:06:20.533000+00:00,5-2-2,14,22.58,3.378,transformers,ffbef2ee-a180-4c35-8fff-3d5f3c46cf2f


In [16]:
pd.DataFrame(df_logs_filtered[['tasks', 'run_id']].drop_duplicates()['tasks'].value_counts()).sample(20)

Unnamed: 0_level_0,count
tasks,Unnamed: 1_level_1
2-3-3,9
5-3-4,3
4-2-5,8
4-5-2,6
3-2-4,16
2-4-2,12
3-1-4,11
4-2-3,6
3-4-2,16
1-2-3,11


In [11]:
# timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# df_logs_filtered.to_csv(f"tmp/logs_{timestamp}.csv", index=False)