In [None]:
import csv
import os

import numpy as np
import pandas as pd
import plotly.express as px

def process_file_all_files(task_id: str) -> None:
    """Aggregate one task's file listing and render it as a treemap.

    Reads ``logs/{task_id}_all_files.csv``, in which every useful line has
    the form ``<integer> <path>`` (whitespace separated), and writes
    ``maps/{task_id}_all_files2.csv`` with the columns ``size``,
    ``filename`` and ``path`` (the parent directory).  One row is written
    per input line, followed by one row per directory carrying the sum of
    all values found beneath it.  The CSV is then loaded with pandas and
    drawn with ``plotly.express.treemap`` into ``maps/{task_id}-map.html``.

    Errors raised while producing the CSV are printed instead of raised; in
    that case no HTML map is generated.

    Args:
        task_id: Prefix identifying which log file to process.
    """

    def write_line(line, memo, writer):
        """Emit the row for one log line and credit every ancestor directory."""
        parts = line.strip().split(maxsplit=1)
        if len(parts) < 2:
            return

        try:
            count = int(parts[0])
        except ValueError:
            # A single malformed line should not discard the whole listing.
            print(f"Skipping malformed line: {line.rstrip()}")
            return

        filename = parts[1]
        curr_path = os.path.split(filename)[0]

        memo[curr_path] = memo.get(curr_path, 0) + count

        # Walk up to the top of the path. For relative paths the walk ends
        # at "" (which receives the grand total). For absolute paths
        # os.path.split("/") returns "/" again, so stop once the parent no
        # longer changes -- otherwise the loop would never terminate.
        ancestor = curr_path
        while ancestor:
            parent = os.path.split(ancestor)[0]
            if parent == ancestor:
                break
            memo[parent] = memo.get(parent, 0) + count
            ancestor = parent

        writer.writerow([count, filename, curr_path])

    def write_summaries(memo, writer):
        """Emit one ``total, directory, parent directory`` row per directory."""
        for path in sorted(memo):
            parent = os.path.split(path)[0] if path else ""
            if parent == path:
                # A filesystem root is its own os.path parent; give it an
                # empty parent so the hierarchy has no self-referencing node.
                parent = ""
            writer.writerow([memo[path], path, parent])

    try:
        # UTF-8 matches what pd.read_csv assumes when reading the file back;
        # newline="" is what the csv module expects for files it writes to.
        with open(f"maps/{task_id}_all_files2.csv", "w", newline="",
                  encoding="utf-8") as r:
            # csv.writer quotes fields containing commas or quotes, so such
            # paths stay in one column and names keep matching parent ids.
            writer = csv.writer(r, lineterminator="\n")
            writer.writerow(["size", "filename", "path"])
            with open(f"logs/{task_id}_all_files.csv", "r") as f:
                memo = {}
                for line in f:
                    # Skip lines mentioning "entity" and the " total" footer.
                    if "entity" in line.strip() or " total" in line:
                        continue
                    write_line(line, memo, writer)

                write_summaries(memo, writer)

    except FileNotFoundError as e:
        print(f"File error: {e}")
        return  # nothing usable to plot
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return  # the CSV may be incomplete; do not plot partial data

    df_all_files = pd.read_csv(f"maps/{task_id}_all_files2.csv")

    # NOTE(review): the "size" column is not passed to the treemap (no
    # `values=`), so tiles are not scaled by the aggregated totals --
    # confirm whether that is intended.
    fig = px.treemap(
        names=df_all_files['filename'],
        parents=df_all_files['path'],
        color=np.zeros(len(df_all_files)),  # constant colour for every tile
        hover_data=[df_all_files['filename']],
        color_continuous_scale='RdBu',
    )

    fig.update_layout(
        uniformtext=dict(minsize=10, mode='hide'),
        margin=dict(t=50, l=25, r=25, b=25),
        autosize=False,
        width=1000,
        height=1000
    )

    fig.write_html(f"maps/{task_id}-map.html")


In [14]:
# Build a treemap for every "<task_id>_all_files.csv" log in the log directory.
log_dir = "logs/"
log_suffix = "_all_files.csv"
for filename in os.listdir(log_dir):
    if not filename.endswith(log_suffix):
        continue
    # Strip only the trailing suffix; str.replace would also remove any
    # earlier occurrence of the same text inside the task id.
    task_id = filename[:-len(log_suffix)]
    process_file_all_files(task_id)