In [None]:
from datetime import datetime
import json
import numpy as np
import pandas as pd

# Display configuration for the rest of the notebook.
# None removes the row limit, so frames are rendered in full.
pd.options.display.max_rows = None

# None removes the column limit, so wide frames are not truncated.
pd.options.display.max_columns = None
# Width 0 is the original author's "auto-detect" setting.
# NOTE(review): pandas documents None for auto-detection -- confirm 0 is intended.
pd.options.display.width = 0

def now():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS,mmm'.

    The part after the comma is the millisecond component, zero-padded to
    three digits (microseconds are truncated, not rounded).
    """
    current = datetime.now()
    millis = current.microsecond // 1000  # whole milliseconds, 0..999
    return f"{current.strftime('%Y-%m-%d %H:%M:%S')},{millis:03d}"

def get_full_shape(df_filename):
    """Load a parquet file and describe its size.

    Parameters
    ----------
    df_filename : path-like
        Parquet file handed to ``pd.read_parquet``.

    Returns
    -------
    tuple
        ``(rows, columns - 1, distinct labels)``. One column is subtracted
        because the target is not counted as a feature; the label count is
        the number of unique values in the ``'label'`` column.

    Raises
    ------
    KeyError
        If the file has no ``'label'`` column.
    """
    frame = pd.read_parquet(df_filename)
    row_count, column_count = frame.shape
    distinct_labels = frame['label'].nunique()
    # The target column is not a feature, hence the "- 1".
    return row_count, column_count - 1, distinct_labels

def get_dtypes(df_filename):
    """Return the set of distinct column dtype names in a parquet file.

    Every column of the loaded frame is considered; each dtype is converted
    to its string form (e.g. ``'int64'``) before de-duplication.
    """
    frame = pd.read_parquet(df_filename)
    return {str(column_dtype) for column_dtype in frame.dtypes}

In [None]:
from pathlib import Path
from time import sleep
from tqdm.notebook import tqdm
import os
import papermill as pm

In [None]:
# Root folder of the input datasets; it is searched recursively below.
base_folder = Path("/home/automl/git/iot-threat-classifier/2025-06-28/Input_Multiclass")

# Per-file metadata, all keyed by the file stem (file name without suffix).
file_sizes = {}  # size on disk in MiB
shapes = {}      # (rows, feature columns, distinct labels) from get_full_shape
dtypes = {}      # set of dtype names from get_dtypes

# Find all .parquet files recursively
parquet_files = list(base_folder.rglob("*.parquet"))

# Sort by file size (ascending) so the smallest files are processed first
parquet_files_sorted = sorted(parquet_files, key=os.path.getsize)

# Iterate.
# leave must be the boolean False: the string 'False' is truthy, so tqdm
# treated it as leave=True and kept the finished bar on screen.
for file in tqdm(parquet_files_sorted, desc='File', leave=False):

    try:
        file_sizes[file.stem] = os.path.getsize(file) / 1024 / 1024
        shapes[file.stem] = get_full_shape(file)
        dtypes[file.stem] = get_dtypes(file)

    except Exception as e:
        # Best effort: log the failure and move on to the next file. Entries
        # written before the failure (e.g. the size) are kept, so the file
        # has missing values in the later dicts.
        tqdm.write(f'[{now()}] ERROR      | FILE = {str(file.stem):<64} | EXCEPTION = {e}')

In [None]:
# Assemble one row per dataset (row labels are the dict keys, i.e. file stems)
# and name the index "filename". Each shapes value is a
# (samples, features, classes) tuple that is split into three columns.
df_raw = pd.DataFrame(
    dict(
        size_mb=file_sizes,
        samples={stem: shape[0] for stem, shape in shapes.items()},
        features={stem: shape[1] for stem, shape in shapes.items()},
        classes={stem: shape[2] for stem, shape in shapes.items()},
        dtypes=dtypes,
    )
).rename_axis("filename")

# input_df is a Styler used for display only; the underlying frame stays
# reachable through input_df.data (and df_raw).
input_df = df_raw.style.format(
    formatter={
        'size_mb': '{:.2f}',    # two decimals
        'samples': '{:,.0f}',   # thousands separator, no decimals
    }
)

input_df

In [None]:
def summarize_results(df_filename, results_filename):
    """Condense one results workbook into a flat summary dict.

    Parameters
    ----------
    df_filename : str
        Identifier stored under ``'filename'`` in the returned dict.
    results_filename : path-like
        Excel workbook handed to ``pd.read_excel``. It must contain the
        columns ``'Unnamed: 0'`` (presumably the unnamed index column holding
        the configuration name -- confirm against the writer),
        ``'X_train_shape'`` and ``'f1_score_abs'``.

    Returns
    -------
    dict
        ``'filename'``, then ``'shape'`` (the ``X_train_shape`` of the first
        row labelled ``'full'``), followed by every column of the row with
        the highest ``f1_score_abs``. In that row ``'Unnamed: 0'`` is renamed
        to ``'best_config'`` and placed last.

    Raises
    ------
    KeyError
        If a required column is missing.
    IndexError
        If no row is labelled ``'full'``.
    """
    df = pd.read_excel(results_filename)

    full_shapes = df.loc[df['Unnamed: 0'] == 'full', 'X_train_shape']
    if full_shapes.empty:
        # Same exception type as the bare [0] lookup, but the message names
        # the offending workbook instead of "index 0 is out of bounds".
        raise IndexError(f"no row labelled 'full' in {results_filename}")
    row_full = full_shapes.iloc[0]

    # Best configuration = row with the highest absolute F1 score.
    row_max = df.loc[df['f1_score_abs'].idxmax()].to_dict()
    # 'Unnamed: 0' is always present here: it was indexed above.
    row_max['best_config'] = row_max.pop('Unnamed: 0')

    summary = {'filename': df_filename, 'shape': row_full}
    summary.update(row_max)
    return summary

In [None]:
# Root folder of the experiment outputs; it is searched recursively below.
base_folder = Path("/home/automl/git/iot-threat-classifier/2025-06-28/Output_Multiclass")

# One summary dict per result workbook that could be parsed.
summary_rows = []

# Find all result workbooks (files matching *table.xlsx) recursively
result_files = list(base_folder.rglob("*table.xlsx"))

# Iterate.
# leave must be the boolean False: the string 'False' is truthy, so tqdm
# treated it as leave=True and kept the finished bar on screen.
for file in tqdm(result_files, desc='File', leave=False):

    try:
        # The workbook's parent folder name (via .stem) identifies the row.
        short_filename = str(file.parent.stem)
        row = summarize_results(short_filename, file)
        summary_rows.append(row)

    except Exception as e:
        # Best effort: log the failure and continue; the workbook simply
        # contributes no row to summary_rows.
        tqdm.write(f'[{now()}] ERROR      | FILE = {str(file.stem):<64} | EXCEPTION = {e}')

In [None]:
# One row per parsed result workbook, indexed by the "filename" key of each
# summary dict.
output_df = pd.DataFrame(data=summary_rows)
output_df = output_df.set_index(keys="filename")

output_df

In [None]:
# Left join on the index: every row of the frame behind the input_df Styler
# is kept, and rows with no match in output_df get NaN in the result columns.
summary_df = input_df.data.join(other=output_df, how="left")

summary_df

In [None]:
# Write the joined summary table to an Excel workbook.
summary_df.to_excel(
    excel_writer="/home/automl/git/iot-threat-classifier/2025-06-28/summary.xlsx"
)

In [None]:
# Show the rows that have at least one missing value in any column.
summary_df.loc[summary_df.isna().any(axis="columns")]

In [None]:
# Write the rows with at least one missing value to a separate workbook.
summary_df.loc[summary_df.isna().any(axis="columns")].to_excel(
    excel_writer="/home/automl/git/iot-threat-classifier/2025-06-28/errors.xlsx"
)