In [7]:
# Read through the CSV files under cwd whose names start with "clean", then reorder their columns as follows:
#_source_file,batch_id,completion_cost,completion_tokens,latency,name,parameters,prompt_cost,prompt_tokens,status,total_cost,total_tokens,trace_id,processor,model
# If a column name occurs in this list but not in the actual CSV file, ignore it. If a column appears in the CSV but not in this list, move it to the end of the columns.

In [None]:

import os
import pandas as pd

def get_processor_type(batch_id):
    """
    Map a batch identifier to its processor code.

    The identifier is converted to text first, so NaN or numeric values are
    searched as strings instead of triggering a type error. Tags are tried in
    a fixed priority order and the first one found decides the result.

    Returns "dp", "mc" or "sg".
    Raises ValueError when none of the known tags occurs in the identifier.
    """
    text = str(batch_id)

    # (tag, processor) pairs in priority order; every *sg variant maps to "sg".
    tag_to_processor = (
        ("_dp", "dp"),
        ("_mc", "mc"),
        ("_psg", "sg"),
        ("_tpusg", "sg"),
        ("_sg", "sg"),
    )
    for tag, processor in tag_to_processor:
        if tag in text:
            return processor

    raise ValueError(f"Invalid batch_id '{batch_id}' - must contain _dp_, _mc_, _sg_, _tpusg_, or _psg_")


def get_model_name(batch_id):
    """
    Determine the short model label encoded in a batch identifier.

    The identifier is converted to text before searching, mirroring
    get_processor_type, so that NaN, None or numeric values produce the
    documented ValueError instead of a TypeError from the ``in`` operator.
    Model markers are tried in a fixed order and the first match wins.

    Returns one of "gpt-5", "qw32", "phi4", "qw14", "gemma3", "codestral".
    Raises ValueError when no known model marker occurs in the identifier.
    """
    # Coerce once so the substring tests below never see a non-string.
    s_batch_id = str(batch_id)

    # (marker, label) pairs in the order they are checked.
    marker_to_label = (
        ("gpt-5", "gpt-5"),
        ("qwen2.5-coder:32b", "qw32"),
        ("phi4", "phi4"),
        ("qwen2.5-coder:14b", "qw14"),
        ("gemma3", "gemma3"),
        ("codestral", "codestral"),
    )
    for marker, label in marker_to_label:
        if marker in s_batch_id:
            return label

    raise ValueError(f"Invalid batch_id '{batch_id}', invalid model name.")
    
def process_csv_files():
    """
    Annotate and reorder every ``run_level*.csv`` file in the current directory.

    For each matching file the function:
      1. adds a 'processor' column derived from 'batch_id' (get_processor_type),
      2. adds a 'model' column derived from 'batch_id' (get_model_name) and
         sets '_source_file' to the literal string 'none',
      3. drops the 'num_run', 'tags', 'generation_count' and 'timestamp'
         columns when they are present,
      4. reorders the columns: those named in the target order come first (in
         that order, skipping any the file lacks), all other columns follow in
         their existing order,
      5. writes the result back over the original file.

    Files without a 'batch_id' column are skipped. Any error for a file is
    printed and processing continues with the next file; a file whose
    processing fails before the save step is not rewritten.

    Returns None.
    """
    # Preferred leading column order; columns missing from a file are ignored.
    target_order = [
        "name", "trace_id", "batch_id", "status", "latency",
        "total_tokens", "prompt_tokens", "completion_tokens",
        "total_cost", "prompt_cost", "completion_cost",
        "parameters", "processor", "model",
    ]

    # Single source of truth for the filter and the message that reports on it.
    file_prefix = "run_level"

    cwd = os.getcwd()
    files = [f for f in os.listdir(cwd) if f.startswith(file_prefix) and f.endswith(".csv")]

    if not files:
        print(f"No files found starting with '{file_prefix}' in the current directory.")
        return

    print(f"Found {len(files)} files to process...")

    for filename in files:
        try:
            file_path = os.path.join(cwd, filename)
            df = pd.read_csv(file_path)

            # --- 1. Add 'processor' column ---
            if 'batch_id' not in df.columns:
                print(f"Skipping {filename}: Column 'batch_id' not found.")
                continue

            # Applied row by row; raises ValueError if a batch_id has no known tag.
            df['processor'] = df['batch_id'].apply(get_processor_type)

            # --- 2. Add 'model' and '_source_file' columns ---
            df['model'] = df['batch_id'].apply(get_model_name)
            df['_source_file'] = 'none'

            # --- 3. Drop bookkeeping columns (absent ones are ignored) ---
            df.drop(columns=["num_run", "tags", "generation_count", "timestamp"], inplace=True, errors='ignore')

            # --- 4. Reorder columns ---
            # Target columns that actually exist in this file, in target order.
            existing_target_cols = [col for col in target_order if col in df.columns]

            # Everything else (e.g. '_source_file') keeps its relative order at the end.
            remaining_cols = [col for col in df.columns if col not in target_order]

            df = df[existing_target_cols + remaining_cols]

            # --- 5. Save in place ---
            df.to_csv(file_path, index=False)
            print(f"Successfully processed: {filename}")

        except ValueError as ve:
            # Raised by the helpers when a batch_id lacks a processor tag or model marker.
            print(f"ERROR in {filename}: {ve}")
        except Exception as e:
            # Per-file boundary: report and move on so one bad file does not stop the batch.
            print(f"General error processing {filename}: {e}")
        
# Entry point: run the CSV processing only when this module is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    process_csv_files()


Found 1 files to process...
Successfully processed: run_level.csv
