# This converter unifies differing columns, their order, and their value formats, for convenience in plotting and further processing.

In [1]:
import os
import pandas as pd

# Raw "parameters" cell values that normalize_parameters maps to "F".
FALSE_PARAMS = {
    "",
    " ",
    "N/A",
    "{'extra_body:{}'}",
    "NP",
    "F",
}
# Raw "parameters" cell values that normalize_parameters maps to "T".
# The temperature/top_p pair is listed in both element orders.
TRUE_PARAMS = {
    "T",
    "P",
    "{'top_p:0.3', 'temperature:0.1'}",
    "{'temperature:0.1', 'top_p:0.3'}",
}

def get_processor_type(batch_id):
    """Return the processor tag encoded in ``batch_id``.

    The id is converted to a string, stripped, and searched for the markers
    ``_dp``, ``_mc``, ``_tpusg``, ``_psg`` and ``_sg`` in that order. The
    first marker found decides the result, which is the marker without its
    leading underscore.

    Raises:
        ValueError: if none of the markers occurs in the id.
    """
    text = str(batch_id).strip()
    # Order matters only as a tie-break when an id contains several markers.
    for tag in ("dp", "mc", "tpusg", "psg", "sg"):
        if f"_{tag}" in text:
            return tag
    raise ValueError(f"Invalid batch_id '{batch_id}' - must contain _dp_, _mc_, _sg_, _tpusg_, or _psg_")


def get_model_name(batch_id):
    """Return the short model name encoded in ``batch_id``.

    The id is converted to a string, stripped, and tested against a fixed
    list of substrings; the first substring that occurs decides the result.

    Raises:
        ValueError: if no known model substring occurs in the id.
    """
    # (substring to look for, name to return) - checked top to bottom, so
    # "gpt-4o-mini" must stay ahead of its prefix "gpt-4o".
    known_models = (
        ("gpt-5", "gpt-5"),
        ("qwen2.5-coder:32b", "qwen32"),
        ("phi4", "phi4"),
        ("qwen2.5-coder:14b", "qwen14"),
        ("gemma3", "gemma3"),
        ("codestral", "codestral"),
        ("gpt-4o-mini", "gpt-4o-mini"),
        ("g4o-m", "gpt-4o-mini"),
        ("gpt-4o", "gpt-4o"),
    )
    text = str(batch_id).strip()
    for needle, model in known_models:
        if needle in text:
            return model
    raise ValueError(f"Invalid batch_id '{batch_id}', invalid model name.")


def normalize_parameters(value):
    """Collapse a raw ``parameters`` cell to ``"T"`` or ``"F"``.

    Missing values (per ``pd.isna``) count as ``"F"``. Any other value is
    converted to a string, stripped, and looked up in ``FALSE_PARAMS`` and
    then ``TRUE_PARAMS``.

    Raises:
        ValueError: if the stripped text is in neither set.
    """
    if pd.isna(value):
        return "F"

    cleaned = str(value).strip()
    if cleaned not in FALSE_PARAMS and cleaned not in TRUE_PARAMS:
        raise ValueError(f"Unexpected parameters value '{value}'.")
    # FALSE_PARAMS takes precedence, as in a false-then-true lookup.
    return "F" if cleaned in FALSE_PARAMS else "T"


def detect_style(columns):
    """Classify a CSV layout from its column names.

    Returns ``"legacy"`` when ``_source_file`` is present, otherwise
    ``"standard"`` when ``num_run`` or ``tags`` is present, otherwise
    ``"modern"``.
    """
    if "_source_file" in columns:
        style = "legacy"
    elif any(marker in columns for marker in ("num_run", "tags")):
        style = "standard"
    else:
        style = "modern"
    return style


def process_csv_files(directory=None):
    """Convert every CSV in a directory to the unified column layout.

    For each ``*.csv`` file (files already ending in ``_converted.csv`` are
    ignored) the function:

    * derives the ``processor`` and ``model`` columns from ``batch_id``,
    * normalizes ``parameters`` to ``"T"``/``"F"`` (creating the column as
      ``"F"`` when it is absent) and drops a ``parameters.1`` column,
    * drops ``num_run``, ``tags``, ``generation_count``, ``timestamp`` and
      ``_source_file`` when present,
    * moves the known columns to the front in a fixed order, keeping any
      other columns after them,
    * writes the result as ``<name>_converted.csv`` in the same directory.

    Files without a ``batch_id`` column are skipped. An error in one file is
    printed and does not stop the remaining files.

    Args:
        directory: Folder to scan and to write the converted files into.
            Defaults to the current working directory.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    target_order = [
        "name", "trace_id", "batch_id", "status", "latency", "total_tokens", "prompt_tokens",
        "completion_tokens", "total_cost", "prompt_cost", "completion_cost", "parameters", "processor", "model",
    ]

    work_dir = os.getcwd() if directory is None else directory
    # os.listdir returns entries in arbitrary order; sort so that files are
    # processed and reported in a stable order from run to run.
    files = sorted(
        f for f in os.listdir(work_dir)
        if f.endswith(".csv") and not f.endswith("_converted.csv")
    )

    if not files:
        location = "the current directory" if directory is None else work_dir
        print(f"No CSV files found in {location}.")
        return

    print(f"Found {len(files)} files to process...")

    for filename in files:
        try:
            file_path = os.path.join(work_dir, filename)
            df = pd.read_csv(file_path)

            if "batch_id" not in df.columns:
                print(f"Skipping {filename}: Column 'batch_id' not found.")
                continue

            # Only used for the progress message; must be computed before
            # the layout-specific columns are dropped below.
            style = detect_style(df.columns)

            df["processor"] = df["batch_id"].apply(get_processor_type)
            df["model"] = df["batch_id"].apply(get_model_name)

            # Normalize parameters column for modern style expectations
            if "parameters" not in df.columns:
                df["parameters"] = "F"
            df["parameters"] = df["parameters"].apply(normalize_parameters)
            df.drop(columns=["parameters.1"], inplace=True, errors="ignore")
            # Drop legacy/standard-only columns to unify modern output
            drop_cols = ["num_run", "tags", "generation_count", "timestamp", "_source_file"]
            df.drop(columns=drop_cols, inplace=True, errors="ignore")

            # Reorder into modern target order, keeping any extra columns at the end
            existing_target_cols = [col for col in target_order if col in df.columns]
            remaining_cols = [col for col in df.columns if col not in target_order]
            df = df[existing_target_cols + remaining_cols]

            # Save next to the input with a _converted suffix
            base_name, ext = os.path.splitext(filename)
            output_filename = f"{base_name}_converted{ext}"
            output_path = os.path.join(work_dir, output_filename)

            df.to_csv(output_path, index=False)
            print(f"Successfully processed: {filename} (detected {style}) → {output_filename}")

        except ValueError as ve:
            # Raised by the batch_id / parameters validators for bad values.
            print(f"ERROR in {filename}: {ve}")
        except Exception as e:
            # Per-file boundary: report and continue with the next file.
            print(f"General error processing {filename}: {e}")


# Run the conversion only when this file is executed directly, not on import.
if __name__ == "__main__":
    process_csv_files()


Found 2 files to process...
Successfully processed: clean_phi4_2a34_psg_batch.csv (detected standard) → clean_phi4_2a34_psg_batch_converted.csv
Successfully processed: clean_phi4_9caa_psg_batch.csv (detected standard) → clean_phi4_9caa_psg_batch_converted.csv


In [4]:
# Tag each ablation result file with its prompt type and rewrite it in place.
ab1p_file = "ab1p-p-phi4.csv"
df_ab1p = pd.read_csv(ab1p_file).assign(prompt_type="abla-1p")
df_ab1p.to_csv(ab1p_file, index=False)

ab2p_file = "ab2p-p-phi4.csv"
df_ab2p = pd.read_csv(ab2p_file).assign(prompt_type="abla-2p")
df_ab2p.to_csv(ab2p_file, index=False)