In [1]:
import os
import pandas as pd
import re

In [20]:
# Feature-group name -> number of feature columns that group contributes
feature_group_sizes = dict(
    FP=1024,
    ExtFP=1024,
    EStateFP=79,
    GraphFP=1024,
    MACCSFP=166,
    PubchemFP=881,
    SubFP=307,
    KRFP=4860,
    AD2D=780,
    SubFPC=307,
    KRFPC=4860,
    APC2D=780,
)

# Column order of the final report
columns = [
    # per-group feature counts (0 when the group was dropped)
    "FP", "ExtFP", "EStateFP", "GraphFP", "MACCSFP", "PubchemFP",
    "SubFP", "SubFPC", "KRFP", "KRFPC", "AD2D", "APC2D",
    # sum of the per-group counts above
    "Total features",
    # metrics parsed from each log file
    "Train Logloss", "Validation Logloss", "Test Logloss",
    "Train AUC", "Validation AUC", "Test AUC",
    # iteration number taken from the log directory name
    "Iteration",
]


# Function to extract feature group sizes based on drop list
def extract_feature_counts(drop_list, group_sizes=None):
    """Return the per-group feature counts that remain after dropping groups.

    Parameters
    ----------
    drop_list : str or iterable of str
        Names of the feature groups that were dropped. A bare string is
        treated as a single group name. Names that are not keys of
        ``group_sizes`` (including the empty string) have no effect.
    group_sizes : dict, optional
        Mapping of group name -> number of features. Defaults to the
        module-level ``feature_group_sizes``.

    Returns
    -------
    dict
        One entry per group in ``group_sizes`` (same order), holding the
        group's size, or 0 if the group was dropped, followed by a
        ``"Total features"`` entry with the sum of those values.
    """
    if group_sizes is None:
        group_sizes = feature_group_sizes

    # A plain string would otherwise be matched character by character.
    if isinstance(drop_list, str):
        drop_list = [drop_list]

    # Materialise once: repeated `in` tests would exhaust a one-shot iterable
    # (generator / iterator) after the first few groups.
    dropped = set(drop_list)

    counts = {
        group: 0 if group in dropped else size
        for group, size in group_sizes.items()
    }
    counts["Total features"] = sum(counts.values())
    return counts

In [21]:
# Function to extract metrics from log files
def extract_metrics(log_file):
    """Parse the train/validation/test Logloss and AUC values from a log file.

    A metric is recognised anywhere in a line as ``"<Split> <Metric>: <number>"``
    where ``<Split>`` is ``Train``, ``Validation`` or ``Test`` and ``<Metric>``
    is ``Logloss`` or ``AUC``. Text before the metric name (e.g. a logger
    prefix) and text after the number are ignored, and several metrics on one
    line are all picked up.

    Parameters
    ----------
    log_file : str or path-like
        Path of the log file to read.

    Returns
    -------
    dict
        Maps metric name (e.g. ``"Test AUC"``) to its value as ``float``.
        Metrics that never appear with a parsable number are absent from the
        dict. If a metric appears several times, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    metric_pattern = re.compile(
        r"(Train|Validation|Test) (Logloss|AUC)\s*:\s*"
        # plain / scientific-notation numbers, plus the nan / inf spellings
        # that float() accepts
        r"([-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf))"
    )

    metrics = {}
    with open(log_file, "r") as file:
        for line in file:
            # finditer (rather than splitting on ": ") keeps parsing correct
            # when the line carries a prefix that itself contains ": ".
            for match in metric_pattern.finditer(line):
                split_name, metric_name, value = match.groups()
                metrics[f"{split_name} {metric_name}"] = float(value)
    return metrics

In [27]:
# Directory where logs are stored (modify this based on your setup)
log_base_dir = "logs"

# Initialize a list to store the report data
report_data = []

# Traverse the directory structure to find log files.
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# row order in the report.
for iteration_dir in sorted(os.listdir(log_base_dir)):
    iteration_path = os.path.join(log_base_dir, iteration_dir)

    # Match on the entry name only (not the full path, which also contains
    # log_base_dir) and capture every digit so "iteration_10" is not read as "1".
    iteration_match = re.search(r"iteration_(\d+)", iteration_dir)

    # Skip stray files and unrelated folders (e.g. .ipynb_checkpoints)
    # instead of failing on them.
    if iteration_match is None or not os.path.isdir(iteration_path):
        continue

    # Stored as the matched digit string
    n_iteration = iteration_match.group(1)

    for log_file in sorted(os.listdir(iteration_path)):
        # Expected name: xgboost_drop_<GROUP>_<GROUP>_....txt
        # The dot is escaped and the whole name must match, so files with
        # the right prefix but another extension are skipped.
        drop_match = re.fullmatch(r"xgboost_drop_(.*)\.txt", log_file)
        if drop_match is None:
            continue

        log_file_path = os.path.join(iteration_path, log_file)

        # Extract the list of dropped features from the filename
        drop_list = drop_match.group(1).split("_")

        # Extract feature counts based on the dropped features
        feature_counts = extract_feature_counts(drop_list)

        # Extract metrics from the log file
        metrics = extract_metrics(log_file_path)

        # Combine feature counts and metrics into a single dictionary
        report_row = {**feature_counts, **metrics, "Iteration": n_iteration}

        # Append the row to the report data list
        report_data.append(report_row)

In [32]:
# Build the report table: one row per parsed log file, columns in report order
report_df = pd.DataFrame(data=report_data, columns=columns)

# Preview the first five rows, transposed so the many columns read vertically
report_df.iloc[:5].transpose()

Unnamed: 0,0,1,2,3,4
FP,1024.0,1024.0,1024.0,1024.0,1024.0
ExtFP,1024.0,1024.0,1024.0,1024.0,1024.0
EStateFP,79.0,79.0,79.0,79.0,79.0
GraphFP,1024.0,1024.0,1024.0,1024.0,1024.0
MACCSFP,166.0,166.0,166.0,166.0,166.0
PubchemFP,881.0,881.0,881.0,881.0,881.0
SubFP,307.0,307.0,307.0,307.0,307.0
SubFPC,307.0,307.0,307.0,307.0,307.0
KRFP,4860.0,4860.0,4860.0,4860.0,4860.0
KRFPC,4860.0,4860.0,4860.0,4860.0,4860.0


In [33]:
# Number of report rows (parsed log files) per iteration
report_df.loc[:, "Iteration"].value_counts()

Iteration
0    56
1    56
2    55
3    55
4    55
Name: count, dtype: int64

In [34]:
# Summary statistics of the test-set AUC across all report rows
report_df.loc[:, "Test AUC"].describe()

count    277.000000
mean       0.898798
std        0.008533
min        0.874556
25%        0.893213
50%        0.900035
75%        0.904629
max        0.919109
Name: Test AUC, dtype: float64

In [35]:
# Write the report to disk as CSV, without the DataFrame's row index
report_df.to_csv(path_or_buf="final_report.csv", index=False)