In [67]:
import os
import pandas as pd
import re

In [68]:
# Number of feature columns contributed by each fingerprint group.
# Insertion order is kept exactly as before because code iterating this
# mapping sees the groups in this order.
feature_group_sizes = dict(
    FP=1024,
    ExtFP=1024,
    EStateFP=79,
    GraphFP=1024,
    MACCSFP=166,
    PubchemFP=881,
    SubFP=307,
    KRFP=4860,
    AD2D=780,
    SubFPC=307,
    KRFPC=4860,
    APC2D=780,
)

# Column layout of the final report: per-group feature counts, the total,
# the six evaluation metrics, then the run identifiers.
columns = (
    ["FP", "ExtFP", "EStateFP", "GraphFP", "MACCSFP", "PubchemFP"]
    + ["SubFP", "SubFPC", "KRFP", "KRFPC", "AD2D", "APC2D"]
    + ["Total features"]
    + ["Train Logloss", "Validation Logloss", "Test Logloss"]
    + ["Train AUC", "Validation AUC", "Test AUC"]
    + ["Iteration", "Drop"]
)


# Function to extract feature group sizes based on drop list
def extract_feature_counts(drop_list):
    """Return the per-group feature counts that remain after dropping groups.

    Args:
        drop_list: container of group names to treat as dropped. Membership
            is tested with ``in``, so for a list of strings the comparison is
            exact and case-sensitive (e.g. "EstateFP" does not match the
            "EStateFP" group). Names that are not keys of
            ``feature_group_sizes`` have no effect.

    Returns:
        dict with one entry per key of ``feature_group_sizes`` (0 for dropped
        groups, the configured size otherwise) plus a "Total features" entry
        holding the sum of those counts.
    """
    counts = {}
    for group, size in feature_group_sizes.items():
        counts[group] = 0 if group in drop_list else size
    # The total is computed before its own key is inserted, so it only sums
    # the per-group counts.
    counts["Total features"] = sum(counts.values())
    return counts

In [69]:
# Function to extract metrics from log files
def extract_metrics(log_file):
    """Parse the evaluation metrics out of a training log.

    Each line is checked for one of the six metric labels; when a label is
    found, the text after the first ": " on that line is converted to float
    and stored under that label. If a label occurs on several lines, the
    last occurrence wins.

    Args:
        log_file: path of the log file to read.

    Returns:
        dict mapping the metric labels found in the file to float values;
        labels that never occur are absent.
    """
    # Order matters: a line is attributed to the first label it contains.
    metric_names = (
        "Train Logloss",
        "Validation Logloss",
        "Test Logloss",
        "Train AUC",
        "Validation AUC",
        "Test AUC",
    )
    metrics = {}
    with open(log_file, "r") as file:
        for line in file:
            matched = next((name for name in metric_names if name in line), None)
            if matched is not None:
                metrics[matched] = float(line.split(": ")[1].strip())
    return metrics

In [70]:
# Directory where logs are stored (modify this based on your setup)
log_base_dir = "logs"

# Initialize a list to store the report data (one dict per parsed log file)
report_data = []

# Traverse the directory structure to find log files.
# Listings are sorted because os.listdir returns entries in arbitrary order,
# which would make the row order of the report differ between runs.
for iteration_dir in sorted(os.listdir(log_base_dir)):
    iteration_path = os.path.join(log_base_dir, iteration_dir)

    # Only folders whose name contains "iteration_<N>" are treated as runs;
    # anything else (stray files, ".ipynb_checkpoints", ...) is skipped
    # instead of raising IndexError. "\d+" keeps every digit, so
    # "iteration_10" is not read as iteration "1".
    iteration_match = re.search(r"iteration_(\d+)", iteration_dir)
    if iteration_match is None or not os.path.isdir(iteration_path):
        continue
    n_iteration = iteration_match.group(1)  # stays a string, as before

    for log_file in sorted(os.listdir(iteration_path)):
        # The whole filename must be "xgboost_drop_<groups>.txt". The dot is
        # escaped so it only matches the literal "." of the extension, and
        # files with the prefix but another extension are skipped.
        drop_match = re.fullmatch(r"xgboost_drop_(.*)\.txt", log_file)
        if drop_match is None:
            continue

        log_file_path = os.path.join(iteration_path, log_file)

        # Extract the list of dropped features from the filename
        drop_name = drop_match.group(1)
        drop_list = drop_name.split("_")

        # Extract feature counts based on the dropped features
        feature_counts = extract_feature_counts(drop_list)

        # Extract metrics from the log file
        metrics = extract_metrics(log_file_path)

        # Combine feature counts and metrics into a single dictionary
        report_row = {
            **feature_counts,
            **metrics,
            "Iteration": n_iteration,
            "Drop": drop_name,
        }

        # Append the row to the report data list
        report_data.append(report_row)

In [71]:
# Build the report table from the collected rows; `columns` fixes both the
# column order and which keys of each row are kept.
report_df = pd.DataFrame(data=report_data, columns=columns)
print(report_df.shape)
# Show the first five rows transposed so every column is visible as a line
report_df.iloc[:5].transpose()

(4595, 21)


Unnamed: 0,0,1,2,3,4
FP,1024,1024,1024,1024,1024
ExtFP,1024,1024,1024,1024,1024
EStateFP,79,79,79,79,79
GraphFP,1024,1024,1024,1024,1024
MACCSFP,166,166,166,166,166
PubchemFP,881,881,881,881,881
SubFP,307,307,307,307,307
SubFPC,307,307,307,307,307
KRFP,4860,4860,4860,4860,4860
KRFPC,4860,4860,4860,4860,4860


In [72]:
# Number of report rows recorded for each value of the "Iteration" column
report_df["Iteration"].value_counts()

Iteration
0    919
1    919
2    919
3    919
4    919
Name: count, dtype: int64

In [73]:
# Summary statistics of the "Test AUC" column over all report rows
report_df["Test AUC"].describe()

count    4595.000000
mean        0.892084
std         0.013603
min         0.821928
25%         0.882840
50%         0.894883
75%         0.902332
max         0.922102
Name: Test AUC, dtype: float64

In [74]:
# Save the final report to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
report_df.to_csv("final_report.csv", index=False)

In [75]:
# Rows whose test AUC equals the overall maximum (ties are all returned)
report_df.loc[report_df["Test AUC"].eq(report_df["Test AUC"].max())]

Unnamed: 0,FP,ExtFP,EStateFP,GraphFP,MACCSFP,PubchemFP,SubFP,SubFPC,KRFP,KRFPC,...,APC2D,Total features,Train Logloss,Validation Logloss,Test Logloss,Train AUC,Validation AUC,Test AUC,Iteration,Drop
3748,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.34534,0.540249,0.45324,0.978163,0.794036,0.922102,4,EstateFP_MACCSFP_PubchemFP_SubFPC
4475,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.34534,0.540249,0.45324,0.978163,0.794036,0.922102,4,MACCSFP_PubchemFP_SubFPC


In [76]:
# All report rows of one specific drop configuration
report_df.loc[report_df["Drop"].eq("MACCSFP_PubchemFP_SubFPC")]

Unnamed: 0,FP,ExtFP,EStateFP,GraphFP,MACCSFP,PubchemFP,SubFP,SubFPC,KRFP,KRFPC,...,APC2D,Total features,Train Logloss,Validation Logloss,Test Logloss,Train AUC,Validation AUC,Test AUC,Iteration,Drop
799,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.304485,0.549485,0.445929,0.983813,0.778995,0.902959,0,MACCSFP_PubchemFP_SubFPC
1718,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.416971,0.553927,0.496276,0.959378,0.78546,0.892447,1,MACCSFP_PubchemFP_SubFPC
2637,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.30784,0.557995,0.457284,0.983583,0.770814,0.894953,2,MACCSFP_PubchemFP_SubFPC
3556,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.413209,0.552667,0.496125,0.960025,0.784734,0.893422,3,MACCSFP_PubchemFP_SubFPC
4475,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.34534,0.540249,0.45324,0.978163,0.794036,0.922102,4,MACCSFP_PubchemFP_SubFPC


In [27]:
# Rank drop configurations by their mean test AUC over all rows sharing the
# same "Drop" value and show the ten best, together with the spread of the
# AUC and the largest feature count seen for that configuration.
(
    report_df.groupby("Drop")
    .agg({"Test AUC": ["mean", "std"], "Total features": "max"})
    .sort_values(by=("Test AUC", "mean"), ascending=False)
    .iloc[:10]
)

Unnamed: 0_level_0,Test AUC,Test AUC,Total features
Unnamed: 0_level_1,mean,std,max
Drop,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ExtFP_GraphFP_AD2D_APC2D,0.913066,0.001653,12484
ExtFP_GraphFP_SubFP_APC2D,0.912746,0.002717,12957
ExtFP_EstateFP,0.910574,0.004832,15068
ExtFP,0.910574,0.004832,15068
ExtFP_EstateFP_AD2D_APC2D,0.910003,0.004388,13508
ExtFP_AD2D_APC2D,0.910003,0.004388,13508
ExtFP_GraphFP_MACCSFP_APC2D,0.909822,0.00707,13098
ExtFP_GraphFP_PubchemFP_APC2D,0.909307,0.004022,12383
ExtFP_GraphFP_MACCSFP_SubFP,0.909279,0.005636,13571
ExtFP_SubFP_SubFPC,0.909252,0.003478,14454


In [77]:
import os
import pandas as pd

# Define the path to the feat_imp directory
feat_imp_dir = "feat_imp"

# Only these drop configurations are loaded; every other file is skipped
selected_drops = {
    "MACCSFP_PubchemFP_SubFPC",
    "ExtFP_GraphFP_AD2D_APC2D",
    "ExtFP_GraphFP_SubFP_APC2D",
    "ExtFP_EstateFP",
}

# Fixed parts of the file names: xgboost_drop_<dropped groups>.csv
file_prefix = "xgboost_drop_"
file_suffix = ".csv"

# Initialize an empty list to hold the parsed data
feature_importance_data = []

# Loop through each iteration directory. Listings are sorted because
# os.listdir returns entries in arbitrary order, which would make the row
# order of the result differ between runs.
for iteration in sorted(os.listdir(feat_imp_dir)):
    iteration_path = os.path.join(feat_imp_dir, iteration)

    if not os.path.isdir(iteration_path):  # Skip anything that is not a directory
        continue

    # Loop through each file in the iteration folder
    for filename in sorted(os.listdir(iteration_path)):
        if not (filename.startswith(file_prefix) and filename.endswith(file_suffix)):
            continue

        # Extract the features being dropped from the filename. Slicing
        # removes the prefix/suffix only at the ends, whereas str.replace
        # would also remove them if they occurred inside the name.
        features_dropped = filename[len(file_prefix):-len(file_suffix)]

        # only select group of drop features
        if features_dropped not in selected_drops:
            continue

        file_path = os.path.join(iteration_path, filename)

        # Read the CSV file and tag every row with its iteration folder and
        # drop configuration (new columns are appended on the right)
        df = pd.read_csv(file_path).assign(
            Iteration=iteration,
            Dropped_Features=features_dropped,
        )

        # Append the dataframe to the list
        feature_importance_data.append(df)

# pd.concat raises an unspecific "No objects to concatenate" on an empty
# list; fail with a message that says what was actually missing.
if not feature_importance_data:
    raise ValueError(
        f"No feature-importance CSV files for the selected drop groups were found under {feat_imp_dir!r}"
    )

# Concatenate all dataframes into one
feature_importance_df = pd.concat(feature_importance_data, ignore_index=True)

# Display the dataframe
print(feature_importance_df)

                Feature  Importance    Iteration         Dropped_Features
0                   FP1     0.00000  iteration_0                     AD2D
1                   FP2     0.00032  iteration_0                     AD2D
2                   FP3     0.00000  iteration_0                     AD2D
3                   FP4     0.00000  iteration_0                     AD2D
4                   FP5     0.00000  iteration_0                     AD2D
...                 ...         ...          ...                      ...
46202585   APC2D10_B_Si     0.00000  iteration_4  SubFP_SubFPC_KRFP_KRFPC
46202586    APC2D10_B_X     0.00000  iteration_4  SubFP_SubFPC_KRFP_KRFPC
46202587  APC2D10_Si_Si     0.00000  iteration_4  SubFP_SubFPC_KRFP_KRFPC
46202588   APC2D10_Si_X     0.00000  iteration_4  SubFP_SubFPC_KRFP_KRFPC
46202589    APC2D10_X_X     0.00000  iteration_4  SubFP_SubFPC_KRFP_KRFPC

[46202590 rows x 4 columns]


In [87]:
features_dropped = [
    "MACCSFP_PubchemFP_SubFPC",
    "ExtFP_GraphFP_AD2D_APC2D",
    "ExtFP_GraphFP_SubFP_APC2D",
    "ExtFP_EstateFP",
]
feat_common = feature_importance_df[
    feature_importance_df["Dropped_Features"].isin(features_dropped)
]

# Number of top-ranked features kept for each drop configuration
TOP_N_FEATURES = 100


def _top_features_by_mean_importance(feat_df, n=TOP_N_FEATURES):
    """Return the names of the n features with the highest mean importance.

    The mean is taken over all rows of ``feat_df`` that share the same
    "Feature" value; names are returned in descending order of that mean.
    """
    return (
        feat_df.groupby("Feature")["Importance"]
        .mean()
        .sort_values(ascending=False)
        .head(n)
        .index.tolist()
    )


# Group by 'Dropped_Features' and rank the features inside each group.
# Only the two columns the ranking needs are passed on, which also keeps the
# grouping column out of the applied function (operating on it is deprecated
# in recent pandas versions).
grouped_features = feat_common.groupby("Dropped_Features")[
    ["Feature", "Importance"]
].apply(_top_features_by_mean_importance)

# Convert the grouped result to a list of lists (one list per drop group)
ls_features = grouped_features.tolist()
print(len(ls_features))

4


In [88]:
# Convert each sublist to a set (a real list, so it can be inspected or
# reused; a map object would be exhausted after the first pass)
sets = [set(features) for features in ls_features]

# Use set intersection to find the elements present in every sublist.
# set.intersection() needs at least one argument and raises TypeError when
# unpacked from an empty list, so no sublists means no common features.
common_features = set.intersection(*sets) if sets else set()

# Output the result; the names are sorted because the iteration order of a
# set of strings can change between interpreter sessions.
print(len(common_features))
print(sorted(common_features))

23
{'FP168', 'KRFP297', 'FP1006', 'FP802', 'FP823', 'KRFPC4757', 'FP890', 'FP349', 'KRFP298', 'FP289', 'FP277', 'FP937', 'KRFP4757', 'FP1007', 'EStateFP33', 'FP187', 'FP932', 'FP355', 'FP598', 'KRFPC3389', 'FP316', 'FP130', 'KRFPC3884'}
