In [1]:
import os
import pandas as pd
import re

In [14]:
# Number of features contributed by each feature group.
# Key order is preserved from the original definition.
feature_group_sizes = dict(
    FP=1024,
    ExtFP=1024,
    EStateFP=79,
    GraphFP=1024,
    MACCSFP=166,
    PubchemFP=881,
    SubFP=307,
    KRFP=4860,
    AD2D=780,
    SubFPC=307,
    KRFPC=4860,
    APC2D=780,
)

# Column layout of the final report: one count column per feature group,
# then the total, the six metrics, and finally the run identifiers.
_group_columns = [
    "FP", "ExtFP", "EStateFP", "GraphFP", "MACCSFP", "PubchemFP",
    "SubFP", "SubFPC", "KRFP", "KRFPC", "AD2D", "APC2D",
]
_metric_columns = [
    "Train Logloss", "Validation Logloss", "Test Logloss",
    "Train AUC", "Validation AUC", "Test AUC",
]
columns = _group_columns + ["Total features"] + _metric_columns + ["Iteration", "Drop"]


# Function to compute the per-group feature counts left after dropping groups
def extract_feature_counts(drop_list):
    """Return the feature count of every group, with dropped groups set to 0.

    Args:
        drop_list: container of group names to drop; membership is tested
            with ``in``.

    Returns:
        dict with one entry per key of ``feature_group_sizes`` (0 when the
        key is found in ``drop_list``, the group's size otherwise) plus a
        ``"Total features"`` entry holding the sum of those counts.

    NOTE(review): names are compared exactly (case-sensitive) against the
    keys of ``feature_group_sizes``, so a drop name spelled "EstateFP" does
    not zero the "EStateFP" group. Confirm whether that is intended.
    """
    counts = {}
    for group, size in feature_group_sizes.items():
        counts[group] = 0 if group in drop_list else size

    # The total is taken before the "Total features" key itself is added.
    counts["Total features"] = sum(counts.values())
    return counts

In [3]:
# Function to extract metrics from log files
def extract_metrics(log_file):
    """Parse the reported metrics out of a training log file.

    A line counts as a metric line when it contains ``"<name>: "`` for one of
    the six known metric names. The text that follows that separator (up to
    the next ``": "``, if any) is converted with ``float``. Lines that merely
    mention a metric name without the separator are ignored. When a metric
    appears on several lines, the last occurrence wins.

    Args:
        log_file: path of the log file to read.

    Returns:
        dict mapping metric name to its float value, containing only the
        metrics that were found in the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the text after ``"<name>: "`` is not a valid float.
    """
    # Checked in this order; the first name whose separator is present wins.
    metric_names = (
        "Train Logloss",
        "Validation Logloss",
        "Test Logloss",
        "Train AUC",
        "Validation AUC",
        "Test AUC",
    )

    metrics = {}
    with open(log_file, "r") as file:
        for line in file:
            for name in metric_names:
                # Anchor on the metric name itself rather than on the first
                # ": " of the line, so a "prefix: " before the metric does not
                # shift which token is parsed.
                _, separator, remainder = line.partition(name + ": ")
                if not separator:
                    continue
                metrics[name] = float(remainder.split(": ")[0].strip())
                break
    return metrics

In [8]:
# Directory where logs are stored (modify this based on your setup)
log_base_dir = "logs"

# Initialize a list to store the report data
report_data = []

# Traverse the directory structure to find log files. Entries are visited in
# sorted order because os.listdir returns names in arbitrary order, which
# would otherwise make the row order of the report vary between runs.
for iteration_dir in sorted(os.listdir(log_base_dir)):
    iteration_path = os.path.join(log_base_dir, iteration_dir)

    # Skip anything that is not a directory whose name contains
    # "iteration_<N>" (stray files, checkpoint folders) instead of raising.
    iteration_match = re.search(r"iteration_(\d+)", iteration_dir)
    if iteration_match is None or not os.path.isdir(iteration_path):
        continue

    # \d+ keeps multi-digit iteration numbers intact; the value stays a string.
    n_iteration = iteration_match.group(1)

    for log_file in sorted(os.listdir(iteration_path)):
        # Log files are named "xgboost_drop_<group>_<group>....txt"; the dot is
        # escaped and the whole name must match so other files are ignored.
        drop_match = re.fullmatch(r"xgboost_drop_(.*)\.txt", log_file)
        if drop_match is None:
            continue

        log_file_path = os.path.join(iteration_path, log_file)

        # Extract the list of dropped features from the filename
        drop_list = drop_match.group(1).split("_")

        # Extract feature counts based on the dropped features
        feature_counts = extract_feature_counts(drop_list)

        # Extract metrics from the log file
        metrics = extract_metrics(log_file_path)

        # Combine feature counts and metrics into a single dictionary
        drop_name = "_".join(drop_list)
        report_row = {
            **feature_counts,
            **metrics,
            "Iteration": n_iteration,
            "Drop": drop_name,
        }

        # Append the row to the report data list
        report_data.append(report_row)

In [15]:
# Build the report table from the collected rows; `columns` fixes the column order
report_df = pd.DataFrame(data=report_data, columns=columns)
print(report_df.shape)

# Show the first five rows transposed (one report row per displayed column)
report_df.iloc[:5].transpose()

(4059, 21)


Unnamed: 0,0,1,2,3,4
FP,1024,1024,1024,1024,1024
ExtFP,1024,1024,1024,1024,1024
EStateFP,79,79,79,79,79
GraphFP,1024,1024,1024,1024,1024
MACCSFP,166,166,166,166,166
PubchemFP,881,881,881,881,881
SubFP,307,307,307,307,307
SubFPC,307,307,307,307,307
KRFP,4860,4860,4860,4860,4860
KRFPC,4860,4860,4860,4860,4860


In [10]:
# Number of report rows collected for each iteration
rows_per_iteration = report_df["Iteration"].value_counts()
rows_per_iteration

Iteration
0    812
1    812
2    812
3    812
4    811
Name: count, dtype: int64

In [11]:
# Summary statistics of Test AUC over all report rows
test_auc = report_df["Test AUC"]
test_auc.describe()

count    4059.000000
mean        0.894777
std         0.011530
min         0.845527
25%         0.887678
50%         0.896763
75%         0.903098
max         0.922102
Name: Test AUC, dtype: float64

In [12]:
# Write the report to final_report.csv in the working directory, without the row index
report_df.to_csv(path_or_buf="final_report.csv", index=False)

In [19]:
# Rows that reach the highest Test AUC in the report (ties included)
best_test_auc = report_df["Test AUC"].max()
report_df.loc[report_df["Test AUC"] == best_test_auc]

Unnamed: 0,FP,ExtFP,EStateFP,GraphFP,MACCSFP,PubchemFP,SubFP,SubFPC,KRFP,KRFPC,...,APC2D,Total features,Train Logloss,Validation Logloss,Test Logloss,Train AUC,Validation AUC,Test AUC,Iteration,Drop
3320,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.34534,0.540249,0.45324,0.978163,0.794036,0.922102,4,EstateFP_MACCSFP_PubchemFP_SubFPC
3939,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.34534,0.540249,0.45324,0.978163,0.794036,0.922102,4,MACCSFP_PubchemFP_SubFPC


In [22]:
# Every report row whose Drop label is exactly "MACCSFP_PubchemFP_SubFPC"
drop_mask = report_df["Drop"].eq("MACCSFP_PubchemFP_SubFPC")
report_df.loc[drop_mask]

Unnamed: 0,FP,ExtFP,EStateFP,GraphFP,MACCSFP,PubchemFP,SubFP,SubFPC,KRFP,KRFPC,...,APC2D,Total features,Train Logloss,Validation Logloss,Test Logloss,Train AUC,Validation AUC,Test AUC,Iteration,Drop
692,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.304485,0.549485,0.445929,0.983813,0.778995,0.902959,0,MACCSFP_PubchemFP_SubFPC
1504,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.416971,0.553927,0.496276,0.959378,0.78546,0.892447,1,MACCSFP_PubchemFP_SubFPC
2316,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.30784,0.557995,0.457284,0.983583,0.770814,0.894953,2,MACCSFP_PubchemFP_SubFPC
3128,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.413209,0.552667,0.496125,0.960025,0.784734,0.893422,3,MACCSFP_PubchemFP_SubFPC
3939,1024,1024,79,1024,0,0,307,0,4860,4860,...,780,14738,0.34534,0.540249,0.45324,0.978163,0.794036,0.922102,4,MACCSFP_PubchemFP_SubFPC


In [27]:
# Mean and std of Test AUC (and the max feature total) per Drop label,
# ordered by mean Test AUC from best to worst; show the ten best.
auc_by_drop = report_df.groupby("Drop").agg(
    {
        "Test AUC": ["mean", "std"],
        "Total features": "max",
    }
)
auc_by_drop.sort_values(by=("Test AUC", "mean"), ascending=False).head(10)

Unnamed: 0_level_0,Test AUC,Test AUC,Total features
Unnamed: 0_level_1,mean,std,max
Drop,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ExtFP_GraphFP_AD2D_APC2D,0.913066,0.001653,12484
ExtFP_GraphFP_SubFP_APC2D,0.912746,0.002717,12957
ExtFP_EstateFP,0.910574,0.004832,15068
ExtFP,0.910574,0.004832,15068
ExtFP_EstateFP_AD2D_APC2D,0.910003,0.004388,13508
ExtFP_AD2D_APC2D,0.910003,0.004388,13508
ExtFP_GraphFP_MACCSFP_APC2D,0.909822,0.00707,13098
ExtFP_GraphFP_PubchemFP_APC2D,0.909307,0.004022,12383
ExtFP_GraphFP_MACCSFP_SubFP,0.909279,0.005636,13571
ExtFP_SubFP_SubFPC,0.909252,0.003478,14454


In [51]:
import os
import pandas as pd

# Define the path to the feat_imp directory
feat_imp_dir = "feat_imp"

# Only these drop combinations are loaded; every other file is skipped.
selected_drops = {
    "MACCSFP_PubchemFP_SubFPC",
    "ExtFP_GraphFP_AD2D_APC2D",
    "ExtFP_GraphFP_SubFP_APC2D",
    "ExtFP_EstateFP",
}

# Importance files are named "xgboost_drop_<dropped groups>.csv".
file_prefix = "xgboost_drop_"
file_suffix = ".csv"

# Initialize an empty list to hold the parsed data
feature_importance_data = []

# Loop through each iteration directory. Listings are sorted because
# os.listdir returns names in arbitrary order, which would otherwise make the
# row order of the combined frame vary between runs.
for iteration in sorted(os.listdir(feat_imp_dir)):
    iteration_path = os.path.join(feat_imp_dir, iteration)

    if not os.path.isdir(iteration_path):  # Check if it is a directory
        continue

    # Loop through each file in the iteration folder
    for filename in sorted(os.listdir(iteration_path)):
        if not (filename.startswith(file_prefix) and filename.endswith(file_suffix)):
            continue

        # Extract the features being dropped from the filename. Slicing removes
        # only the leading prefix and trailing suffix, never text in the middle.
        features_dropped = filename[len(file_prefix) : -len(file_suffix)]

        # only select group of drop features
        if features_dropped not in selected_drops:
            continue

        # Read the CSV file into a pandas dataframe
        file_path = os.path.join(iteration_path, filename)
        df = pd.read_csv(file_path)

        # Add iteration and dropped features columns to the dataframe
        df["Iteration"] = iteration
        df["Dropped_Features"] = features_dropped

        # Append the dataframe to the list
        feature_importance_data.append(df)

# pd.concat rejects an empty list with a generic message; say what is missing.
if not feature_importance_data:
    raise ValueError(
        f"No feature-importance files for the selected drops were found under {feat_imp_dir!r}"
    )

# Concatenate all dataframes into one
feature_importance_df = pd.concat(feature_importance_data, ignore_index=True)

# Display the dataframe
print(feature_importance_df)

              Feature  Importance    Iteration          Dropped_Features
0                 FP1    0.000119  iteration_0            ExtFP_EstateFP
1                 FP2    0.000452  iteration_0            ExtFP_EstateFP
2                 FP3    0.000000  iteration_0            ExtFP_EstateFP
3                 FP4    0.000858  iteration_0            ExtFP_EstateFP
4                 FP5    0.000000  iteration_0            ExtFP_EstateFP
...               ...         ...          ...                       ...
274695   APC2D10_B_Si    0.000000  iteration_4  MACCSFP_PubchemFP_SubFPC
274696    APC2D10_B_X    0.000000  iteration_4  MACCSFP_PubchemFP_SubFPC
274697  APC2D10_Si_Si    0.000000  iteration_4  MACCSFP_PubchemFP_SubFPC
274698   APC2D10_Si_X    0.000000  iteration_4  MACCSFP_PubchemFP_SubFPC
274699    APC2D10_X_X    0.000000  iteration_4  MACCSFP_PubchemFP_SubFPC

[274700 rows x 4 columns]


In [52]:
# Peek at the first five rows of the combined importance table
feature_importance_df.head(n=5)

Unnamed: 0,Feature,Importance,Iteration,Dropped_Features
0,FP1,0.000119,iteration_0,ExtFP_EstateFP
1,FP2,0.000452,iteration_0,ExtFP_EstateFP
2,FP3,0.0,iteration_0,ExtFP_EstateFP
3,FP4,0.000858,iteration_0,ExtFP_EstateFP
4,FP5,0.0,iteration_0,ExtFP_EstateFP


In [65]:
# Number of highest-ranked features kept for each drop combination.
TOP_N_FEATURES = 100

# One list of top feature names per drop combination, in the order below.
ls_features = []
for features_dropped in [
    "MACCSFP_PubchemFP_SubFPC",
    "ExtFP_GraphFP_AD2D_APC2D",
    "ExtFP_GraphFP_SubFP_APC2D",
    "ExtFP_EstateFP",
]:
    print(features_dropped)
    feat_df = feature_importance_df[
        feature_importance_df["Dropped_Features"] == features_dropped
    ]

    # Per-feature importance summary (min, max, mean, and row count) for this
    # drop combination, ordered by mean importance from highest to lowest and
    # truncated to the TOP_N_FEATURES best features.
    top_features = (
        feat_df.groupby(["Feature"])
        .agg(
            {
                "Importance": ["min", "max", "mean"],
                "Iteration": "count",
            }
        )
        .sort_values(("Importance", "mean"), ascending=False)
        .head(TOP_N_FEATURES)
    )

    # Only the feature names are kept.
    ls_features.append(top_features.index.tolist())

MACCSFP_PubchemFP_SubFPC
ExtFP_GraphFP_AD2D_APC2D
ExtFP_GraphFP_SubFP_APC2D
ExtFP_EstateFP


In [66]:
# Convert each sublist to a set. A list (not a one-shot map iterator) is used
# so `sets` can be inspected or reused after the intersection below.
sets = [set(features) for features in ls_features]

# Use set intersection to find common elements. With no input lists there is
# nothing in common, so fall back to an empty set instead of raising.
common_features = set.intersection(*sets) if sets else set()

# Output the result; names are sorted so the printed order is stable across runs
print(len(common_features))
print(sorted(common_features))

23
{'FP168', 'KRFP297', 'FP1006', 'FP802', 'FP823', 'KRFPC4757', 'FP890', 'FP349', 'KRFP298', 'FP289', 'FP277', 'FP937', 'KRFP4757', 'FP1007', 'EStateFP33', 'FP187', 'FP932', 'FP355', 'FP598', 'KRFPC3389', 'FP316', 'FP130', 'KRFPC3884'}


In [None]:
# Rank features by importance within each (Dropped_Features, Iteration) group.
# ascending=False gives the largest importance the smallest rank number.
importance_by_run = feature_importance_df.groupby(["Dropped_Features", "Iteration"])[
    "Importance"
]
feature_importance_df["Rank"] = importance_by_run.rank(ascending=False)

In [18]:
# Importance rows whose Dropped_Features label is exactly 'AD2D'
# NOTE(review): the cell that builds feature_importance_df keeps only four
# drop combinations and 'AD2D' is not one of them, so with that filter in
# place this selection is empty. Confirm which loader version is intended.
ad2d_mask = feature_importance_df["Dropped_Features"] == "AD2D"
feat_analyzing = feature_importance_df[ad2d_mask]
print(feat_analyzing.shape)
feat_analyzing.head()

(76560, 5)


Unnamed: 0,Feature,Importance,Iteration,Dropped_Features,Rank
0,FP1,0.0,iteration_0,AD2D,8597.0
1,FP2,0.00032,iteration_0,AD2D,1434.0
2,FP3,0.0,iteration_0,AD2D,8597.0
3,FP4,0.0,iteration_0,AD2D,8597.0
4,FP5,0.0,iteration_0,AD2D,8597.0


In [21]:
# Mean rank of each feature over the selected rows, smallest (best) first
mean_rank_by_feature = feat_analyzing.groupby("Feature")["Rank"].mean()
mean_rank_by_feature.sort_values()

Feature
FP890           31.8
EStateFP9       41.0
MACCSFP100      48.0
MACCSFP153      54.4
FP349           70.6
               ...  
KRFP3677      8605.5
KRFP3678      8605.5
KRFP3679      8605.5
KRFP3714      8605.5
KRFP4350      8605.5
Name: Rank, Length: 15312, dtype: float64

In [25]:
# Rank statistics per (Dropped_Features, Feature) pair; show the 20 pairs
# with the smallest mean rank.
rank_stats = feat_analyzing.groupby(["Dropped_Features", "Feature"]).agg(
    {"Rank": ["min", "max", "mean", "std"]}
)
rank_stats.sort_values(by=("Rank", "mean")).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Rank,Rank,Rank,Rank
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std
Dropped_Features,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AD2D,FP890,17.0,74.0,31.8,23.931151
AD2D,EStateFP9,7.0,103.0,41.0,37.914377
AD2D,MACCSFP100,12.0,120.0,48.0,41.737274
AD2D,MACCSFP153,47.0,63.0,54.4,7.127412
AD2D,FP349,9.0,150.0,70.6,54.150716
AD2D,FP130,37.0,153.0,70.8,48.463388
AD2D,ExtFP549,13.0,184.0,78.0,67.152811
AD2D,SubFP33,61.0,127.0,89.6,25.70603
AD2D,KRFP4231,1.0,515.0,109.0,227.002203
AD2D,KRFP297,19.0,217.0,109.2,99.72813


In [34]:
# Rank statistics for every (Dropped_Features, Feature) pair, ordered by mean rank
rank_summary_all = (
    feature_importance_df.groupby(["Dropped_Features", "Feature"])
    .agg({"Rank": ["min", "max", "mean", "std", "count"]})
    .sort_values(("Rank", "mean"))
)

# Keep pairs with more than two ranked rows and a mean rank below 20
has_more_than_two_rows = rank_summary_all[("Rank", "count")] > 2
has_mean_rank_below_20 = rank_summary_all[("Rank", "mean")] < 20
feat_summary = rank_summary_all[has_more_than_two_rows & has_mean_rank_below_20]
feat_summary.shape

(346, 5)

In [38]:
# Order the summary by its second index level and show the first 20 rows
summary_by_feature = feat_summary.sort_index(level=1)
summary_by_feature.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Rank,Rank,Rank,Rank,Rank
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,count
Dropped_Features,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FP_EstateFP_GraphFP_PubchemFP,AD2D93,8.0,30.0,19.0,9.924717,5
FP_ExtFP_EstateFP_KRFP,AD2D93,4.0,20.0,8.4,6.542171,5
FP_ExtFP_GraphFP_KRFP,AD2D93,3.0,16.0,11.8,5.118594,5
FP_ExtFP_KRFP,AD2D93,4.0,20.0,8.4,6.542171,5
FP_ExtFP_KRFP_KRFPC,AD2D93,4.0,20.0,8.4,6.542171,5
FP_ExtFP_MACCSFP_KRFP,AD2D93,5.0,27.0,12.4,8.988882,5
FP_ExtFP_MACCSFP_SubFPC,AD2D93,1.0,16.0,7.8,5.974948,5
FP_GraphFP_PubchemFP,AD2D93,8.0,30.0,19.0,9.924717,5
ExtFP_KRFP_AD2D,APC2D2_N_S,1.0,35.0,8.0,15.099669,5
FP_ExtFP_EstateFP_KRFPC,APC2D2_N_S,1.0,17.0,8.2,6.685806,5


In [40]:
# How many rows of the filtered summary each feature appears in
summary_features = feat_summary.reset_index()["Feature"]
summary_features.value_counts()

Feature
PubchemFP489    35
FP937           32
MACCSFP153      28
MACCSFP45       28
FP130           28
FP277           28
FP890           23
GraphFP582      20
GraphFP547      11
FP823           10
FP932            9
KRFPC4231        8
AD2D93           8
MACCSFP47        7
EStateFP9        7
KRFP4231         7
APC2D2_N_S       7
ExtFP549         6
ExtFP819         5
GraphFP224       5
KRFP297          5
GraphFP451       4
GraphFP8         4
KRFP3010         2
MACCSFP100       2
FP802            2
FP1006           2
MACCSFP81        2
GraphFP418       2
KRFP298          1
FP349            1
GraphFP571       1
SubFPC35         1
MACCSFP129       1
ExtFP956         1
GraphFP669       1
KRFP4810         1
FP91             1
Name: count, dtype: int64