# Filtering the Features

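Two filters are applied in sequence: features with a variance below 0.2 are removed first, and then, for every pair of remaining features whose absolute pairwise correlation exceeds 0.95, the feature that appears later in the column order is removed.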

In [7]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# Load the full descriptor table, index it by molecule ID, and leave out the
# 'Potency' and '-logIC50' columns so they take no part in feature filtering.
dataset = (
    pd.read_csv('../3_train_test_split/descriptors_all.csv')
    .set_index('Molecule ChEMBL ID')
    .drop(columns=['Potency', '-logIC50'])
)
print("The original data size is ", dataset.shape)

The original data size is  (654, 1009)


## Remove Descriptors with Variance < 0.2

In [3]:
# Fit the selector only to learn which columns pass the variance threshold.
# The transformed NumPy array is not needed: selecting columns from the
# original DataFrame with the support mask keeps the index and column names.
sel = VarianceThreshold(threshold=0.2)
sel.fit(dataset)
seldataset = dataset.loc[:, sel.get_support()]

print("after filtering with variance > 0.2, the data size is", seldataset.shape)

after filtering with variance > 0.2, the data size is (654, 1007)


## Drop Features with Correlation > 0.95

In [6]:
def drop_highcorr(df, threshold=0.95):
    """
    Removes features that have a correlation higher than the specified threshold.

    The absolute value of the pairwise correlation (as computed by
    ``DataFrame.corr()`` with its default settings) is compared against
    ``threshold``. Only the strict upper triangle of the correlation matrix
    is inspected, so a column is dropped when it is too strongly correlated
    with any column that appears *before* it in ``df``; the earlier column of
    such a pair is never dropped because of that pair. The comparison column
    may itself end up being dropped.

    Correlations that are NaN (e.g. for constant columns) never exceed the
    threshold, so they do not cause a column to be dropped.

    Parameters:
        df (pd.DataFrame): The input DataFrame with features.
        threshold (float): The correlation threshold above which features will be removed.

    Returns:
        pd.DataFrame: A DataFrame with highly correlated features removed.
    """
    # Absolute pairwise correlation between all feature columns
    corr_matrix = df.corr().abs()

    # Boolean mask selecting the strict upper triangle (k=1 excludes the
    # diagonal, where every feature trivially correlates with itself)
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(upper_mask)

    # Vectorised column-wise check; masked (NaN) entries compare as False
    exceeds_threshold = (upper_tri > threshold).any(axis=0)
    to_drop = exceeds_threshold.index[exceeds_threshold].tolist()

    # Drop the identified columns
    return df.drop(columns=to_drop)


# Apply the correlation filter to the variance-filtered descriptors.
# The filter removes feature columns, not molecules (rows).
filtered_df = drop_highcorr(seldataset, threshold=0.95)

print("after filtering correlations > 0.95, the data size is", filtered_df.shape)

after filtering correlations > 0.95, the data size is (654, 565)


## Save Train and Test Sets with Reduced Features

In [16]:
input_dir = "../3_train_test_split/"
output_dir = "1_filter/"
os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists

# Columns to keep (from filtered_df)
# NOTE(review): these come from the frame that had 'Potency' and '-logIC50'
# dropped, so those columns are not written to the output files; confirm
# that this is intended.
selected_columns = filtered_df.columns

# Loop over train, test, and val CSV files
for file_type in ["train", "test", "val"]:
    # Find all matching CSV files for this type. glob returns files in
    # arbitrary order, so sort for a deterministic processing/log order.
    files = sorted(glob.glob(os.path.join(input_dir, f"{file_type}*.csv")))

    for file_path in files:
        df = pd.read_csv(file_path).set_index('Molecule ChEMBL ID')  # Read the CSV file

        # Keep only selected columns. A separate name is used so that the
        # notebook-level filtered_df (the source of selected_columns) is not
        # overwritten by the last file processed.
        split_filtered = df[selected_columns]

        # Construct the output file path
        file_name = os.path.basename(file_path)  # Extract file name
        output_path = os.path.join(output_dir, file_name)

        # Save the filtered DataFrame
        split_filtered.to_csv(output_path)
        print(f"Filtered file saved to {output_path}")

Filtered file saved to 1_filter/train_class_5.csv
Filtered file saved to 1_filter/train_class_4.csv
Filtered file saved to 1_filter/train_reg.csv
Filtered file saved to 1_filter/train_class_3.csv
Filtered file saved to 1_filter/train_class_2.csv
Filtered file saved to 1_filter/train_class_1.csv
Filtered file saved to 1_filter/train_reg_5.csv
Filtered file saved to 1_filter/train_reg_4.csv
Filtered file saved to 1_filter/train_reg_1.csv
Filtered file saved to 1_filter/train_reg_3.csv
Filtered file saved to 1_filter/train_reg_2.csv
Filtered file saved to 1_filter/train_class.csv
Filtered file saved to 1_filter/test_reg.csv
Filtered file saved to 1_filter/test_class.csv
Filtered file saved to 1_filter/val_class_5.csv
Filtered file saved to 1_filter/val_class_4.csv
Filtered file saved to 1_filter/val_class_1.csv
Filtered file saved to 1_filter/val_class_3.csv
Filtered file saved to 1_filter/val_class_2.csv
Filtered file saved to 1_filter/val_reg_4.csv
Filtered file saved to 1_filter/val_re