# Feature selection

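Use scikit-learn to score each molecular descriptor against the pKa values, then keep only the descriptors that are not redundant with one another and that score above a threshold.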

Load the original dataframe, which holds the SMILES strings, the pKa values, and the descriptor columns.

In [1]:
import os
import pandas as pd

# Locations: where the full descriptor table is read from and where the
# reduced table will be written.
CSV_PATH = "https://gitlab.com/oasci/courses/pitt/biosc1540-2024s/-/raw/main/biosc1540/files/csv/pka/pka_with_desc.csv"
CSV_DIR = "../../files/csv/pka"
CSV_SAVE_PATH = os.path.join(CSV_DIR, "pka_desc_selected.csv")

# Read the full table; later cells rely on its "SMILES" and "pka_value" columns.
df = pd.read_csv(CSV_PATH)

In [2]:
# The pKa value is the regression target.
y = df["pka_value"]

# Every column other than the molecule identifier and the target is a
# candidate feature.
X_all = df.drop(columns=["SMILES", "pka_value"])

print(X_all.shape)

(1706, 210)


In [3]:
# Pairwise Pearson correlation between all candidate features.
# Some entries are NaN (e.g. the fr_thiocyan row in the printed output),
# presumably because that column is constant -- TODO confirm.
correlation_matrix = X_all.corr(method="pearson")
print(correlation_matrix)

                   MaxAbsEStateIndex  MaxEStateIndex  MinAbsEStateIndex  \
MaxAbsEStateIndex           1.000000        1.000000          -0.735691   
MaxEStateIndex              1.000000        1.000000          -0.735691   
MinAbsEStateIndex          -0.735691       -0.735691           1.000000   
MinEStateIndex             -0.708439       -0.708439           0.569787   
qed                        -0.065836       -0.065836           0.020809   
...                              ...             ...                ...   
fr_thiazole                -0.109902       -0.109902           0.120545   
fr_thiocyan                      NaN             NaN                NaN   
fr_thiophene               -0.067638       -0.067638           0.024200   
fr_unbrch_alkane           -0.093467       -0.093467           0.127537   
fr_urea                     0.059489        0.059489           0.007595   

                   MinEStateIndex       qed       SPS     MolWt  \
MaxAbsEStateIndex       -0.70843

In [4]:
# Two features whose absolute pairwise correlation exceeds this value are
# treated as redundant, and one of them is removed.
correlation_threshold = 0.99

# Work on a plain array of absolute correlations; element access is much
# cheaper than repeated DataFrame.iloc lookups inside the double loop.
abs_correlations = correlation_matrix.abs().to_numpy()
feature_names = list(correlation_matrix.columns)

# Visit each pair once by walking the strict lower triangle (j < i).
# NaN entries never satisfy the comparison, so they never trigger a drop.
columns_to_drop = set()
for i in range(len(feature_names)):
    for j in range(i):
        if abs_correlations[i, j] > correlation_threshold:
            # Of the two redundant columns, drop the one with the lower index (j).
            columns_to_drop.add(feature_names[j])

# Keep the surviving columns in their original order. Building this list from
# a set difference would leave the order up to string hashing, which changes
# between kernel restarts and would make the column order of X irreproducible.
columns_to_keep = [name for name in feature_names if name not in columns_to_drop]
X = X_all[columns_to_keep]

print(X.shape)

(1706, 199)


In [5]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score every remaining feature against the target with a univariate F-test.
# k is set to the number of columns, so no feature is filtered out at this
# step; the following cell reads the per-feature scores from the fitted selector.
n_features = X.shape[1]
selector = SelectKBest(k=n_features, score_func=f_regression)
X_new = selector.fit_transform(X, y)

In [6]:
# Minimum F-score a feature must reach to be kept.
MIN_F_SCORE = 100

# Get the names of all features
all_feature_names = X.columns

# Get the scores of all features
all_feature_scores = selector.scores_

# Create a dictionary of feature names and their scores
feature_scores_dict = dict(zip(all_feature_names, all_feature_scores))

# Separate features with a usable score from those whose score is NaN
# (NaN is the only value that is not equal to itself). NaN compares False
# against everything, so sorting a list that contains it gives no guaranteed
# order. Depending on the scikit-learn version, constant features may be
# scored as NaN -- verify against the installed version.
scored_features = [item for item in feature_scores_dict.items() if item[1] == item[1]]
unscored_features = [item for item in feature_scores_dict.items() if item[1] != item[1]]

# Sort the usable scores in descending order; NaN-scored features go last.
sorted_features = (
    sorted(scored_features, key=lambda item: item[1], reverse=True)
    + unscored_features
)

# Keep the features that reach the threshold, best first.
# NaN >= MIN_F_SCORE is False, so NaN-scored features are never selected.
features_high_score = [
    feature for feature, score in sorted_features if score >= MIN_F_SCORE
]

df_features_sorted = X[features_high_score]
print(df_features_sorted.shape)

(1706, 38)


In [7]:
# Start from a copy of the selected feature columns so X itself is not modified.
df_features_sorted = X[features_high_score].copy()

# Place the target and then the identifier at the front, giving the column
# order SMILES, pka_value, <selected features>.
df_features_sorted.insert(0, "pka_value", y)
df_features_sorted.insert(0, "SMILES", df["SMILES"])

print(df_features_sorted.head())

                   SMILES  pka_value  FractionCSP3  EState_VSA10  \
0  Brc1cc2cccnc2c2ncccc12       4.20           0.0           0.0   
1        Brc1ccc2ccccc2n1       1.05           0.0           0.0   
2        Brc1ccc2cccnc2c1       3.87           0.0           0.0   
3        Brc1ccc2ncccc2c1       3.91           0.0           0.0   
4        Brc1cccc2cccnc12       3.13           0.0           0.0   

   MinAbsPartialCharge  HallKierAlpha  MaxEStateIndex  SMR_VSA10  VSA_EState2  \
0             0.097537          -1.48        4.377685  37.735794     8.744815   
1             0.106326          -0.89        4.292315  26.832869     4.292315   
2             0.071267          -0.89        4.212037  26.832869     4.212037   
3             0.070223          -0.89        4.201481  26.832869     4.201481   
4             0.084336          -0.89        4.233866  26.832869     4.233866   

   VSA_EState8  ...   TPSA  NumAromaticRings  EState_VSA6  EState_VSA1  \
0          0.0  ...  25.78    

In [8]:
# Make sure the destination folder exists first: to_csv does not create
# missing directories and would otherwise fail on a fresh checkout.
save_dir = os.path.dirname(CSV_SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Write the reduced table without the row index.
df_features_sorted.to_csv(CSV_SAVE_PATH, index=False)