In [1]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
import dxdata
import re
import os

In [2]:
# Create the "feature_selection" subfolder if it doesn't already exist
os.makedirs("feature_selection", exist_ok=True)

# load data

In [3]:
# Load the all_depression training and testing DataFrames from the encoded pickle files
train_df_all = pd.read_pickle("data/all_late_depression_train_encoded_500.pkl")
test_df_all = pd.read_pickle("data/all_late_depression_test_encoded_500.pkl")

# Load the all_depression training and testing DataFrames with drop_first=True (encoded_drop)
train_df_all_drop = pd.read_pickle("data/all_late_depression_train_encoded_drop_500.pkl")
test_df_all_drop = pd.read_pickle("data/all_late_depression_test_encoded_drop_500.pkl")

In [4]:
# List of special columns only available in the test dataset
special_columns = ['eid', 'p130894', 'p130895', 'p53_i0']

In [5]:
# columns with polygenetic risk scores (PRS) from UK Biobank
# columns with too many missing values that have therefore been removed by data cleaning are already excluded
PRS_columns = [
    "p26202", "p26204", "p26206", "p26210", "p26212", "p26214", "p26216", 
    "p26218", "p26220", "p26223", "p26225", "p26227", "p26229", "p26232", 
    "p26234", "p26238", "p26240", "p26242", "p26244", "p26246", "p26248", 
    "p26250", "p26252", "p26254", "p26258", "p26260", "p26265", "p26267", 
    "p26269", "p26273", "p26275", "p26278", "p26283", "p26285", "p26287", 
    "p26289"
]

In [6]:
# Print the first row (head) for all four datasets

print("\nHead of train_df_all (drop_first=False):")
print(train_df_all.head(1))

print("\nHead of test_df_all (drop_first=False):")
print(test_df_all.head(1))

print("\nHead of train_df_all_drop (drop_first=True):")
print(train_df_all_drop.head(1))

print("\nHead of test_df_all_drop (drop_first=True):")
print(test_df_all_drop.head(1))


Head of train_df_all (drop_first=False):
          p34    p46_i0    p47_i0   p48_i0    p49_i0    p50_i0    p51_i0  \
126  1.349564 -1.732448 -0.621671 -0.47625  0.587172 -1.312008 -1.124861   

       p68_i0    p74_i0   p77_i0  ...  p23075_i0_Category_A  \
126 -1.122776 -0.832024 -0.31545  ...                   0.0   

     p23075_i0_Category_B  p23075_i0_Category_C  p23075_i0_Category_D  \
126                   0.0                   0.0                   1.0   

     p23075_i0_Category_E  p23165_Category_A  p23165_Category_B  \
126                   0.0                0.0                1.0   

     p23165_Category_C  p23165_Category_D  p23165_Category_E  
126                0.0                0.0                0.0  

[1 rows x 2857 columns]

Head of test_df_all (drop_first=False):
          p34    p46_i0    p47_i0    p48_i0    p49_i0   p50_i0    p51_i0  \
126  0.758108 -1.177368 -1.027585  1.053287  1.447405 -1.49189  1.725702   

       p68_i0    p74_i0    p77_i0  ...  p23075_i0_C

# feature selection
- mutual information
- logistic regression with lasso

## mutual information

### calculate mutual information for each feature

In [7]:
# Function to calculate mutual information for all features and return the complete DataFrame
def calculate_mutual_info(train_df, target_column):
    # Separate features and target
    X = train_df.drop(columns=[target_column])  # Features
    y = train_df[target_column]  # Target variable

    # Calculate mutual information between each feature and the target
    mutual_info = mutual_info_classif(X, y, random_state=81)

    # Create a DataFrame to display the mutual information scores
    mutual_info_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mutual_info})

    # Sort the features by their mutual information scores in descending order
    mutual_info_df.sort_values(by='Mutual Information', ascending=False, inplace=True)

    # Print the top 10 features by mutual information scores
    print(mutual_info_df.head(10))

    # Return the complete DataFrame with mutual information scores for all features
    return mutual_info_df

# Calculate and store mutual information for both datasets

# For train_df_all (drop_first=False)
print("Top 10 Mutual Information Features for train_df_all (drop_first=False):")
mutual_info_all = calculate_mutual_info(train_df_all, 'target')

# For train_df_all_drop (drop_first=True)
print("\nTop 10 Mutual Information Features for train_df_all_drop (drop_first=True):")
mutual_info_all_drop = calculate_mutual_info(train_df_all_drop, 'target')

Top 10 Mutual Information Features for train_df_all (drop_first=False):
                   Feature  Mutual Information
191              p23405_i0            0.093608
1292   p2674_i0_Category_B            0.092549
1011   p1920_i0_Category_A            0.084037
803    p1408_i0_Category_C            0.078094
167              p23112_i0            0.077085
1613   p3659_i0_Category_C            0.068769
1312   p2714_i0_Category_B            0.068029
2166   p5790_i0_Category_A            0.066839
2525  p10886_i0_Category_E            0.066743
2090   p5441_i0_Category_E            0.066603

Top 10 Mutual Information Features for train_df_all_drop (drop_first=True):
                  Feature  Mutual Information
191             p23405_i0            0.093608
1825  p5790_i0_Category_E            0.087695
167             p23112_i0            0.077085
1642  p4631_i0_Category_B            0.068530
783   p1538_i0_Category_C            0.067096
766   p1498_i0_Category_B            0.065788
892   p1873_

### select different features sizes with highest mutual information

In [8]:
# Define the different feature sizes to select
feature_sizes = [5, 10, 15, 20, 30, 50]

# Function to select top features, create new DataFrames, and save them
def select_and_save_top_features(mutual_info_df, train_df, dataset_name, target_column='target'):
    for size in feature_sizes:
        # Step 1: Identify the top features with the highest mutual information
        top_features = mutual_info_df.head(size)['Feature']

        # Step 2: Create train_df_features_selected
        train_df_features_selected = train_df.loc[:, top_features].copy()

        # Add the target column back to the train_df_features_selected
        train_df_features_selected.loc[:, target_column] = train_df[target_column].values

        # Define the variable name
        if size == len(mutual_info_df):
            train_var_name = f'{dataset_name}_features_all'
        else:
            train_var_name = f'{dataset_name}_features_{size}'

        # Dynamically create a variable to store the DataFrame
        globals()[train_var_name] = train_df_features_selected

        # Save the DataFrame as a .pkl file
        train_df_features_selected.to_pickle(f"feature_selection/{train_var_name}.pkl")

        # Display the shape of the resulting DataFrame to confirm
        print(f"Feature size: {size} for {dataset_name}")
        print(f"Shape of {train_var_name}:", train_df_features_selected.shape)

# Apply the function for both datasets

# For train_df_all (drop_first=False)
print("Processing train_df_all (drop_first=False):")
select_and_save_top_features(mutual_info_all, train_df_all, 'train_df_all')

# For train_df_all_drop (drop_first=True)
print("\nProcessing train_df_all_drop (drop_first=True):")
select_and_save_top_features(mutual_info_all_drop, train_df_all_drop, 'train_df_all_drop')

Processing train_df_all (drop_first=False):
Feature size: 5 for train_df_all
Shape of train_df_all_features_5: (500, 6)
Feature size: 10 for train_df_all
Shape of train_df_all_features_10: (500, 11)
Feature size: 15 for train_df_all
Shape of train_df_all_features_15: (500, 16)
Feature size: 20 for train_df_all
Shape of train_df_all_features_20: (500, 21)
Feature size: 30 for train_df_all
Shape of train_df_all_features_30: (500, 31)
Feature size: 50 for train_df_all
Shape of train_df_all_features_50: (500, 51)

Processing train_df_all_drop (drop_first=True):
Feature size: 5 for train_df_all_drop
Shape of train_df_all_drop_features_5: (500, 6)
Feature size: 10 for train_df_all_drop
Shape of train_df_all_drop_features_10: (500, 11)
Feature size: 15 for train_df_all_drop
Shape of train_df_all_drop_features_15: (500, 16)
Feature size: 20 for train_df_all_drop
Shape of train_df_all_drop_features_20: (500, 21)
Feature size: 30 for train_df_all_drop
Shape of train_df_all_drop_features_30: (500

### select different feature sizes from test dataset

In [9]:
# Function to select top features from test datasets based on selected train features
def select_test_features(mutual_info_df, train_dataset_name, test_df, test_dataset_name, target_column='target'):
    for size in feature_sizes:
        # Step 1: Define the train file name and load the corresponding train DataFrame from the .pkl file
        if size == len(mutual_info_df):
            train_var_name = f'{train_dataset_name}_features_all'
        else:
            train_var_name = f'{train_dataset_name}_features_{size}'

        # Load the train DataFrame from the corresponding .pkl file
        train_df_features_selected = pd.read_pickle(f"feature_selection/{train_var_name}.pkl")

        # Ensure the train DataFrame was loaded correctly
        if train_df_features_selected is None:
            print(f"Train DataFrame for feature size {size} not found!")
            continue

        # Step 2: Identify the selected features including 'target'
        train_top_features = train_df_features_selected.columns  # Includes 'target'

        # Step 3: Select the same features from the test dataset, including 'target' if available in test
        test_top_features = train_top_features.intersection(test_df.columns)
        test_df_features_selected = test_df.loc[:, test_top_features]

        # Step 4: Retain the special columns and move them to the last 4 positions
        special_columns_in_test = [col for col in special_columns if col in test_df.columns]
        test_df_features_selected = pd.concat([test_df_features_selected, test_df[special_columns_in_test]], axis=1)

        # Define the test variable name
        if size == len(mutual_info_df):
            test_var_name = f'{test_dataset_name}_features_all'
        else:
            test_var_name = f'{test_dataset_name}_features_{size}'

        # Save the test DataFrame as a .pkl file
        test_df_features_selected.to_pickle(f"feature_selection/{test_var_name}.pkl")

        # Display the shape of the resulting test DataFrame to confirm
        print(f"Feature size: {size} for {test_dataset_name}")
        print(f"Shape of {test_var_name}:", test_df_features_selected.shape)

# Apply the function for both test datasets

# For test_df_all (drop_first=False)
print("Processing test_df_all (drop_first=False):")
select_test_features(mutual_info_all, 'train_df_all', test_df_all, 'test_df_all')

# For test_df_all_drop (drop_first=True)
print("\nProcessing test_df_all_drop (drop_first=True):")
select_test_features(mutual_info_all_drop, 'train_df_all_drop', test_df_all_drop, 'test_df_all_drop')

Processing test_df_all (drop_first=False):
Feature size: 5 for test_df_all
Shape of test_df_all_features_5: (500, 10)
Feature size: 10 for test_df_all
Shape of test_df_all_features_10: (500, 15)
Feature size: 15 for test_df_all
Shape of test_df_all_features_15: (500, 20)
Feature size: 20 for test_df_all
Shape of test_df_all_features_20: (500, 25)
Feature size: 30 for test_df_all
Shape of test_df_all_features_30: (500, 35)
Feature size: 50 for test_df_all
Shape of test_df_all_features_50: (500, 55)

Processing test_df_all_drop (drop_first=True):
Feature size: 5 for test_df_all_drop
Shape of test_df_all_drop_features_5: (500, 10)
Feature size: 10 for test_df_all_drop
Shape of test_df_all_drop_features_10: (500, 15)
Feature size: 15 for test_df_all_drop
Shape of test_df_all_drop_features_15: (500, 20)
Feature size: 20 for test_df_all_drop
Shape of test_df_all_drop_features_20: (500, 25)
Feature size: 30 for test_df_all_drop
Shape of test_df_all_drop_features_30: (500, 35)
Feature size: 50

In [10]:
# Function to add PRS columns before the target column in the feature-selected DataFrames
def add_columns_before_target(full_df, feature_df, columns_to_add, special_columns=None):
    # Extract the columns to add from the full dataset
    new_columns = full_df[columns_to_add]

    # Drop the target column temporarily from the feature-selected DataFrame
    target = feature_df['target']
    feature_df = feature_df.drop(columns=['target'])

    # Concatenate the new columns and the original feature-selected DataFrame
    updated_df = pd.concat([new_columns, feature_df], axis=1)

    # Add the target column back
    updated_df['target'] = target

    # For test datasets: Retain the special columns and move them after 'target'
    if special_columns is not None:
        special_columns_in_test = [col for col in special_columns if col in full_df.columns]
        updated_df = pd.concat([updated_df, full_df[special_columns_in_test]], axis=1)

    return updated_df

# Load the train and test DataFrames from .pkl files
train_df_all_features_10 = pd.read_pickle("feature_selection/train_df_all_features_10.pkl")
test_df_all_features_10 = pd.read_pickle("feature_selection/test_df_all_features_10.pkl")
train_df_all_drop_features_10 = pd.read_pickle("feature_selection/train_df_all_drop_features_10.pkl")
test_df_all_drop_features_10 = pd.read_pickle("feature_selection/test_df_all_drop_features_10.pkl")

# Apply the function to each dataset

# For train_df_all_features_10 (drop_first=False)
train_df_all_features_10_PRS = add_columns_before_target(train_df_all, train_df_all_features_10, PRS_columns)

# For test_df_all_features_10 (drop_first=False), include special columns
test_df_all_features_10_PRS = add_columns_before_target(test_df_all, test_df_all_features_10, PRS_columns, special_columns)

# For train_df_all_drop_features_10 (drop_first=True)
train_df_all_drop_features_10_PRS = add_columns_before_target(train_df_all_drop, train_df_all_drop_features_10, PRS_columns)

# For test_df_all_drop_features_10 (drop_first=True), include special columns
test_df_all_drop_features_10_PRS = add_columns_before_target(test_df_all_drop, test_df_all_drop_features_10, PRS_columns, special_columns)

# Print updated DataFrame shapes to confirm
print("Updated train_df_all_features_10_PRS shape (drop_first=False):", train_df_all_features_10_PRS.shape)
print("Updated test_df_all_features_10_PRS shape (drop_first=False):", test_df_all_features_10_PRS.shape)
print("Updated train_df_all_drop_features_10_PRS shape (drop_first=True):", train_df_all_drop_features_10_PRS.shape)
print("Updated test_df_all_drop_features_10_PRS shape (drop_first=True):", test_df_all_drop_features_10_PRS.shape)

# Save the DataFrames with the suffix '_PRS'
train_df_all_features_10_PRS.to_pickle("feature_selection/train_df_all_features_10_PRS.pkl")
test_df_all_features_10_PRS.to_pickle("feature_selection/test_df_all_features_10_PRS.pkl")
train_df_all_drop_features_10_PRS.to_pickle("feature_selection/train_df_all_drop_features_10_PRS.pkl")
test_df_all_drop_features_10_PRS.to_pickle("feature_selection/test_df_all_drop_features_10_PRS.pkl")

Updated train_df_all_features_10_PRS shape (drop_first=False): (500, 47)
Updated test_df_all_features_10_PRS shape (drop_first=False): (500, 55)
Updated train_df_all_drop_features_10_PRS shape (drop_first=True): (500, 47)
Updated test_df_all_drop_features_10_PRS shape (drop_first=True): (500, 55)


## logistic regression with Lasso

In [11]:
# Function to perform logistic regression with Lasso and select 10 features
def select_top_features_via_logistic_lasso(train_df, target_column='target', num_features=10, threshold=0.0001):
    # Separate features and target
    X = train_df.drop(columns=[target_column])
    y = train_df[target_column]
    
    # Initialize Logistic Regression with L1 regularization (Lasso)
    model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, C=1.0)

    # Train the model
    model.fit(X, y)
    
    # Get the coefficients
    coefficients = np.abs(model.coef_)[0]
    
    # Keep adjusting regularization until only num_features have coefficients > threshold
    while np.sum(coefficients > threshold) > num_features:
        model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, C=model.C * 0.99) 
        model.fit(X, y)
        coefficients = np.abs(model.coef_)[0]
    
    # Extract the selected features (those with non-zero coefficients > threshold)
    selected_features = X.columns[coefficients > threshold]
    
    return selected_features

# Apply the function to both datasets (drop_first=False and drop_first=True)
selected_features_all = select_top_features_via_logistic_lasso(train_df_all, target_column='target', num_features=10)
selected_features_all_drop = select_top_features_via_logistic_lasso(train_df_all_drop, target_column='target', num_features=10)

# Print the selected features
print("Selected features for train_df_all (drop_first=False):", selected_features_all)
print("Selected features for train_df_all_drop (drop_first=True):", selected_features_all_drop)

Selected features for train_df_all (drop_first=False): Index(['p3147_i0', 'p4079_i0_a0', 'p20022_i0', 'p20153_i0', 'p23105_i0',
       'p23458_i0', 'p26217', 'p26417', 'p30110_i0', 'p30190_i0'],
      dtype='object')
Selected features for train_df_all_drop (drop_first=True): Index(['p3147_i0', 'p4079_i0_a0', 'p20022_i0', 'p20153_i0', 'p23105_i0',
       'p23458_i0', 'p26217', 'p26417', 'p30110_i0', 'p30190_i0'],
      dtype='object')


In [12]:
def select_features_from_datasets(train_df, test_df, selected_features, target_column='target', special_columns=None):
    # Select features + target column for train dataset
    train_df_selected = train_df[selected_features.tolist() + [target_column]].copy()
    
    # Select features + target column for test dataset
    test_df_selected = test_df[selected_features.tolist() + [target_column]].copy()
    
    # If special columns exist, add them to the test dataset
    if special_columns is not None:
        special_columns_in_test = [col for col in special_columns if col in test_df.columns]
        test_df_selected = pd.concat([test_df_selected, test_df[special_columns_in_test]], axis=1)
    
    return train_df_selected, test_df_selected

In [13]:
# Apply the function to create the new datasets

# For the all dataset (drop_first=False, with _features_10_logistic suffix)
train_df_all_features_10_logistic, test_df_all_features_10_logistic = select_features_from_datasets(
    train_df_all, test_df_all, selected_features_all, target_column='target', special_columns=special_columns)

# For the all_drop dataset (drop_first=True, with _features_10_logistic suffix)
train_df_all_drop_features_10_logistic, test_df_all_drop_features_10_logistic = select_features_from_datasets(
    train_df_all_drop, test_df_all_drop, selected_features_all_drop, target_column='target', special_columns=special_columns)

# Print shapes to confirm the new datasets
print("train_df_all_features_10_logistic shape (drop_first=False):", train_df_all_features_10_logistic.shape)
print("test_df_all_features_10_logistic shape (drop_first=False):", test_df_all_features_10_logistic.shape)
print("train_df_all_drop_features_10_logistic shape (drop_first=True):", train_df_all_drop_features_10_logistic.shape)
print("test_df_all_drop_features_10_logistic shape (drop_first=True):", test_df_all_drop_features_10_logistic.shape)

# Save the new datasets with the suffix '_features_10_logistic'
train_df_all_features_10_logistic.to_pickle("feature_selection/train_df_all_features_10_logistic.pkl")
test_df_all_features_10_logistic.to_pickle("feature_selection/test_df_all_features_10_logistic.pkl")
train_df_all_drop_features_10_logistic.to_pickle("feature_selection/train_df_all_drop_features_10_logistic.pkl")
test_df_all_drop_features_10_logistic.to_pickle("feature_selection/test_df_all_drop_features_10_logistic.pkl")

train_df_all_features_10_logistic shape (drop_first=False): (500, 11)
test_df_all_features_10_logistic shape (drop_first=False): (500, 15)
train_df_all_drop_features_10_logistic shape (drop_first=True): (500, 11)
test_df_all_drop_features_10_logistic shape (drop_first=True): (500, 15)


## manual feature selection

In [14]:
manual_selected_features = pd.Index([
    'p47_i0',
    'p135_i0',
    'p137_i0',
    'p22189',
    'p30140_i0',
    'p1920_i0_Category_B',
    'p1920_i0_Category_B',
    'p1930_i0_Category_C',
    'p2090_i0_Category_B',
    'p2090_i0_Category_C'
])

In [15]:
# For the all dataset (drop_first=False, with _features_10_manual suffix)
train_df_all_features_10_manual, test_df_all_features_10_manual = select_features_from_datasets(
    train_df_all, test_df_all, manual_selected_features, target_column='target', special_columns=special_columns)

# For the all_drop dataset (drop_first=True, with _features_10_manual suffix)
train_df_all_drop_features_10_manual, test_df_all_drop_features_10_manual = select_features_from_datasets(
    train_df_all_drop, test_df_all_drop, manual_selected_features, target_column='target', special_columns=special_columns)

# Print shapes to confirm the new datasets
print("train_df_all_features_10_manual shape (drop_first=False):", train_df_all_features_10_manual.shape)
print("test_df_all_features_10_manual shape (drop_first=False):", test_df_all_features_10_manual.shape)
print("train_df_all_drop_features_10_manual shape (drop_first=True):", train_df_all_drop_features_10_manual.shape)
print("test_df_all_drop_features_10_manual shape (drop_first=True):", test_df_all_drop_features_10_manual.shape)

# Save the new datasets with the suffix '_features_10_manual'
train_df_all_features_10_manual.to_pickle("feature_selection/train_df_all_features_10_manual.pkl")
test_df_all_features_10_manual.to_pickle("feature_selection/test_df_all_features_10_manual.pkl")
train_df_all_drop_features_10_manual.to_pickle("feature_selection/train_df_all_drop_features_10_manual.pkl")
test_df_all_drop_features_10_manual.to_pickle("feature_selection/test_df_all_drop_features_10_manual.pkl")

train_df_all_features_10_manual shape (drop_first=False): (500, 11)
test_df_all_features_10_manual shape (drop_first=False): (500, 15)
train_df_all_drop_features_10_manual shape (drop_first=True): (500, 11)
test_df_all_drop_features_10_manual shape (drop_first=True): (500, 15)
