In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2
from sklearn.impute import SimpleImputer


In [2]:
# Load the iris dataset: features go into a labelled DataFrame, class labels into y.
data = load_iris()
y = data["target"]
X = pd.DataFrame(data["data"], columns=data["feature_names"])


In [13]:
# Method 1: Correlation-based Feature Selection (for classification)
# Rank every feature by the magnitude of its correlation with the target,
# strongest first (the sign is irrelevant for ranking, hence abs()).
target_series = pd.Series(y, name="Target")
correlation_scores = X.corrwith(target_series).abs().sort_values(ascending=False)

# Keep the 'k' highest-ranked features (e.g., top 3)
k = 3
selected_features_corr = correlation_scores.head(k).index

print(selected_features_corr)


Index(['petal width (cm)', 'petal length (cm)', 'sepal length (cm)'], dtype='object')


In [14]:
# Method 2: Mutual Information-based Feature Selection (for classification)
# Select the top 'k' features based on mutual information (e.g., top 3).
#
# mutual_info_classif is stochastic unless random_state is set, so the scores
# (and potentially the selected features) can differ from run to run.
# Fixing the seed keeps this cell reproducible under Restart & Run All.
k = 3
mi_random_state = 0


def _seeded_mutual_info(features, target):
    """Score features with mutual information using a fixed random seed."""
    return mutual_info_classif(features, target, random_state=mi_random_state)


# Keep the fitted selector under its own name instead of reusing
# `selected_features_mi` for two different kinds of object.
mi_selector = SelectKBest(score_func=_seeded_mutual_info, k=k).fit(X, y)
selected_features_mi = X.columns[mi_selector.get_support()]

print(selected_features_mi)

Index(['sepal length (cm)', 'petal length (cm)', 'petal width (cm)'], dtype='object')


In [6]:
# Chi-square Test
# Keep the 3 features with the highest chi-squared statistic against the target.
# SelectKBest and chi2 come from the notebook's top import cell, so they are
# not re-imported here.
# NOTE: chi2 only accepts non-negative feature values.
chi2_features = SelectKBest(chi2, k=3)
X_kbest_features = chi2_features.fit_transform(X, y)

# Shape before vs. after selection: the column count should drop to 3.
print(X.shape)
print(X_kbest_features.shape)


(150, 4)
(150, 3)


In [8]:
# Fisher's Score
# Features are scored with f_classif and the k best-scoring ones are kept.

k = 2  # Number of top features to select
f_score_selector = SelectKBest(k=k, score_func=f_classif)
X_new = f_score_selector.fit_transform(X, y)

# Integer column positions of the features that survived the selection
selected_feature_indices = f_score_selector.get_support(indices=True)

# Map the positions back to feature names and print the names
selected_features = list(map(data.feature_names.__getitem__, selected_feature_indices))
print("Selected Features:", selected_features, sep="\n")


Selected Features:
['petal length (cm)', 'petal width (cm)']


In [10]:
# Missing Value Ratio
# Drop features whose share of missing values reaches the threshold, then
# mean-impute whatever gaps remain in the surviving features.
# SimpleImputer is imported in the notebook's top import cell.

# Calculate the missing value ratio for each feature
missing_value_threshold = 0.3  # Set your desired threshold for missing values (e.g., 30%)
missing_value_ratio = X.isnull().mean()

# Select features with missing value ratio below the threshold
selected_features = X.columns[missing_value_ratio < missing_value_threshold]

# Work on a new frame restricted to the selected features. X itself is left
# untouched so earlier cells give the same results when re-run.
X_selected = X[selected_features]

# Impute missing values only if needed (mean imputation). Skipping the fit when
# nothing is missing also avoids fitting the imputer on an empty selection.
imputer = SimpleImputer(strategy='mean')
if X_selected.isnull().values.any():
    X_imputed = pd.DataFrame(
        imputer.fit_transform(X_selected),
        columns=selected_features,
        index=X.index,
    )
else:
    X_imputed = X_selected.copy()

# Print the selected features
print("Selected Features:")
print(selected_features)

Selected Features:
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
