<a href="https://colab.research.google.com/github/neelsoumya/python_machine_learning/blob/main/feature_selection_LASSO_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature selection

## LASSO

In [1]:
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import numpy as np

# Pull the diabetes regression dataset that ships with scikit-learn
data = load_diabetes()
X, y = data.data, data.target
feature_names = data.feature_names

# Rescale every column to zero mean / unit variance so the L1 penalty
# acts on coefficients that live on a comparable scale
X_scaled = StandardScaler().fit_transform(X)

# Fit LASSO; the penalty strength alpha is picked by 5-fold cross-validation
lasso = LassoCV(cv=5, random_state=1)
lasso.fit(X_scaled, y)

# A feature counts as "selected" when its coefficient is non-zero
selected_idx = np.flatnonzero(lasso.coef_)
selected_names = [feature_names[pos] for pos in selected_idx]

print(f"Chosen alpha: {lasso.alpha_:.6f}")
print("Selected feature indices by LASSO:", selected_idx.tolist())
print("Selected feature names by LASSO:", selected_names)

Chosen alpha: 0.078918
Selected feature indices by LASSO: [0, 1, 2, 3, 4, 5, 7, 8, 9]
Selected feature names by LASSO: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's4', 's5', 's6']


## Random forests

In [2]:
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the diabetes dataset again (the same data the LASSO cell uses)
data = load_diabetes()
X, y_reg = data.data, data.target
feature_names = data.feature_names

# Binarise the continuous target: 1 when strictly above the median, otherwise 0
y = np.greater(y_reg, np.median(y_reg)).astype(int)

# Train a 100-tree random forest classifier with a fixed seed
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Report every feature's importance alongside its index and name
importances = clf.feature_importances_
for i, imp in enumerate(importances):
    print(f"Feature {i} ({feature_names[i]}): importance {imp:.3f}")

Feature 0 (age): importance 0.073
Feature 1 (sex): importance 0.023
Feature 2 (bmi): importance 0.188
Feature 3 (bp): importance 0.117
Feature 4 (s1): importance 0.077
Feature 5 (s2): importance 0.087
Feature 6 (s3): importance 0.093
Feature 7 (s4): importance 0.073
Feature 8 (s5): importance 0.186
Feature 9 (s6): importance 0.082
