<a href="https://colab.research.google.com/github/manishsethi/Automation-tasks/blob/main/Exercises/day-3/Embedded_methods/Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Lab Exercises: Embedded Feature‐Selection Methods
# Use the Breast Cancer dataset (sklearn.datasets.load_breast_cancer) for all exercises. Split once into training and test sets:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load features (DataFrame) and target (Series), then hold out 30% of the rows
# for testing. The fixed random_state keeps the split identical on every run,
# so all exercises below are evaluated on the same test set.
X, y = load_breast_cancer(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# Exercise 4: Tree-Based Importance
- Fit RandomForestClassifier(n_estimators=100, random_state=0).

- Use SelectFromModel to select the top 5 features by impurity importance.

- Retrain and evaluate.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Exercise 4: rank features with a random forest and keep the five best.
# SelectFromModel, LogisticRegression, accuracy_score and np are provided by
# the notebook's first (imports) cell; this cell only adds the forest itself.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

# threshold=-np.inf switches off the importance cut-off, so max_features alone
# decides the outcome: the 5 features with the largest importances are kept.
# prefit=True reuses the forest fitted above instead of refitting it.
sfm_rf = SelectFromModel(rf, prefit=True, max_features=5, threshold=-np.inf)

# get_support() is a boolean mask over the columns, so the selected names come
# back in their original column order, not sorted by importance.
feat_rf = X_train.columns[sfm_rf.get_support()]
assert len(feat_rf) == 5, "expected exactly 5 selected features"

# Retrain a logistic regression on the reduced feature set and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_rf], y_train)
print("RF features:", list(feat_rf))
print("Accuracy (RF):", accuracy_score(y_test, model.predict(X_test[feat_rf])))

# Exercise 5: Gradient-Boosting (XGBoost) Importance
- Fit XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=0).

- Use SelectFromModel to pick the top 5 features.

- Retrain and evaluate.

In [None]:
import xgboost as xgb

# Exercise 5: rank features with a gradient-boosted model and keep the five best.
# SelectFromModel, LogisticRegression, accuracy_score and np are provided by
# the notebook's first (imports) cell.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
# appears to be ignored there (with a warning); it is kept to match the
# exercise statement - confirm against the installed XGBoost version.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=0
)
xgb_clf.fit(X_train, y_train)

# threshold=-np.inf switches off the importance cut-off, so max_features alone
# decides the outcome: the 5 features with the largest importances are kept.
# prefit=True reuses the booster fitted above instead of refitting it.
sfm_xgb = SelectFromModel(xgb_clf, prefit=True, max_features=5, threshold=-np.inf)

# Boolean-mask indexing returns the names in original column order.
feat_xgb = X_train.columns[sfm_xgb.get_support()]
assert len(feat_xgb) == 5, "expected exactly 5 selected features"

# Retrain a logistic regression on the reduced feature set and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_xgb], y_train)
print("XGB features:", list(feat_xgb))
print("Accuracy (XGB):", accuracy_score(y_test, model.predict(X_test[feat_xgb])))


# Exercise 6: Stability Selection (Randomized Lasso)
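- Fit RandomizedLasso(alpha=0.025, random_state=0). Note: RandomizedLasso was removed from scikit-learn in version 0.21; on newer versions, emulate it by fitting Lasso(alpha=0.025) on many random subsamples of the training data and recording how often each feature receives a non-zero coefficient.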

- Select the top 5 features with highest selection frequency.

- Retrain and evaluate.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Exercise 6: stability selection.
# RandomizedLasso was removed from scikit-learn in 0.21, so the procedure is
# reproduced by hand: fit a Lasso on many random subsamples whose features are
# randomly down-weighted, and count how often each coefficient is non-zero.
# np, LogisticRegression and accuracy_score are provided by the notebook's
# first (imports) cell.
N_RESAMPLING = 200      # number of random subsamples to fit
SAMPLE_FRACTION = 0.75  # share of training rows drawn (without replacement) per fit
SCALING = 0.5           # down-weighting factor applied to a random half of the features
TOP_K = 5               # number of features to keep, per the exercise statement


def selection_frequencies(estimator, X, y, n_resampling, sample_fraction, scaling, seed):
    """Return, per feature, the fraction of resampled fits with a non-zero coefficient.

    Parameters
    ----------
    estimator : sparse linear model exposing ``fit`` and ``coef_``; it is
        refitted in place on every resample.
    X : 2-D float array of shape (n_samples, n_features), ideally standardized
        so that a single penalty strength is comparable across features.
    y : 1-D array of targets, aligned with the rows of ``X``.
    n_resampling : number of random subsamples to fit.
    sample_fraction : fraction of rows drawn without replacement per subsample.
    scaling : each feature is multiplied by this factor with probability 0.5
        (and left untouched otherwise) before every fit.
    seed : seed for the resampling random number generator.

    Returns
    -------
    1-D float array of length n_features with values in [0, 1].
    """
    rng = np.random.RandomState(seed)
    n_samples, n_features = X.shape
    # Guard against a subsample of zero rows on tiny inputs.
    n_sub = max(1, int(sample_fraction * n_samples))
    hits = np.zeros(n_features)
    for _ in range(n_resampling):
        rows = rng.choice(n_samples, size=n_sub, replace=False)
        col_scale = np.where(rng.randint(0, 2, size=n_features) == 1, scaling, 1.0)
        estimator.fit(X[rows] * col_scale, y[rows])
        hits += estimator.coef_ != 0
    return hits / n_resampling


# Standardize using training data only, so the L1 penalty treats all features
# on the same scale and nothing from the test split leaks into the selection.
X_train_std = StandardScaler().fit_transform(X_train)
rl = Lasso(alpha=0.025, max_iter=10000)
freq_rl = selection_frequencies(
    rl, X_train_std, y_train.to_numpy(dtype=float),
    n_resampling=N_RESAMPLING, sample_fraction=SAMPLE_FRACTION,
    scaling=SCALING, seed=0,
)

# Highest selection frequency first; the stable sort breaks ties by column order.
top_idx_rl = np.argsort(-freq_rl, kind="stable")[:TOP_K]
feat_rl = X_train.columns[top_idx_rl]

# Retrain a logistic regression on the reduced feature set and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_rl], y_train)
print("Stability features:", list(feat_rl))
print("Accuracy (Stability):", accuracy_score(y_test, model.predict(X_test[feat_rl])))


# Exercise 7: Embedded Specialized Model (Decision Tree)
- Fit DecisionTreeClassifier(max_depth=3, random_state=0).

- Use its feature_importances_ to select top 5.

- Retrain logistic model and evaluate.

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Exercise 7: rank features with a single shallow decision tree.
# LogisticRegression and accuracy_score are provided by the notebook's first
# (imports) cell.
dt = DecisionTreeClassifier(max_depth=3, random_state=0)
dt.fit(X_train, y_train)

# argsort is ascending, so reverse it and take the first five: these are the
# five largest importances, listed most important first.
# A depth-3 tree has at most 7 split nodes, so few features get a non-zero
# importance; if fewer than five do, the remainder are zero-importance ties.
idx_dt = np.argsort(dt.feature_importances_)[::-1][:5]
feat_dt = X_train.columns[idx_dt]

# Retrain a logistic regression on the reduced feature set and score it on the
# held-out test split.
model = LogisticRegression(max_iter=5000).fit(X_train[feat_dt], y_train)
print("DT features:", list(feat_dt))
print("Accuracy (DT):", accuracy_score(y_test, model.predict(X_test[feat_dt])))
