# 📦 Modul 5: Feature Selection & Engineering

Notebook ini akan membahas teknik untuk memilih fitur yang relevan dan membuat fitur baru yang lebih informatif.

## 📥 1. Import Library

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 📊 2. Load Dataset & EDA

In [None]:

# Load scikit-learn's bundled breast cancer dataset and wrap it in pandas
# objects: X holds one named column per feature, y holds the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"])

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(X.shape)
X.head()

## 📈 3. Korelasi antarfitur

In [None]:

# Pairwise correlation between all feature columns, drawn as a heatmap so
# groups of strongly related features stand out visually.
corr_matrix = X.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

## 📌 4. Feature Selection - Chi-Square

In [None]:

# The chi-square score needs non-negative inputs, so every feature is first
# rescaled with MinMaxScaler before scoring.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

# Keep the 10 features with the highest chi-square score against the target.
selector = SelectKBest(chi2, k=10)
selector.fit(X_scaled, y)
X_kbest = selector.transform(X_scaled)

# Map the boolean support mask back to the original column names.
selected_features = list(X.columns[selector.get_support()])
print("Fitur terbaik (Chi-Square):", selected_features)

## 🌲 5. Feature Importance - Random Forest

In [None]:

# Fit a random forest on all features and rank them by the importances the
# fitted model exposes. The seed is fixed so the ranking (and the chart) is
# reproducible from one run of the notebook to the next.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns)

# nlargest() returns the values in descending order, but a horizontal bar
# chart draws the first row at the bottom; re-sorting ascending puts the most
# important feature at the top of the plot.
importances.nlargest(10).sort_values().plot(kind='barh')
plt.title("Top 10 Feature Importances")
plt.show()

## 🔁 6. Recursive Feature Elimination (RFE)

In [None]:
# RFE repeatedly drops the feature with the weakest coefficient, so the
# features must share a common scale for the coefficients to be comparable.
# Scaling also lets the logistic-regression solver converge; on the raw
# features it stops at the default iteration limit with a ConvergenceWarning.
X_rfe = MinMaxScaler().fit_transform(X)

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X_rfe, y)

# support_ is a boolean mask over the input columns; map it back to names.
print("Fitur terbaik (RFE):", X.columns[fit.support_].tolist())

## 🛠️ 7. Feature Engineering Sederhana

In [None]:
# Example: build a new feature as the ratio of two existing ones.
# assign() returns a new DataFrame, so X itself is left untouched; the small
# 1e-5 offset keeps the denominator away from zero.
X_fe = X.assign(
    mean_area_ratio=X['mean area'] / (X['mean radius'] + 1e-5)
)

# Preview the two source columns next to the derived ratio.
X_fe[['mean area', 'mean radius', 'mean_area_ratio']].head()