In [1]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Synthetic two-class data set; the requested class proportions are 0.8 / 0.2.
x, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.8, 0.2],
    random_state=42,
)

# Hold out 20% of the rows for evaluation.
# NOTE(review): no `stratify=` argument is passed, so the class ratio of the
# held-out rows is not forced to match the full data set -- consider
# stratify=y for imbalanced targets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Baseline: random forest trained on the untouched features.
model = RandomForestClassifier(n_estimators=101, random_state=42).fit(x_train, y_train)
y_pred = model.predict(x_test)

# Per-class precision / recall / F1 on the held-out rows.
print(
    "\nClassification Report (imbalanced):",
    classification_report(y_test, y_pred),
    sep="\n",
)


Classification Report (imbalanced):
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       157
           1       0.93      0.63      0.75        43

    accuracy                           0.91       200
   macro avg       0.92      0.81      0.85       200
weighted avg       0.91      0.91      0.90       200



**Creating Interaction Features**

In [6]:
# Degree-2 expansion restricted to interaction terms (interaction_only=True),
# with the constant bias column kept (include_bias=True).
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=True,
)

# Learn the feature combinations from the training matrix, then expand it.
x_poly = poly.fit(x_train).transform(x_train)

# Column counts before and after the expansion.
print(
    f"Original features: {x_train.shape[1]}",
    f"Polynomial features: {x_poly.shape[1]}",
    sep="\n",
)

Original features: 20
Polynomial features: 211


**Binning Continuous Features**

In [16]:
# Put the training matrix in a DataFrame with generated column names and
# attach the labels as a `target` column.
feature_names = [f'feature_{i}' for i in range(x_train.shape[1])]
df = pd.DataFrame(x_train, columns=feature_names).assign(target=y_train)

# Cut feature_0 into 5 intervals and give each interval a readable label.
bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
df['feature_0_binned'] = pd.cut(df['feature_0'], bins=5, labels=bin_labels)

# One-hot encode the binned column as 0/1 integers; drop_first=True leaves out
# the indicator for the first category.
df_encoded = pd.get_dummies(
    df,
    columns=['feature_0_binned'],
    drop_first=True,
    dtype=int,
)

df_encoded.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_15,feature_16,feature_17,feature_18,feature_19,target,feature_0_binned_Low,feature_0_binned_Medium,feature_0_binned_High,feature_0_binned_Very High
0,0.503637,-0.544704,-0.469071,1.901766,-0.870643,-0.704015,1.662914,1.291052,-0.167136,-1.047184,...,-1.408904,0.033016,-0.803409,-0.027931,1.412096,0,0,1,0,0
1,-0.697719,0.345153,-1.798215,1.22803,-0.518069,-0.145556,-2.223574,1.243406,-0.386542,-0.599821,...,-0.566705,-0.653463,-0.657135,0.21142,-1.381592,0,1,0,0,0
2,-1.332245,-1.695822,-0.79852,1.118541,-0.201774,1.839238,-1.907222,0.034769,-1.291807,1.028849,...,0.09808,-1.217499,-0.431751,-1.406808,0.419342,1,1,0,0,0
3,-2.418775,-1.150447,-1.646511,1.258761,-0.803825,-0.57386,0.430089,0.542504,-1.044925,0.101327,...,0.473045,0.111088,0.696931,-0.357948,0.047673,0,0,0,0,0
4,-0.178707,0.612874,0.248305,0.299672,-0.588109,-1.441689,0.752483,-2.342478,0.804943,0.195162,...,0.944215,0.624818,1.665252,0.762825,-0.968336,0,0,1,0,0


**Feature Selection**
- Method 1: Univariate feature selection

In [19]:
# Score every feature against the labels with f_classif and keep the best 5.
selector = SelectKBest(f_classif, k=5)
selector.fit(x_train, y_train)

# Training matrix reduced to the selected columns.
x_selected = selector.transform(x_train)

- Method 2: Recursive Feature Elimination

In [20]:
# Random forest used as the ranking estimator for recursive elimination.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Eliminate features until 5 remain, then project the training matrix onto them.
rfe = RFE(rf, n_features_to_select=5)
x_rfe = rfe.fit(x_train, y_train).transform(x_train)

- Method 3: Feature importance from Random Forest

In [22]:
# Fit the forest (the `rf` estimator created in the RFE cell) on all training features.
rf.fit(x_train, y_train)

# One row per feature index with its importance score, highest first.
feature_importance = pd.DataFrame(
    {
        'feature': range(x_train.shape[1]),
        'importance': rf.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Top 5 features:")
print(feature_importance.head(5))

Top 5 features:
    feature  importance
5         5    0.293543
18       18    0.235944
1         1    0.110475
14       14    0.081817
2         2    0.021411
