In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Toy customer table: each key is a column holding eight observations.
# 'churn' is the binary label; every other column is a candidate feature.
data = {
    'age': [25, 30, 35, 40, 22, 28, 33, 45],
    'salary': [40000, 50000, 60000, 80000, 30000, 45000, 52000, 90000],
    'account_balance': [1000, 1500, 2000, 2500, 1000, 1400, 1900, 3000],
    'customer_satisfaction_score': [3, 4, 5, 4, 2, 3, 4, 5],
    # Same value in every row, i.e. a zero-variance column.
    'zipcode': [12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345],
    'loyalty_card_usage': [1, 0, 1, 1, 0, 1, 0, 1],
    'churn': [0, 1, 0, 0, 1, 0, 1, 0]
}

# Tabular view of the raw columns
df = pd.DataFrame.from_dict(data)

# Pull the label out as y; X is a copy of everything else (df is unchanged)
label_column = 'churn'
y = df[label_column]
X = df.drop(labels=label_column, axis='columns')

# Step 1: Feature Filtering
# Remove low-variance features. The variance is measured on the raw (unscaled)
# values, so the threshold is relative to each column's own units.
print("Before variance threshold:", X.columns.tolist())
var_thresh = VarianceThreshold(threshold=0.1)  # Threshold can be adjusted
X_filtered = var_thresh.fit_transform(X)

# The transformer returns a plain array by default, so rebuild a DataFrame
# using the names of the columns that survived. Reusing X's row index keeps
# X_filtered aligned with y even when the data carries a non-default index.
X_filtered = pd.DataFrame(
    X_filtered,
    columns=X.columns[var_thresh.get_support()],
    index=X.index,
)
# Safety net: 'zipcode' is constant in this sample and is already removed by
# the variance filter; drop it explicitly in case it ever survives.
X_filtered = X_filtered.drop(columns=['zipcode'], errors='ignore')
print("After filtering:", X_filtered.columns.tolist())

# Step 2: Feature Selection with RFE
# Standardize the filtered features, then let recursive feature elimination
# prune them down using a logistic-regression estimator.
scaler = StandardScaler()
X_scaled = scaler.fit(X_filtered).transform(X_filtered)

model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=3)  # Keep the 3 best features
X_rfe = rfe.fit(X_scaled, y).transform(X_scaled)

# Map the boolean support mask back onto the column names
selected_features_rfe = [
    column
    for column, is_kept in zip(X_filtered.columns, rfe.support_)
    if is_kept
]
print("Selected features (RFE):", selected_features_rfe)

# Step 3: Feature Selection with Lasso
# The L1 penalty drives some coefficients to exactly zero; the features whose
# coefficients stay non-zero are the ones Lasso "selects".
from sklearn.linear_model import Lasso

# alpha is the regularization strength and can be tuned
lasso = Lasso(alpha=0.1).fit(X_scaled, y)

# Positions of the non-zero coefficients -> corresponding column names
nonzero_positions = np.flatnonzero(lasso.coef_)
selected_features_lasso = X_filtered.columns[nonzero_positions].tolist()
print("Selected features (Lasso):", selected_features_lasso)

# Final selection: keep only the features that both RFE and Lasso agreed on
rfe_choice = set(selected_features_rfe)
lasso_choice = set(selected_features_lasso)
final_features = rfe_choice.intersection(lasso_choice)
print("Final selected features:", final_features)

Before variance threshold: ['age', 'salary', 'account_balance', 'customer_satisfaction_score', 'zipcode', 'loyalty_card_usage']
After filtering: ['age', 'salary', 'account_balance', 'customer_satisfaction_score', 'loyalty_card_usage']
Selected features (RFE): ['age', 'salary', 'loyalty_card_usage']
Selected features (Lasso): ['loyalty_card_usage']
Final selected features: {'loyalty_card_usage'}
