# 1. Import necessary dependencies

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# 2. Create sample Dataset

In [13]:
# Sample DataFrame (Men's Sports Apparel - Classification Example)

# Seed NumPy's global RNG so the synthetic dataset (and every result derived
# from it further down) is identical on each Restart & Run All.
np.random.seed(42)

# All columns, including the target, are drawn independently at random.
data = pd.DataFrame({
    'Hours_Spent_Browsing': np.random.randint(1, 10, 100),
    'Number_Of_Visits': np.random.randint(1, 5, 100),
    'Discount_Offered': np.random.uniform(0, 0.5, 100),
    'Customer_Rating': np.random.uniform(1, 5, 100),
    'Time_Of_Day': np.random.choice(['Morning', 'Afternoon', 'Evening'], 100),
    'Product_Category': np.random.choice(['T-shirt', 'Shorts', 'Track Pants'], 100),
    'Delivery_Speed': np.random.randint(1, 5, 100),
    'Previous_Purchases': np.random.randint(0, 10, 100),
    'Marketing_Campaign': np.random.choice(['Yes', 'No'], 100),
    'Purchase_Probability': np.random.choice(['Low', 'Medium', 'High'], 100)  # Classification target
})

data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Time_Of_Day,Product_Category,Delivery_Speed,Previous_Purchases,Marketing_Campaign,Purchase_Probability
0,9,4,0.424585,4.786316,Afternoon,Track Pants,2,8,Yes,High
1,2,2,0.392286,4.663326,Afternoon,Track Pants,3,3,Yes,Low
2,3,1,0.083823,2.604983,Morning,Shorts,1,7,No,Medium
3,1,2,0.064764,2.592285,Afternoon,Track Pants,2,2,Yes,Low
4,5,4,0.236726,4.965573,Evening,Track Pants,2,5,No,High
...,...,...,...,...,...,...,...,...,...,...
95,7,4,0.012086,3.593971,Evening,Shorts,3,0,No,High
96,4,4,0.300018,2.771257,Morning,Shorts,1,2,No,Medium
97,6,1,0.484810,3.102849,Evening,Shorts,1,9,No,High
98,1,2,0.438290,1.649503,Afternoon,T-shirt,2,7,Yes,Low


# 3. Convert categorical features to numerical using one-hot encoding

In [14]:
# One-hot encode the categorical feature columns.
# drop_first=True omits the first level of each column; dtype=np.int64 stores
# the indicator columns as 0/1 integers.
categorical_columns = ['Time_Of_Day', 'Product_Category', 'Marketing_Campaign']
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
    dtype=np.int64,
)
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Probability,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,9,4,0.424585,4.786316,2,8,High,0,0,0,1,1
1,2,2,0.392286,4.663326,3,3,Low,0,0,0,1,1
2,3,1,0.083823,2.604983,1,7,Medium,0,1,0,0,0
3,1,2,0.064764,2.592285,2,2,Low,0,0,0,1,1
4,5,4,0.236726,4.965573,2,5,High,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,4,0.012086,3.593971,3,0,High,1,0,0,0,0
96,4,4,0.300018,2.771257,1,2,Medium,0,1,0,0,0
97,6,1,0.484810,3.102849,1,9,High,1,0,0,0,0
98,1,2,0.438290,1.649503,2,7,Low,0,0,1,0,1


# 4. Encode the target variable (Purchase_Probability) with Label Encoder

In [15]:
# Map each Purchase_Probability label to an integer code and store the codes
# in a new column alongside the original labels.
le = LabelEncoder()
encoded_target = le.fit_transform(data['Purchase_Probability'])
data['Purchase_Probability_Encoded'] = encoded_target
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Probability,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes,Purchase_Probability_Encoded
0,9,4,0.424585,4.786316,2,8,High,0,0,0,1,1,0
1,2,2,0.392286,4.663326,3,3,Low,0,0,0,1,1,1
2,3,1,0.083823,2.604983,1,7,Medium,0,1,0,0,0,2
3,1,2,0.064764,2.592285,2,2,Low,0,0,0,1,1,1
4,5,4,0.236726,4.965573,2,5,High,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,4,0.012086,3.593971,3,0,High,1,0,0,0,0,0
96,4,4,0.300018,2.771257,1,2,Medium,0,1,0,0,0,2
97,6,1,0.484810,3.102849,1,9,High,1,0,0,0,0,0
98,1,2,0.438290,1.649503,2,7,Low,0,0,1,0,1,1


# 5. Separate features and target

Separate the features (X) and the target variable (y)

In [16]:
# X: every column except the two target columns (raw labels and their codes).
# y: the integer-encoded target.
target_columns = ['Purchase_Probability', 'Purchase_Probability_Encoded']
X = data.drop(columns=target_columns)
y = data['Purchase_Probability_Encoded']

In [17]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,9,4,0.424585,4.786316,2,8,0,0,0,1,1
1,2,2,0.392286,4.663326,3,3,0,0,0,1,1
2,3,1,0.083823,2.604983,1,7,0,1,0,0,0
3,1,2,0.064764,2.592285,2,2,0,0,0,1,1
4,5,4,0.236726,4.965573,2,5,1,0,0,1,0


In [18]:
# Preview the first five encoded target values
y.head(n=5)

Unnamed: 0,Purchase_Probability_Encoded
0,0
1,1
2,2
3,1
4,0


# 6. Split data into training and testing sets

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split itself repeatable for a given X and y.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [20]:
# Shapes of the four pieces produced by the split, in the order
# (X_train, X_test, y_train, y_test)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((80, 11), (20, 11), (80,), (20,))

# 7. Define the utility function `recursive_feature_elimination_classification`

* Initializes a LogisticRegression model, which is suitable for classification tasks.
* Creates an RFE object with the LogisticRegression model and the desired number of features (num_features).
* Fits the RFE object to the training data.
* Uses rfe.support_ to get a boolean mask indicating the selected features.
* Uses the mask to get the names of the selected features from the training data's columns.
* Returns the list of selected feature names.

In [21]:
def recursive_feature_elimination_classification(X_train, y_train, X_test=None, y_test=None, num_features=5):
    """
    Select features with recursive feature elimination (RFE) for classification.

    A LogisticRegression estimator is wrapped in RFE and fitted on the training
    data; the names of the columns RFE keeps are returned.

    Args:
        X_train (pd.DataFrame): Training features. Must expose ``.columns``,
            because the selected features are returned by column name.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame, optional): Not used by this function. Accepted
            only so that existing call sites keep working.
        y_test (pd.Series, optional): Not used by this function. Accepted
            only so that existing call sites keep working.
        num_features (int): Number of features to select.

    Returns:
        list: List of selected feature names.
    """
    # 'multi_class' is deliberately not passed: 'auto' is already its default,
    # and passing it explicitly triggers a deprecation warning on recent
    # scikit-learn releases (the parameter is scheduled for removal).
    # NOTE(review): recent scikit-learn also deprecates liblinear for targets
    # with more than two classes -- confirm against the installed version.
    model = LogisticRegression(solver='liblinear')
    rfe = RFE(estimator=model, n_features_to_select=num_features)
    rfe.fit(X_train, y_train)

    # rfe.support_ is a boolean mask over the input columns; use it to pick
    # the names of the features that were kept.
    selected_features = X_train.columns[rfe.support_].tolist()
    return selected_features

# 8. Execute Recursive Feature elimination

Perform RFE for Classification:

* Calls the recursive_feature_elimination_classification function to select features.
* Prints the selected features.

In [25]:
# Run RFE on the training split and keep the five selected feature names

selected_features = recursive_feature_elimination_classification(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    num_features=5,
)
print("\nSelected Features (RFE - Classification):\n")
selected_features


Selected Features (RFE - Classification):



['Discount_Offered',
 'Time_Of_Day_Morning',
 'Product_Category_T-shirt',
 'Product_Category_Track Pants',
 'Marketing_Campaign_Yes']

# Note

Key Points:

* Classification Model: We use LogisticRegression for classification.
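* Accuracy Metric: `accuracy_score` is imported, but this notebook stops at feature selection and does not evaluate a model. To measure performance, fit a classifier on the selected features and call `accuracy_score(y_test, predictions)`.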
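* Solver: `solver='liblinear'` is chosen for LogisticRegression because it is efficient on smaller datasets; it handles multiclass targets with a one-vs-rest scheme. `multi_class='auto'` is already the default, and the `multi_class` parameter is deprecated in recent scikit-learn releases (1.5 and later), so it can be omitted.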
* RFE with Classification: RFE can be used for both regression and classification tasks.
* Label Encoding: Remember to encode your target variable appropriately for classification.
* Computational Cost: RFE can be computationally expensive, especially for large datasets.
* Cross-Validation: Always use cross-validation (for example with `sklearn.feature_selection.RFECV`) to find the optimal number of features and avoid overfitting.