# 1. Import necessary dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 2. Create sample Dataset

In [2]:
# Sample DataFrame (Men's Sports Apparel - Regression Example)

# Seed NumPy's global RNG so the synthetic data, and therefore every result
# derived from it, is identical after Restart Kernel -> Run All.
np.random.seed(42)

# Every column is drawn independently at random, including the target, so
# this frame only demonstrates the RFE mechanics.
data = pd.DataFrame({
    'Hours_Spent_Browsing': np.random.randint(1, 10, 100),
    'Number_Of_Visits': np.random.randint(1, 5, 100),
    'Discount_Offered': np.random.uniform(0, 0.5, 100),
    'Customer_Rating': np.random.uniform(1, 5, 100),
    'Time_Of_Day': np.random.choice(['Morning', 'Afternoon', 'Evening'], 100),
    'Product_Category': np.random.choice(['T-shirt', 'Shorts', 'Track Pants'], 100),
    'Delivery_Speed': np.random.randint(1, 5, 100),
    'Previous_Purchases': np.random.randint(0, 10, 100),
    'Marketing_Campaign': np.random.choice(['Yes', 'No'], 100),
    'Purchase_Amount': np.random.uniform(100, 5000, 100)  # Target variable
})

data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Time_Of_Day,Product_Category,Delivery_Speed,Previous_Purchases,Marketing_Campaign,Purchase_Amount
0,7,2,0.294408,2.629808,Evening,Shorts,3,0,Yes,2811.522656
1,8,4,0.164118,2.695800,Afternoon,Track Pants,3,7,No,2578.063221
2,1,1,0.423858,3.466369,Afternoon,Track Pants,4,7,Yes,2267.091945
3,2,3,0.220124,4.338865,Evening,Shorts,4,1,Yes,1970.118097
4,6,4,0.135585,2.552127,Evening,T-shirt,4,0,Yes,3269.227439
...,...,...,...,...,...,...,...,...,...,...
95,6,4,0.403131,4.536583,Afternoon,Track Pants,1,2,No,4129.383142
96,3,4,0.232001,1.363595,Evening,Track Pants,2,8,Yes,1456.784669
97,2,1,0.095226,3.980724,Afternoon,Shorts,2,2,No,1796.472949
98,7,1,0.310532,1.134325,Morning,Shorts,1,0,Yes,4830.109850


# 3. Convert categorical features to numerical using one-hot encoding

In [3]:
# One-hot encode the three categorical columns as int64 indicator columns.
# drop_first=True omits the first level of each category, which then acts as
# the reference level.

data = pd.get_dummies(
    data=data,
    columns=['Time_Of_Day', 'Product_Category', 'Marketing_Campaign'],
    drop_first=True,
    dtype=np.int64,
)
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Amount,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,7,2,0.294408,2.629808,3,0,2811.522656,1,0,0,0,1
1,8,4,0.164118,2.695800,3,7,2578.063221,0,0,0,1,0
2,1,1,0.423858,3.466369,4,7,2267.091945,0,0,0,1,1
3,2,3,0.220124,4.338865,4,1,1970.118097,1,0,0,0,1
4,6,4,0.135585,2.552127,4,0,3269.227439,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,6,4,0.403131,4.536583,1,2,4129.383142,0,0,0,1,0
96,3,4,0.232001,1.363595,2,8,1456.784669,1,0,0,1,1
97,2,1,0.095226,3.980724,2,2,1796.472949,0,0,0,0,0
98,7,1,0.310532,1.134325,1,0,4830.109850,0,1,0,0,1


# 4. Separate features and target

Separate the features (X) and the target variable (y)

In [4]:
# Pull out the target column, then keep every remaining column as a feature

y = data['Purchase_Amount']
X = data.drop(columns='Purchase_Amount')

In [5]:
# Preview the first five rows of the feature matrix
X.head()

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,7,2,0.294408,2.629808,3,0,1,0,0,0,1
1,8,4,0.164118,2.6958,3,7,0,0,0,1,0
2,1,1,0.423858,3.466369,4,7,0,0,0,1,1
3,2,3,0.220124,4.338865,4,1,1,0,0,0,1
4,6,4,0.135585,2.552127,4,0,1,0,1,0,1


In [6]:
# Preview the first five values of the target
y.head()

Unnamed: 0,Purchase_Amount
0,2811.522656
1,2578.063221
2,2267.091945
3,1970.118097
4,3269.227439


# 5. Split data into training and testing sets

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Show the shapes of the four splits to confirm the train/test row counts
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((80, 11), (20, 11), (80,), (20,))

# 6. Define the utility function `recursive_feature_elimination`

* Initializes a LinearRegression model.
* Creates an RFE object with the model and the desired number of features (num_features).
* Fits the RFE object to the training data.
* Uses rfe.support_ to get a boolean mask indicating the selected features.
* Uses the mask to get the names of the selected features from the training data's columns.
* Returns the list of selected feature names.

In [9]:
def recursive_feature_elimination(X_train, y_train, X_test=None, y_test=None, num_features=5, estimator=None):
    """
    Select features with recursive feature elimination (RFE).

    An RFE selector is fitted on the training data only, and the names of the
    columns it keeps are returned.

    Args:
        X_train (pd.DataFrame): Training features. The column labels are used
            to name the selected features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame, optional): Ignored. Feature selection does not
            look at the test data; the parameter is kept so existing calls
            that pass it keep working.
        y_test (pd.Series, optional): Ignored, for the same reason as X_test.
        num_features (int): Number of features to select.
        estimator (optional): Unfitted scikit-learn estimator that RFE uses to
            rank features. Defaults to a new LinearRegression().

    Returns:
        list: Names of the selected features, in the order the columns
            appear in X_train.
    """
    # Build the default ranking model per call rather than sharing one instance.
    if estimator is None:
        estimator = LinearRegression()

    rfe = RFE(estimator=estimator, n_features_to_select=num_features)
    rfe.fit(X_train, y_train)

    # rfe.support_ is a boolean mask aligned with X_train's columns.
    selected_features = X_train.columns[rfe.support_].tolist()
    return selected_features

# 7. Execute and Evaluate Recursive Feature Elimination

Perform RFE:

* Calls the recursive_feature_elimination function to select features.
* Prints the selected features.

Evaluate Final Model:

* Trains a final LinearRegression model using the selected features.
* Calculates the MSE of the final model on the test set.
* Prints the final model's score.

In [10]:
# Run RFE and keep the five selected feature names

selected_features = recursive_feature_elimination(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    num_features=5,
)
print("\nSelected Features (RFE):\n")
selected_features


Selected Features (RFE):



['Discount_Offered',
 'Delivery_Speed',
 'Time_Of_Day_Evening',
 'Time_Of_Day_Morning',
 'Product_Category_T-shirt']

In [11]:
# Refit a linear regression on the RFE-selected columns only, then score it
# on the held-out test rows with mean squared error

model_final = LinearRegression().fit(X_train[selected_features], y_train)
y_pred_final = model_final.predict(X_test[selected_features])
final_score = mean_squared_error(y_true=y_test, y_pred=y_pred_final)
print("\nFinal Model Score (MSE):")
final_score


Final Model Score (MSE):


3350234.636306759

# Note

Key Points:

* Wrapper Method: RFE is a wrapper method.
* Estimator: The estimator parameter specifies the model to use for ranking features.
* n_features_to_select: This parameter determines the number of features to select.
* rfe.support_: This attribute of the RFE object indicates which features were selected.
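* Regression Example: This example is for regression. For classification, pass a classifier (for example, LogisticRegression) as the estimator and evaluate with a classification metric instead of MSE.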
* Computational Cost: RFE can be computationally expensive, especially for large datasets.
* Model Dependency: The selected features depend on the model used as the estimator.
* Iterative Process: RFE is an iterative process that removes the least important feature at each step.
* Cross-Validation: It's good practice to use cross-validation with RFE (for example, scikit-learn's RFECV) to find the optimal number of features and avoid overfitting.