# 1. Import necessary dependencies

In [83]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 2. Create sample Dataset

In [84]:
# Sample DataFrame (Men's Sports Apparel - Regression Example)

data = pd.DataFrame({
    'Hours_Spent_Browsing': np.random.randint(1, 10, 100),
    'Number_Of_Visits': np.random.randint(1, 5, 100),
    'Discount_Offered': np.random.uniform(0, 0.5, 100),
    'Customer_Rating': np.random.uniform(1, 5, 100),
    'Time_Of_Day': np.random.choice(['Morning', 'Afternoon', 'Evening'], 100),
    'Product_Category': np.random.choice(['T-shirt', 'Shorts', 'Track Pants'], 100),
    'Delivery_Speed': np.random.randint(1, 5, 100),
    'Previous_Purchases': np.random.randint(0, 10, 100),
    'Marketing_Campaign': np.random.choice(['Yes', 'No'], 100),
    'Purchase_Amount': np.random.uniform(100, 5000, 100)  # Target variable
})

data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Time_Of_Day,Product_Category,Delivery_Speed,Previous_Purchases,Marketing_Campaign,Purchase_Amount
0,3,2,0.267413,2.644629,Afternoon,Shorts,4,2,Yes,3878.665447
1,3,4,0.037444,2.408758,Morning,Shorts,2,7,Yes,322.455878
2,5,4,0.388919,2.938373,Afternoon,Track Pants,3,5,Yes,1337.428397
3,6,2,0.273623,1.321547,Afternoon,T-shirt,4,9,No,2575.491910
4,1,4,0.474728,1.096928,Morning,Shorts,2,4,No,4791.532715
...,...,...,...,...,...,...,...,...,...,...
95,9,1,0.420637,4.340098,Evening,Track Pants,3,7,No,4642.417417
96,4,2,0.416996,1.628918,Morning,Shorts,3,4,No,3990.544173
97,6,1,0.157337,1.763367,Afternoon,T-shirt,2,4,No,2256.728567
98,7,2,0.027817,4.730611,Evening,Shorts,3,5,No,4989.904951


# 3. Convert categorical features to numerical using one-hot encoding

In [85]:
# Convert categorical features to numerical using one-hot encoding

data = pd.get_dummies(data, columns=['Time_Of_Day', 'Product_Category', 'Marketing_Campaign'], drop_first=True , dtype=np.int64)
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Amount,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,3,2,0.267413,2.644629,4,2,3878.665447,0,0,0,0,1
1,3,4,0.037444,2.408758,2,7,322.455878,0,1,0,0,1
2,5,4,0.388919,2.938373,3,5,1337.428397,0,0,0,1,1
3,6,2,0.273623,1.321547,4,9,2575.491910,0,0,1,0,0
4,1,4,0.474728,1.096928,2,4,4791.532715,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,9,1,0.420637,4.340098,3,7,4642.417417,1,0,0,1,0
96,4,2,0.416996,1.628918,3,4,3990.544173,0,1,0,0,0
97,6,1,0.157337,1.763367,2,4,2256.728567,0,0,1,0,0
98,7,2,0.027817,4.730611,3,5,4989.904951,1,0,0,0,0


# 4. Separate features and target

Separate the features (X) and the target variable (y)

In [86]:
# Separate features and target

X = data.drop('Purchase_Amount', axis=1)
y = data['Purchase_Amount']

In [87]:
X.head()

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,3,2,0.267413,2.644629,4,2,0,0,0,0,1
1,3,4,0.037444,2.408758,2,7,0,1,0,0,1
2,5,4,0.388919,2.938373,3,5,0,0,0,1,1
3,6,2,0.273623,1.321547,4,9,0,0,1,0,0
4,1,4,0.474728,1.096928,2,4,0,1,0,0,0


In [88]:
y.head()

Unnamed: 0,Purchase_Amount
0,3878.665447
1,322.455878
2,1337.428397
3,2575.49191
4,4791.532715


# 5. Split data into training and testing sets

In [89]:
# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
X_train.shape , X_test.shape , y_train.shape , y_test.shape

((80, 11), (20, 11), (80,), (20,))

# 6. Define the utility function for forward selection

Initializes selected_features as an empty list and best_score with a high value.
Iterates through the available features:

* For each feature, it adds it to the selected_features and trains a LinearRegression model.
* It calculates the model's performance using the mean_squared_error (MSE) scoring function.
*If the score improves (MSE decreases), it updates the best_score and best_feature.
* If a best_feature is found, it's added to selected_features, removed from available_features, and the process repeats.
* The loop stops when no further improvement is found.
* Returns the list of selected_features.

In [91]:
def forward_selection(X_train, y_train, X_test, y_test, scoring_function=mean_squared_error):
    """
    Performs forward selection for feature selection.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): Testing features.
        y_test (pd.Series): Testing target.
        scoring_function (function): Scoring function to evaluate model performance.

    Returns:
        list: List of selected feature names.
    """
    selected_features = []
    best_score = float('inf')  # Initialize with a high value for MSE
    available_features = X_train.columns.tolist()

    while available_features:
        best_feature = None
        current_best_score = best_score

        for feature in available_features:
            temp_features = selected_features + [feature]
            model = LinearRegression()
            model.fit(X_train[temp_features], y_train)
            y_pred = model.predict(X_test[temp_features])
            score = scoring_function(y_test, y_pred)

            if score < current_best_score:
                current_best_score = score
                best_feature = feature

        if best_feature:
            selected_features.append(best_feature)
            available_features.remove(best_feature)
            best_score = current_best_score
            print(f"Added feature: {best_feature}, Current score: {best_score}")
        else:
            break  # No improvement, stop selection

    return selected_features

# 7. Execute and forward selection

Perform Forward Selection:

* Calls the forward_selection function to select features.
* Prints the selected features.

Evaluate Final Model:

* Trains a final LinearRegression model using only the selected features.
* Calculates the MSE of the final model on the test set.
* Prints the final model's score.

In [92]:
# Perform forward selection

selected_features = forward_selection(X_train, y_train, X_test, y_test)
print("\nSelected Features (Forward Selection):\n")
selected_features

Added feature: Product_Category_T-shirt, Current score: 1698724.6176588659
Added feature: Time_Of_Day_Morning, Current score: 1678104.9035466455

Selected Features (Forward Selection):



['Product_Category_T-shirt', 'Time_Of_Day_Morning']

In [93]:
# Evaluate the model with the selected features

model_final = LinearRegression()
model_final.fit(X_train[selected_features], y_train)
y_pred_final = model_final.predict(X_test[selected_features])
final_score = mean_squared_error(y_test, y_pred_final)
print("\nFinal Model Score (MSE):")
final_score


Final Model Score (MSE):


1678104.9035466455

# **Note

Key Points:

* Wrapper Method: Forward selection is a wrapper method because it evaluates feature subsets using a model's performance.
* Scoring Function: The scoring_function parameter allows you to use different metrics (e.g., R-squared, accuracy) to evaluate the model.
* Regression Example: This implementation uses LinearRegression and mean_squared_error for a regression task. For classification, you would use different models and metrics.
* Greedy Algorithm: Forward selection is a greedy algorithm, meaning it makes the locally optimal choice at each step.
* Computational Cost: Forward selection can be computationally expensive for datasets with many features.
* Overfitting: It is possible to overfit the model if you add too many features. Always use cross-validation to find the optimal number of features.