# 1. Import necessary dependencies

In [94]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# 2. Create sample Dataset

In [95]:
# Sample DataFrame (Men's Sports Apparel  - Regression Example)
#
# Every column is drawn from one seeded generator, so the dataset -- and every
# number derived from it further down -- is identical on each Restart & Run All.
# Purchase_Amount is sampled independently of the other columns, so the data
# only demonstrates the mechanics of the method; there is no real signal in it.

SEED = 42         # seed for the synthetic-data generator
N_SAMPLES = 100   # number of rows to generate
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    # Generator.integers excludes the upper bound, like np.random.randint.
    'Hours_Spent_Browsing': rng.integers(1, 10, N_SAMPLES),
    'Number_Of_Visits': rng.integers(1, 5, N_SAMPLES),
    'Discount_Offered': rng.uniform(0, 0.5, N_SAMPLES),
    'Customer_Rating': rng.uniform(1, 5, N_SAMPLES),
    'Time_Of_Day': rng.choice(['Morning', 'Afternoon', 'Evening'], N_SAMPLES),
    'Product_Category': rng.choice(['T-shirt', 'Shorts', 'Track Pants'], N_SAMPLES),
    'Delivery_Speed': rng.integers(1, 5, N_SAMPLES),
    'Previous_Purchases': rng.integers(0, 10, N_SAMPLES),
    'Marketing_Campaign': rng.choice(['Yes', 'No'], N_SAMPLES),
    'Purchase_Amount': rng.uniform(100, 5000, N_SAMPLES)  # Target variable
})

data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Time_Of_Day,Product_Category,Delivery_Speed,Previous_Purchases,Marketing_Campaign,Purchase_Amount
0,3,2,0.014764,1.191415,Morning,Track Pants,4,1,Yes,3823.755543
1,1,2,0.469715,2.269567,Afternoon,Track Pants,2,5,No,4447.221606
2,9,2,0.175664,4.329375,Afternoon,Shorts,2,2,Yes,3205.399724
3,8,4,0.413604,3.575380,Evening,Shorts,2,9,No,4595.113991
4,4,3,0.336432,2.864234,Morning,Shorts,3,6,Yes,4756.255918
...,...,...,...,...,...,...,...,...,...,...
95,5,1,0.240295,4.878988,Morning,Shorts,2,5,Yes,114.387324
96,1,3,0.440818,1.632470,Evening,Track Pants,2,3,No,3273.209045
97,8,2,0.042107,3.544117,Morning,T-shirt,3,5,Yes,4108.540775
98,1,3,0.175444,2.794511,Afternoon,T-shirt,4,3,Yes,4882.184007


# 3. Convert categorical features to numerical using one-hot encoding

In [96]:
# One-hot encode the three categorical columns as int64 indicator columns.
# drop_first=True omits the first level of each column, leaving it as the
# implicit baseline.
# NOTE: this rebinds `data`, so the cell is not idempotent -- re-run the
# dataset cell before running it a second time.

data = pd.get_dummies(
    data=data,
    columns=['Time_Of_Day', 'Product_Category', 'Marketing_Campaign'],
    drop_first=True,
    dtype=np.int64,
)
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Amount,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,3,2,0.014764,1.191415,4,1,3823.755543,0,1,0,1,1
1,1,2,0.469715,2.269567,2,5,4447.221606,0,0,0,1,0
2,9,2,0.175664,4.329375,2,2,3205.399724,0,0,0,0,1
3,8,4,0.413604,3.575380,2,9,4595.113991,1,0,0,0,0
4,4,3,0.336432,2.864234,3,6,4756.255918,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,5,1,0.240295,4.878988,2,5,114.387324,0,1,0,0,1
96,1,3,0.440818,1.632470,2,3,3273.209045,1,0,0,1,0
97,8,2,0.042107,3.544117,3,5,4108.540775,0,1,1,0,1
98,1,3,0.175444,2.794511,4,3,4882.184007,0,0,1,0,1


# 4. Separate features and target

Separate the features (X) and the target variable (y)

In [97]:
# The target is Purchase_Amount; the feature matrix is every other column.

y = data['Purchase_Amount']
X = data.drop(columns='Purchase_Amount')

In [98]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,3,2,0.014764,1.191415,4,1,0,1,0,1,1
1,1,2,0.469715,2.269567,2,5,0,0,0,1,0
2,9,2,0.175664,4.329375,2,2,0,0,0,0,1
3,8,4,0.413604,3.57538,2,9,1,0,0,0,0
4,4,3,0.336432,2.864234,3,6,0,1,0,0,1


In [99]:
# Preview the first five values of the target.
y.head(n=5)

Unnamed: 0,Purchase_Amount
0,3823.755543
1,4447.221606
2,3205.399724
3,4595.113991
4,4756.255918


# 5. Split data into training and testing sets

In [100]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split itself repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Shapes of the four pieces of the split, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((80, 11), (20, 11), (80,), (20,))

# 6. Define the utility function for backward elimination

* Initializes selected_features with all features and best_score with a high value.
* Enters a while loop that continues as long as there are features to remove.
* Trains a LinearRegression model using the current selected_features and calculates the MSE.
* If the current score is better than the best_score, it updates best_score and best_features.
* Iterates through the selected_features to find the worst_feature:

For each feature, it creates a temporary list of features without that feature.
Trains a model and calculates the MSE.

If the temporary score is better than the current_best_score, it updates current_best_score and worst_feature.

If a worst_feature is found, it's removed from selected_features, and the process repeats.

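* The loop stops once only a single feature remains. The best single-feature removal is always applied, even when it makes the score worse, so the full elimination path is explored.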
* Returns the best_features.

In [106]:
def backward_elimination(X_train, y_train, X_test, y_test, scoring_function=mean_squared_error,
                         greater_is_better=False):
    """
    Performs greedy backward elimination for feature selection.

    Starting from all columns of ``X_train``, the function repeatedly drops the
    single feature whose removal gives the best held-out score, until only one
    feature is left. A removal is applied even when it makes the score worse;
    the subset with the best score seen anywhere along that path is returned.
    One line is printed per removed feature.

    Note:
        Subsets are compared on ``X_test``/``y_test``, so the score of the
        returned subset on that same data is optimistically biased. Pass a
        validation split here if an unbiased final test score is needed.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): Held-out features used to score each subset.
        y_test (pd.Series): Held-out target used to score each subset.
        scoring_function (function): Called as ``scoring_function(y_true, y_pred)``
            and expected to return a number.
        greater_is_better (bool): Set to True when a larger score means a better
            model (e.g. R^2). The default, False, treats the score as an error
            to minimise, which matches the default ``mean_squared_error``.

    Returns:
        list: Column names of the best-scoring subset found. Ties keep the
        larger (earlier) subset. Empty if ``X_train`` has no columns.
    """
    def evaluate(features):
        """Fit a fresh LinearRegression on `features` and score it on the held-out data."""
        model = LinearRegression()
        model.fit(X_train[features], y_train)
        return scoring_function(y_test, model.predict(X_test[features]))

    def is_better(candidate, reference):
        """Strict comparison in the direction selected by `greater_is_better`."""
        return candidate > reference if greater_is_better else candidate < reference

    # Sentinel that any real score beats, whichever direction is in use.
    worst_possible = float('-inf') if greater_is_better else float('inf')

    selected_features = X_train.columns.tolist()
    best_features = selected_features.copy()
    best_score = worst_possible
    if not selected_features:
        return best_features

    score = evaluate(selected_features)
    while True:
        if is_better(score, best_score):
            best_score = score
            best_features = selected_features.copy()

        # With one feature left there is nothing more to remove.
        if len(selected_features) <= 1:
            break

        worst_feature = None
        current_best_score = worst_possible
        for feature in selected_features:
            temp_features = [f for f in selected_features if f != feature]
            temp_score = evaluate(temp_features)
            if is_better(temp_score, current_best_score):
                current_best_score = temp_score
                worst_feature = feature

        # Compare against None explicitly: a column labelled 0 or '' is falsy
        # but is still a valid feature to remove.
        if worst_feature is None:
            break  # No candidate produced a comparable score (e.g. all NaN)

        selected_features.remove(worst_feature)
        print(f"Removed feature: {worst_feature}, Current score: {current_best_score}")

        # The reduced subset was just scored in the loop above, so reuse that
        # score instead of refitting the same model on the next pass.
        score = current_best_score

    return best_features  # Return the best features found

# 7. Execute and Evaluate Backward elimination

Perform Backward Elimination:

* Calls the backward_elimination function to select features.
* Prints the selected features.

Evaluate Final Model:

* Trains a final LinearRegression model using the selected features.
* Calculates the MSE of the final model on the test set.
* Prints the final model's score.

In [107]:
# Run the selection; the function prints one line per removed feature and
# returns the best-scoring subset it encountered.

selected_features = backward_elimination(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print("\nSelected Features (Backward Elimination):\n")
selected_features

Removed feature: Hours_Spent_Browsing, Current score: 1910253.2671468654
Removed feature: Time_Of_Day_Evening, Current score: 1831087.3910966502
Removed feature: Delivery_Speed, Current score: 1771466.3956312772
Removed feature: Product_Category_Track Pants, Current score: 1760296.1473758966
Removed feature: Marketing_Campaign_Yes, Current score: 1751862.6463471693
Removed feature: Product_Category_T-shirt, Current score: 1754114.6218479325
Removed feature: Discount_Offered, Current score: 1758240.3701282921
Removed feature: Number_Of_Visits, Current score: 1775432.681826835
Removed feature: Previous_Purchases, Current score: 1803443.412477862
Removed feature: Time_Of_Day_Morning, Current score: 1879120.6575322915

Selected Features (Backward Elimination):



['Number_Of_Visits',
 'Discount_Offered',
 'Customer_Rating',
 'Previous_Purchases',
 'Time_Of_Day_Morning',
 'Product_Category_T-shirt']

In [108]:
# Refit a linear model on the selected columns only and report its MSE on the
# test split.

model_final = LinearRegression().fit(X_train[selected_features], y_train)
y_pred_final = model_final.predict(X_test[selected_features])
final_score = mean_squared_error(y_true=y_test, y_pred=y_pred_final)
print("\nFinal Model Score (MSE):")
final_score


Final Model Score (MSE):


1751862.6463471693

# Note

Key Points:

* Wrapper Method: Backward elimination is a wrapper method.
* Scoring Function: You can use different scoring functions.
* Regression Example: This example is for regression. For classification, use different models and metrics.
* Greedy Algorithm: Backward elimination is a greedy algorithm.
* Computational Cost: It can be computationally expensive.
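* Overfitting: This example scores every candidate subset on the test set, so the final MSE is optimistically biased. Use cross-validation or a separate validation split for the selection step to avoid overfitting.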
* Starting Point: Backward elimination starts with all features, which can be a disadvantage if you have many irrelevant features.