# 1. Import necessary dependencies

In [61]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

# 2. Create sample Dataset

1. Purchase_Amount: target variable
2. Time_Of_Day, Product_Category, Marketing_Campaign: categorical independent variables
3. All other columns are numerical independent variables

In [62]:
# 1. Create a Sample Dataset

SEED = 42  # fixed seed so that Restart & Run All reproduces the same sample


def make_sample_data(n_rows=100, seed=SEED):
    """Build a synthetic purchase dataset with reproducible random values.

    Every column, including the target, is drawn independently of the
    others, so any correlation found between a feature and
    'Purchase_Amount' is sampling noise rather than a real relationship.

    Parameters
    ----------
    n_rows : int, default 100
        Number of rows (customers) to generate.
    seed : int or None, default SEED
        Seed for the local random generator. Pass None for a fresh,
        non-reproducible draw.

    Returns
    -------
    pandas.DataFrame
        Six numerical features, three categorical features and the
        'Purchase_Amount' target, in that column order.
    """
    # A local Generator keeps the draw reproducible without touching
    # NumPy's global random state.
    rng = np.random.default_rng(seed)
    return pd.DataFrame({
        'Hours_Spent_Browsing': rng.integers(1, 10, n_rows),   # 1..9 (upper bound exclusive)
        'Number_Of_Visits': rng.integers(1, 5, n_rows),        # 1..4
        'Discount_Offered': rng.uniform(0, 0.5, n_rows),
        'Customer_Rating': rng.uniform(1, 5, n_rows),
        'Time_Of_Day': rng.choice(['Morning', 'Afternoon', 'Evening'], n_rows),
        'Product_Category': rng.choice(['T-shirt', 'Shorts', 'Track Pants'], n_rows),
        'Delivery_Speed': rng.integers(1, 5, n_rows),          # 1..4
        'Previous_Purchases': rng.integers(0, 10, n_rows),     # 0..9
        'Marketing_Campaign': rng.choice(['Yes', 'No'], n_rows),
        'Purchase_Amount': rng.uniform(100, 5000, n_rows)  # Target variable
    })


data = make_sample_data()

data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Time_Of_Day,Product_Category,Delivery_Speed,Previous_Purchases,Marketing_Campaign,Purchase_Amount
0,3,4,0.202364,3.000245,Afternoon,Track Pants,2,1,Yes,4040.217621
1,5,3,0.364581,4.728080,Evening,Track Pants,2,0,Yes,526.160328
2,2,4,0.140739,3.301043,Morning,Shorts,4,0,Yes,2226.297377
3,6,1,0.142450,1.587278,Evening,Shorts,2,9,Yes,170.323226
4,6,2,0.444636,4.014756,Evening,T-shirt,3,8,Yes,4570.955886
...,...,...,...,...,...,...,...,...,...,...
95,2,2,0.426068,4.238263,Afternoon,Shorts,3,5,No,3817.453912
96,4,3,0.394249,2.442962,Morning,Shorts,2,7,Yes,980.330141
97,8,4,0.169576,2.445000,Evening,T-shirt,1,9,Yes,4696.789459
98,6,1,0.153412,4.475802,Afternoon,Track Pants,4,9,No,2737.991363


# 3. Encoding Categorical variables using One hot encoding

In [63]:
# Convert categorical features to numerical using one-hot encoding.
# drop_first=True removes the first level of each feature, so a feature with
# n levels becomes n-1 indicator columns.
categorical_columns = ['Time_Of_Day', 'Product_Category', 'Marketing_Campaign']

# `data` is overwritten below, so after the first run the original categorical
# columns no longer exist. Encoding only the columns that are still present
# makes this cell safe to re-run instead of raising a KeyError.
columns_to_encode = [col for col in categorical_columns if col in data.columns]
if columns_to_encode:
    data = pd.get_dummies(data, columns=columns_to_encode, drop_first=True, dtype=np.int64)
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Amount,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,3,4,0.202364,3.000245,2,1,4040.217621,0,0,0,1,1
1,5,3,0.364581,4.728080,2,0,526.160328,1,0,0,1,1
2,2,4,0.140739,3.301043,4,0,2226.297377,0,1,0,0,1
3,6,1,0.142450,1.587278,2,9,170.323226,1,0,0,0,1
4,6,2,0.444636,4.014756,3,8,4570.955886,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2,0.426068,4.238263,3,5,3817.453912,0,0,0,0,0
96,4,3,0.394249,2.442962,2,7,980.330141,0,1,0,0,1
97,8,4,0.169576,2.445000,1,9,4696.789459,1,0,1,0,1
98,6,1,0.153412,4.475802,4,9,2737.991363,0,0,0,1,0


# 4. Calculate Pearson correlation of Target variable (Purchase_Amount) with all other variables

Calculate Correlation (Pearson's):

1. We use data.corr()['Purchase_Amount'] to calculate the Pearson correlation between each feature and the target variable, Purchase_Amount.
2. We drop the correlation of Purchase_Amount with itself.
3. We print the correlation values.

In [64]:
# Pearson correlation of every column with the target.
# The target's own entry is removed because it is not a feature.

correlation_matrix = data.corr()
correlations = correlation_matrix['Purchase_Amount'].drop(labels='Purchase_Amount')
print("Correlation with Purchase_Amount:\n")
correlations

Correlation with Purchase_Amount:



Unnamed: 0,Purchase_Amount
Hours_Spent_Browsing,-0.053166
Number_Of_Visits,0.04058
Discount_Offered,0.102222
Customer_Rating,0.130759
Delivery_Speed,-0.093778
Previous_Purchases,0.07233
Time_Of_Day_Evening,0.082442
Time_Of_Day_Morning,0.016596
Product_Category_T-shirt,0.149397
Product_Category_Track Pants,-0.056213


# 5. Select top k features that are highly correlated

Select Top K Features (Correlation):

1. We define k (e.g., 5) to select the top k features.
2. We use correlations.abs().nlargest(k).index to get the indices of the top k features based on the absolute correlation values.
3. We print the names of the top k features.

In [65]:
# Filter-method selection: rank features by the magnitude of their
# correlation with the target and keep the k strongest.

k = 5  # number of features to keep
absolute_correlations = correlations.abs()
top_k_features = absolute_correlations.nlargest(n=k).index
print("\nTop", k, "Features (Correlation-based):\n")
list(top_k_features)


Top 5 Features (Correlation-based):



['Product_Category_T-shirt',
 'Customer_Rating',
 'Discount_Offered',
 'Delivery_Speed',
 'Time_Of_Day_Evening']

# 6. Feature Selection using SelectKBest

Using SelectKBest (f_regression):

1. We use SelectKBest from scikit-learn with the f_regression score function, which is suitable for regression tasks.
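2. We scale the data using StandardScaler before applying SelectKBest so that features are on the same scale. (f_regression is derived from the Pearson correlation, which is unaffected by standardization, so this step does not change which features are selected here.)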
3. We fit SelectKBest to the scaled data and the target variable.
4. We use selector.get_support(indices=True) to get the indices of the selected features.
5. We print the names of the selected features.

In [66]:
# Univariate feature selection with SelectKBest and the f_regression score
# (intended for regression targets).

# Target vector and feature matrix: every numeric column except the target.
y = data['Purchase_Amount']
numerical_features = data.select_dtypes(include=np.number).drop(columns='Purchase_Amount')

# Standardize the features before scoring them.
scaler = StandardScaler().fit(numerical_features)
X_scaled = scaler.transform(numerical_features)

# Score each feature against the target and keep the k best.
selector = SelectKBest(score_func=f_regression, k=k).fit(X_scaled, y)
X_new = selector.transform(X_scaled)

# Map the selected column positions back to their names.
selected_features_indices = selector.get_support(indices=True)
selected_features_names = numerical_features.columns[selected_features_indices]
print("\nTop", k, "Features (SelectKBest f_regression):\n")
list(selected_features_names)


Top 5 Features (SelectKBest f_regression):



['Discount_Offered',
 'Customer_Rating',
 'Delivery_Speed',
 'Time_Of_Day_Evening',
 'Product_Category_T-shirt']

# 7. Using Pearson correlation directly with corrwith for feature selection

Using Pearson correlation directly with corrwith:


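1. We use numerical_features.corrwith(y) to compute the Pearson correlation of each feature with the target.
2. We use correlation_values.abs().nlargest(k).index to get the top k features based on absolute correlation.
3. We print the names of the top k features.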

In [67]:
# Feature selection from Pearson correlation computed directly with corrwith:
# correlate each feature column with the target, then keep the k features
# with the largest absolute correlation.

correlation_values = numerical_features.corrwith(y)
absolute_correlation_values = correlation_values.abs()
top_k_features_pearson = absolute_correlation_values.nlargest(n=k).index

print("\nTop", k, "Features (Pearson Correlation directly):\n")
list(top_k_features_pearson)


Top 5 Features (Pearson Correlation directly):



['Product_Category_T-shirt',
 'Customer_Rating',
 'Discount_Offered',
 'Delivery_Speed',
 'Time_Of_Day_Evening']

# Note

Key Points:

1. Correlation: Pearson's correlation measures the linear relationship between two variables.
2. Absolute Correlation: We use the absolute correlation to rank features, as both positive and negative correlations can indicate strong relationships.
3. SelectKBest: This is a convenient scikit-learn class for feature selection using various score functions.
4. f_regression: This score function is used for regression tasks.
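5. Scaling: f_regression scores are derived from the Pearson correlation and do not change when features are standardized, so scaling is optional here; it becomes important for score functions or downstream models that rely on distance or variance.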
6. K Value: The k value determines the number of features to select. You can adjust this based on your needs.
7. Filter Method: This is a filter method because feature selection is done independently of the model training.
8. Regression Task: This example demonstrates feature selection for a regression task. For classification tasks, you would use different score functions (e.g., chi2, f_classif).