# 1. Import necessary dependencies

In [68]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 2. Create sample Dataset

1. Purchase_Probability : Target Variable
2. Time_Of_Day , Product_Category , Marketing_Campaign : Categorical Independent Variable
3. Others are numerical independent variables

In [69]:
# 1. Create a Sample Classification Dataset

data = pd.DataFrame({
    'Hours_Spent_Browsing': np.random.randint(1, 10, 100),
    'Number_Of_Visits': np.random.randint(1, 5, 100),
    'Discount_Offered': np.random.uniform(0, 0.5, 100),
    'Customer_Rating': np.random.uniform(1, 5, 100),
    'Time_Of_Day': np.random.choice(['Morning', 'Afternoon', 'Evening'], 100),
    'Product_Category': np.random.choice(['T-shirt', 'Shorts', 'Track Pants'], 100),
    'Delivery_Speed': np.random.randint(1, 5, 100),
    'Previous_Purchases': np.random.randint(0, 10, 100),
    'Marketing_Campaign': np.random.choice(['Yes', 'No'], 100),
    'Purchase_Probability': np.random.choice(['Low', 'Medium', 'High'], 100)  # Classification target
})

data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Time_Of_Day,Product_Category,Delivery_Speed,Previous_Purchases,Marketing_Campaign,Purchase_Probability
0,8,1,0.376967,2.764440,Evening,T-shirt,4,5,Yes,High
1,1,4,0.433958,1.539806,Evening,Shorts,3,2,No,Low
2,5,2,0.088757,3.479388,Morning,Track Pants,4,7,Yes,High
3,5,1,0.097766,1.723391,Afternoon,Track Pants,3,5,Yes,Low
4,7,1,0.468437,2.809649,Afternoon,Track Pants,1,1,No,Low
...,...,...,...,...,...,...,...,...,...,...
95,7,3,0.429826,1.368788,Morning,Track Pants,1,3,No,Medium
96,7,4,0.434837,1.315521,Afternoon,Shorts,1,3,Yes,High
97,4,1,0.297805,4.645568,Morning,Shorts,4,8,Yes,Medium
98,7,2,0.090610,1.109495,Afternoon,Track Pants,3,0,Yes,High


# 3. Encoding Categorical variables using One hot encoding

Categorical features are one-hot encoded using pd.get_dummies()

In [70]:
# Convert categorical features to numerical using one-hot encoding
data = pd.get_dummies(data, columns=['Time_Of_Day', 'Product_Category', 'Marketing_Campaign'], drop_first=True , dtype=np.int64)
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Probability,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes
0,8,1,0.376967,2.764440,4,5,High,1,0,1,0,1
1,1,4,0.433958,1.539806,3,2,Low,1,0,0,0,0
2,5,2,0.088757,3.479388,4,7,High,0,1,0,1,1
3,5,1,0.097766,1.723391,3,5,Low,0,0,0,1,1
4,7,1,0.468437,2.809649,1,1,Low,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,3,0.429826,1.368788,1,3,Medium,0,1,0,1,0
96,7,4,0.434837,1.315521,1,3,High,0,0,0,0,1
97,4,1,0.297805,4.645568,4,8,Medium,0,1,0,0,1
98,7,2,0.090610,1.109495,3,0,High,0,0,0,1,1


# 4. Encode the target variable with LabelEncoder

The Purchase_Probability target is label-encoded using LabelEncoder to convert it into numerical values (0, 1, 2).

In [71]:
# Encode the target variable (Purchase_Probability)
le = LabelEncoder()
data['Purchase_Probability_Encoded'] = le.fit_transform(data['Purchase_Probability'])
data

Unnamed: 0,Hours_Spent_Browsing,Number_Of_Visits,Discount_Offered,Customer_Rating,Delivery_Speed,Previous_Purchases,Purchase_Probability,Time_Of_Day_Evening,Time_Of_Day_Morning,Product_Category_T-shirt,Product_Category_Track Pants,Marketing_Campaign_Yes,Purchase_Probability_Encoded
0,8,1,0.376967,2.764440,4,5,High,1,0,1,0,1,0
1,1,4,0.433958,1.539806,3,2,Low,1,0,0,0,0,1
2,5,2,0.088757,3.479388,4,7,High,0,1,0,1,1,0
3,5,1,0.097766,1.723391,3,5,Low,0,0,0,1,1,1
4,7,1,0.468437,2.809649,1,1,Low,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,7,3,0.429826,1.368788,1,3,Medium,0,1,0,1,0,2
96,7,4,0.434837,1.315521,1,3,High,0,0,0,0,1,0
97,4,1,0.297805,4.645568,4,8,Medium,0,1,0,0,1,2
98,7,2,0.090610,1.109495,3,0,High,0,0,0,1,1,0


# 5. ANOVA F-value selection for classification

SelectKBest is used with f_classif which is used for ANOVA F-value selection for classification. This is more appropriate for multiclass targets.

In [73]:
#4. ANOVA F-value selection for classification
from sklearn.feature_selection import f_classif
k = 5  # Select top 5 features
# Select only numerical features
numerical_features = data.select_dtypes(include=np.number).drop('Purchase_Probability_Encoded', axis=1)

# Scale the data first
scaler = StandardScaler()
X_scaled = scaler.fit_transform(numerical_features)
y = data['Purchase_Probability_Encoded']
selector_anova = SelectKBest(score_func=f_classif, k=k)
X_new_anova = selector_anova.fit_transform(X_scaled, y)

selected_features_indices_anova = selector_anova.get_support(indices=True)
selected_features_names_anova = numerical_features.columns[selected_features_indices_anova]
print("\nTop", k, "Features (ANOVA F-value):\n")
selected_features_names_anova.tolist()


Top 5 Features (ANOVA F-value):



['Number_Of_Visits',
 'Customer_Rating',
 'Time_Of_Day_Evening',
 'Product_Category_Track Pants',
 'Marketing_Campaign_Yes']