# 1. Import necessary dependencies

In [74]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder

# 2. Create sample Dataset

* Purchase_Probability: Target variable
* Time_Of_Day, Product_Category, Marketing_Campaign, Customer_Segment: Categorical independent variables

In [75]:
# Sample DataFrame
#
# Build a small synthetic dataset in which every column (including the target)
# is drawn independently and uniformly at random from a fixed list of labels.
# A seeded local generator is used so that "Restart Kernel -> Run All"
# reproduces exactly the same rows, and therefore the same selected features
# and Chi-squared scores further down.

SEED = 42        # fixed seed for reproducible sampling
N_SAMPLES = 100  # number of synthetic rows to generate

rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    'Product_Category': rng.choice(['T-shirt', 'Shorts', 'Track Pants'], N_SAMPLES),
    'Time_Of_Day': rng.choice(['Morning', 'Afternoon', 'Evening'], N_SAMPLES),
    'Marketing_Campaign': rng.choice(['Yes', 'No'], N_SAMPLES),
    'Customer_Segment': rng.choice(['Young Adults', 'Middle-Aged', 'Seniors'], N_SAMPLES),
    'Purchase_Probability': rng.choice(['Low', 'Medium', 'High'], N_SAMPLES)  # Target variable
})

print("Original Data:\n")
data

Original Data:



Unnamed: 0,Product_Category,Time_Of_Day,Marketing_Campaign,Customer_Segment,Purchase_Probability
0,Shorts,Afternoon,Yes,Seniors,Medium
1,T-shirt,Morning,No,Middle-Aged,Low
2,Shorts,Afternoon,Yes,Middle-Aged,High
3,Shorts,Afternoon,No,Middle-Aged,Medium
4,Shorts,Evening,No,Young Adults,Low
...,...,...,...,...,...
95,Track Pants,Afternoon,No,Young Adults,Medium
96,Shorts,Evening,Yes,Middle-Aged,Medium
97,Track Pants,Evening,No,Young Adults,Low
98,Shorts,Morning,No,Seniors,High


# 3. Encoding categorical features and target variable using Label encoding

We use LabelEncoder to convert all categorical features and the target variable into integer codes. This is necessary because scikit-learn's `chi2` scoring function accepts only numeric, non-negative input.

In [76]:
# Encode categorical features and target variable
#
# Every column of `data` is replaced in place by integer codes produced by its
# own LabelEncoder. The fitted encoders are kept in `label_encoders`, keyed by
# column name, so the codes can be mapped back to the original labels later
# via `label_encoders[col].classes_` or `.inverse_transform(...)`.

label_encoders = {}
for col in data.columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder  # retain the fitted encoder for decoding

print("\nEncoded Data:\n")
data


Encoded Data:



Unnamed: 0,Product_Category,Time_Of_Day,Marketing_Campaign,Customer_Segment,Purchase_Probability
0,0,0,1,1,2
1,1,2,0,0,1
2,0,0,1,0,0
3,0,0,0,0,2
4,0,1,0,2,1
...,...,...,...,...,...
95,2,0,0,2,2
96,0,1,1,0,2
97,2,1,0,2,1
98,0,2,0,1,0


# 4. Separate features and target

Separate the features (X) and the target variable (y)

In [77]:
# Split the encoded frame into the target series (y) and the feature matrix (X).
# `data` itself is left untouched: `drop` returns a new frame.

target_column = 'Purchase_Probability'
y = data[target_column]
X = data.drop(columns=target_column)

In [79]:
# Preview the first five rows of the feature matrix.
X.head(5)

Unnamed: 0,Product_Category,Time_Of_Day,Marketing_Campaign,Customer_Segment
0,0,0,1,1
1,1,2,0,0
2,0,0,1,0
3,0,0,0,0
4,0,1,0,2


In [80]:
# Preview the first five values of the target series.
y.head(5)

Unnamed: 0,Purchase_Probability
0,2
1,1
2,0
3,2
4,1


# 5. Apply Chi-square test for feature selection

* We use SelectKBest with the chi2 score function to perform feature selection.

* k is set to 3, meaning we want to select the top 3 features.

* fit_transform applies the Chi-square test and transforms the data to include only the selected features.

* We use selector.get_support(indices=True) to get the indices of the selected features.

* We use these indices to get the names of the selected features from the original DataFrame's columns.

* We print the names of the selected features.

In [81]:
# Rank the features with the Chi-square score and keep the k best ones.
#
# NOTE(review): scikit-learn's chi2 is documented for non-negative features
# such as booleans or frequencies. The label codes in X are arbitrary integers,
# so the scores depend on how categories happen to be numbered; one-hot
# encoding the features first may be more appropriate - confirm.

k = 3  # number of top-scoring features to keep
selector = SelectKBest(chi2, k=k)
selector.fit(X, y)
X_new = selector.transform(X)  # array holding only the k selected columns

# Map the positions of the kept columns back to their column names
selected_features_indices = selector.get_support(indices=True)
selected_features_names = X.columns[selected_features_indices]

print("\nTop", k, "Features (Chi-square):\n")
list(selected_features_names)


Top 3 Features (Chi-square):



['Product_Category', 'Time_Of_Day', 'Customer_Segment']

We also print the Chi-squared scores for each feature, sorted in descending order, to see the relative importance of each feature.

In [82]:
# Show the Chi-squared score of every feature, highest first.
# chi2 returns a (statistics, p-values) pair; only the statistics are tabulated.
chi2_scores, chi2_p_values = chi2(X, y)
feature_scores = pd.Series(data=chi2_scores, index=X.columns)
print("\nChi-squared Scores:\n")
feature_scores.sort_values(ascending=False)


Chi-squared Scores:



Unnamed: 0,0
Time_Of_Day,3.742112
Product_Category,0.519228
Customer_Segment,0.413408
Marketing_Campaign,0.262582


# Note

Key Points:

1. Categorical Data: Chi-square is specifically designed for categorical features and categorical targets.
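2. Label Encoding: scikit-learn's chi2 needs numerical input, so categorical data must first be converted to numbers using techniques like label encoding. Keep in mind that label codes assign arbitrary integers to nominal categories and chi2 treats feature values like frequencies, so one-hot encoding is generally the safer choice for nominal features.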
3. SelectKBest: This scikit-learn class provides a convenient way to perform feature selection using various score functions.
4. chi2: This score function calculates the Chi-squared statistic between each feature and the target.
5. K Value: The k value determines the number of features to select.
6. Interpretation: The Chi-squared scores indicate the strength of the relationship between each feature and the target. Higher scores indicate stronger relationships.
7. Non-negative values: Chi-squared requires non-negative values. After label encoding, the values will be non-negative.