In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/100_days_of_Deep_Learning/ML-PROJECTS/23-Ecommerce-Fraud-Transaction-Detection

/content/drive/MyDrive/100_days_of_Deep_Learning/ML-PROJECTS/23-Ecommerce-Fraud-Transaction-Detection


In [None]:
# List the working directory to confirm the dataset file is present.
!ls

1.ipynb  2.ipynb  Transaction_Data.csv


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import random

# One seed shared by the global random number generators seeded below,
# so that re-running the notebook repeats the same random draws.
seed_value = 42

random.seed(seed_value)     # Python's built-in `random` module
np.random.seed(seed_value)  # NumPy's global generator

In [None]:
# Load the raw transactions CSV into a DataFrame (relative path: depends on the
# working directory set earlier in the notebook).
Transaction_Data = pd.read_csv('Transaction_Data.csv')

In [None]:
# Preview the first five rows to check column names and value formats.
Transaction_Data.head()

Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
0,c12e07a0-8a06-4c0d-b5cc-04f3af688570,8ca9f102-02a4-4207-ab63-484e83a1bdf0,42.32,2024-03-24 23:42:43,PayPal,electronics,1,40,East Jameshaven,desktop,110.87.246.85,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,0,282,23
1,7d187603-7961-4fce-9827-9698e2b6a201,4d158416-caae-4b09-bd5b-15235deb9129,301.34,2024-01-22 00:53:31,credit card,electronics,3,35,Kingstad,tablet,14.73.104.153,"5230 Stephanie Forge\nCollinsbury, PR 81853","5230 Stephanie Forge\nCollinsbury, PR 81853",0,223,0
2,f2c14f9d-92df-4aaf-8931-ceaf4e63ed72,ccae47b8-75c7-4f5a-aa9e-957deced2137,340.32,2024-01-22 08:06:03,debit card,toys & games,5,29,North Ryan,desktop,67.58.94.93,"195 Cole Oval\nPort Larry, IA 58422","4772 David Stravenue Apt. 447\nVelasquezside, ...",0,360,8
3,e9949bfa-194d-486b-84da-9565fca9e5ce,b04960c0-aeee-4907-b1cd-4819016adcef,95.77,2024-01-16 20:34:53,credit card,electronics,5,45,Kaylaville,mobile,202.122.126.216,"7609 Cynthia Square\nWest Brenda, NV 23016","7609 Cynthia Square\nWest Brenda, NV 23016",0,325,20
4,7362837c-7538-434e-8731-0df713f5f26d,de9d6351-b3a7-4bc7-9a55-8f013eb66928,77.45,2024-01-16 15:47:23,credit card,clothing,5,42,North Edwardborough,desktop,96.77.232.76,"2494 Robert Ramp Suite 313\nRobinsonport, AS 5...","2494 Robert Ramp Suite 313\nRobinsonport, AS 5...",0,116,15


In [None]:
# (rows, columns) of the raw dataset
Transaction_Data.shape

(23634, 16)

In [None]:
# Column dtypes and non-null counts for the raw dataset
Transaction_Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23634 entries, 0 to 23633
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Transaction ID      23634 non-null  object 
 1   Customer ID         23634 non-null  object 
 2   Transaction Amount  23634 non-null  float64
 3   Transaction Date    23634 non-null  object 
 4   Payment Method      23634 non-null  object 
 5   Product Category    23634 non-null  object 
 6   Quantity            23634 non-null  int64  
 7   Customer Age        23634 non-null  int64  
 8   Customer Location   23634 non-null  object 
 9   Device Used         23634 non-null  object 
 10  IP Address          23634 non-null  object 
 11  Shipping Address    23634 non-null  object 
 12  Billing Address     23634 non-null  object 
 13  Is Fraudulent       23634 non-null  int64  
 14  Account Age Days    23634 non-null  int64  
 15  Transaction Hour    23634 non-null  int64  
dtypes: f

In [None]:
# Count the missing values in each column
Transaction_Data.isnull().sum()

Unnamed: 0,0
Transaction ID,0
Customer ID,0
Transaction Amount,0
Transaction Date,0
Payment Method,0
Product Category,0
Quantity,0
Customer Age,0
Customer Location,0
Device Used,0


In [None]:
# Statistical measures of the data
Transaction_Data.describe()

# With no arguments this covers only the numerical columns and reports count, mean, std, min, 25%, 50%, 75%, max.

Unnamed: 0,Transaction Amount,Quantity,Customer Age,Is Fraudulent,Account Age Days,Transaction Hour
count,23634.0,23634.0,23634.0,23634.0,23634.0,23634.0
mean,229.367099,3.00055,34.56021,0.051705,178.660531,11.266015
std,282.046669,1.419663,10.009471,0.221436,107.388682,6.980659
min,10.0,1.0,-2.0,0.0,1.0,0.0
25%,69.07,2.0,28.0,0.0,84.0,5.0
50%,151.415,3.0,35.0,0.0,178.0,11.0
75%,296.1275,4.0,41.0,0.0,272.0,17.0
max,9716.5,5.0,73.0,1.0,365.0,23.0


In [None]:
Transaction_Data.describe(include='all')

# include='all' covers every column regardless of its type.
# Categorical columns report: count, unique, top, freq.

Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour
count,23634,23634,23634.0,23634,23634,23634,23634.0,23634.0,23634,23634,23634,23634,23634,23634.0,23634.0,23634.0
unique,23634,23634,,23607,4,5,,,14868,3,23634,23634,23634,,,
top,23e3c107-f2fc-48c2-abbc-7b809bf6f102,d8d7a64e-8419-4421-910a-a7cf709a900b,,2024-01-06 14:38:19,debit card,home & garden,,,North Michael,desktop,116.188.254.162,"289 Adams Wells\nWest Joeltown, LA 69190","289 Adams Wells\nWest Joeltown, LA 69190",,,
freq,1,1,,2,5952,4786,,,30,7923,1,1,1,,,
mean,,,229.367099,,,,3.00055,34.56021,,,,,,0.051705,178.660531,11.266015
std,,,282.046669,,,,1.419663,10.009471,,,,,,0.221436,107.388682,6.980659
min,,,10.0,,,,1.0,-2.0,,,,,,0.0,1.0,0.0
25%,,,69.07,,,,2.0,28.0,,,,,,0.0,84.0,5.0
50%,,,151.415,,,,3.0,35.0,,,,,,0.0,178.0,11.0
75%,,,296.1275,,,,4.0,41.0,,,,,,0.0,272.0,17.0


In [None]:
# Count rows that are exact duplicates of an earlier row
print(f"Duplicate Rows: {Transaction_Data.duplicated().sum()}")

Duplicate Rows: 0


In [None]:
# Distinct values per column: separates identifier-like columns from low-cardinality categoricals
print(Transaction_Data.nunique())

Transaction ID        23634
Customer ID           23634
Transaction Amount    18375
Transaction Date      23607
Payment Method            4
Product Category          5
Quantity                  5
Customer Age             74
Customer Location     14868
Device Used               3
IP Address            23634
Shipping Address      23634
Billing Address       23634
Is Fraudulent             2
Account Age Days        365
Transaction Hour         24
dtype: int64


### **Feature Engineering**

### **Creating New Features**

In [None]:
# Work on a copy so the raw frame stays untouched by the feature engineering below.
df = Transaction_Data.copy()

In [None]:
# Shape of the working copy before any engineered columns are added
df.shape

(23634, 16)

In [None]:
# Parse the timestamp column once; the calendar features below read it through .dt.
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
txn_when = df['Transaction Date'].dt
txn_amount = df['Transaction Amount']
txn_hour = df['Transaction Hour']

# Feature engineering - first batch (14 new columns)

# Calendar features
df['Transaction_Day'] = txn_when.day
df['Transaction_Month'] = txn_when.month
df['Transaction_Weekday'] = txn_when.weekday
df['Is_Weekend'] = (df['Transaction_Weekday'] >= 5).astype(int)  # weekday codes 5 and 6

# Amount above the dataset-wide median
df['Is_High_Amount'] = txn_amount.gt(txn_amount.median()).astype(int)

# Age buckets. NOTE(review): describe() reports a minimum 'Customer Age' of -2;
# ages at or below the first bin edge (0) fall outside every bin and come out as NaN.
age_edges = [0, 25, 45, 65, 100]
age_names = ['Young', 'Adult', 'Mid-age', 'Senior']
df['Customer_Age_Group'] = pd.cut(df['Customer Age'], bins=age_edges, labels=age_names)

# Amount / account features
df['Total_Value'] = txn_amount * df['Quantity']
df['Log_Amount'] = np.log1p(txn_amount)
df['Account_Age_Years'] = df['Account Age Days'] / 365
df['Is_Night_Transaction'] = ((txn_hour < 6) | (txn_hour > 22)).astype(int)
df['Same_Shipping_Billing'] = df['Shipping Address'].eq(df['Billing Address']).astype(int)
df['Location_First_Letter'] = df['Customer Location'].str.get(0)
# +1 in the denominator avoids dividing by zero for a zero-day-old account
df['Transaction_Amount_Per_Account_Age'] = txn_amount / (df['Account Age Days'] + 1)

# Hand-weighted heuristic score built from three binary flags
# (night transaction, above-median amount, mobile device).
df['Fraud_Risk_Score'] = (
    df['Is_Night_Transaction'] * 1.5
    + df['Is_High_Amount'] * 2
    + (df['Device Used'] == 'mobile').astype(int) * 1.2
)

In [None]:
# Shape after the first batch of engineered columns
df.shape

(23634, 30)

In [None]:
# Feature engineering - second batch.
# Every step assigns columns on `df` (no merge), so the cell can be re-run safely.
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'])
df['Log_Amount'] = np.log1p(df['Transaction Amount'])

# Cyclical encoding of the hour, so that 23:00 and 00:00 end up close together
hour_angle = 2 * np.pi * df['Transaction Hour'] / 24
df['Hour_sin'] = np.sin(hour_angle)
df['Hour_cos'] = np.cos(hour_angle)

# Per-customer stats (one row per customer), then broadcast back to each transaction.
# Mapping by 'Customer ID' instead of merging avoids duplicated *_x / *_y columns on re-run.
customer_stats = df.groupby('Customer ID').agg(
    Customer_Total_Transactions=('Transaction ID', 'count'),
    Customer_Avg_Transaction_Amount=('Transaction Amount', 'mean'),
    Customer_Avg_Quantity=('Quantity', 'mean'),
)
for stat_col in customer_stats.columns:
    df[stat_col] = df['Customer ID'].map(customer_stats[stat_col])

df['Deviation_From_Customer_Avg'] = (
    df['Transaction Amount'] - df['Customer_Avg_Transaction_Amount']
)

# High-value flag: amount above the dataset-wide 95th percentile
amount_95 = df['Transaction Amount'].quantile(0.95)
df['Is_High_Value'] = (df['Transaction Amount'] > amount_95).astype(int)

# Z-score of the amount within each customer.
# std is NaN for a customer with a single transaction and 0 when all of a customer's
# amounts are equal; both are replaced by 1 so the z-score is 0 instead of NaN/inf.
std_per_customer = (
    df.groupby('Customer ID')['Transaction Amount']
    .transform('std')
    .fillna(1)
    .replace(0, 1)
)
df['Amount_Z_Score'] = df['Deviation_From_Customer_Avg'] / std_per_customer

# Time-based flags.
# 'Is_Night_Transaction' keeps the same definition (hour < 6 or hour > 22) that
# 'Fraud_Risk_Score' was computed from, so the two columns stay consistent.
txn_hour = df['Transaction Hour']
df['Is_Night_Transaction'] = ((txn_hour < 6) | (txn_hour > 22)).astype(int)
df['Is_Weekend_Transaction'] = (df['Transaction Date'].dt.dayofweek >= 5).astype(int)

# Days between account creation and a fixed reference date (the latest transaction
# day in the data). Using today's date would change this feature on every run.
reference_date = df['Transaction Date'].max().normalize()
account_created = df['Transaction Date'] - pd.to_timedelta(df['Account Age Days'], unit='d')
df['Days_Since_Account_Creation'] = (reference_date - account_created).dt.days

# Address mismatch
df['Billing_Shipping_Mismatch'] = (
    df['Billing Address'] != df['Shipping Address']
).astype(int)

# Fraud rate by device type.
# NOTE(review): this is computed from the target over ALL rows, including rows that
# later land in the test split, so it leaks label information into the features.
# It should be derived from training rows only once the split is made earlier.
device_fraud_rate = df.groupby('Device Used')['Is Fraudulent'].mean().to_dict()
df['Device_Fraud_Rate'] = df['Device Used'].map(device_fraud_rate)



In [None]:
# Shape after the second batch of engineered columns
df.shape

(23634, 42)

In [None]:
# Lift the column display limit so head() shows every engineered column.
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,Transaction ID,Customer ID,Transaction Amount,Transaction Date,Payment Method,Product Category,Quantity,Customer Age,Customer Location,Device Used,IP Address,Shipping Address,Billing Address,Is Fraudulent,Account Age Days,Transaction Hour,Transaction_Day,Transaction_Month,Transaction_Weekday,Is_Weekend,Is_High_Amount,Customer_Age_Group,Total_Value,Log_Amount,Account_Age_Years,Is_Night_Transaction,Same_Shipping_Billing,Location_First_Letter,Transaction_Amount_Per_Account_Age,Fraud_Risk_Score,Hour_sin,Hour_cos,Customer_Total_Transactions,Customer_Avg_Transaction_Amount,Customer_Avg_Quantity,Deviation_From_Customer_Avg,Is_High_Value,Amount_Z_Score,Is_Weekend_Transaction,Days_Since_Account_Creation,Billing_Shipping_Mismatch,Device_Fraud_Rate
0,c12e07a0-8a06-4c0d-b5cc-04f3af688570,8ca9f102-02a4-4207-ab63-484e83a1bdf0,42.32,2024-03-24 23:42:43,PayPal,electronics,1,40,East Jameshaven,desktop,110.87.246.85,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,5399 Rachel Stravenue Suite 718\nNorth Blakebu...,0,282,23,24,3,6,1,0,Adult,42.32,3.768614,0.772603,0,1,E,0.149541,1.5,-0.258819,0.965926,1,42.32,1.0,0.0,0,0.0,1,693,0,0.051243
1,7d187603-7961-4fce-9827-9698e2b6a201,4d158416-caae-4b09-bd5b-15235deb9129,301.34,2024-01-22 00:53:31,credit card,electronics,3,35,Kingstad,tablet,14.73.104.153,"5230 Stephanie Forge\nCollinsbury, PR 81853","5230 Stephanie Forge\nCollinsbury, PR 81853",0,223,0,22,1,0,0,1,Adult,904.02,5.711552,0.610959,1,1,K,1.345268,3.5,0.0,1.0,1,301.34,3.0,0.0,0,0.0,0,696,0,0.049425
2,f2c14f9d-92df-4aaf-8931-ceaf4e63ed72,ccae47b8-75c7-4f5a-aa9e-957deced2137,340.32,2024-01-22 08:06:03,debit card,toys & games,5,29,North Ryan,desktop,67.58.94.93,"195 Cole Oval\nPort Larry, IA 58422","4772 David Stravenue Apt. 447\nVelasquezside, ...",0,360,8,22,1,0,0,1,Adult,1701.6,5.83282,0.986301,0,0,N,0.942715,2.0,0.866025,-0.5,1,340.32,5.0,0.0,0,0.0,0,833,1,0.051243
3,e9949bfa-194d-486b-84da-9565fca9e5ce,b04960c0-aeee-4907-b1cd-4819016adcef,95.77,2024-01-16 20:34:53,credit card,electronics,5,45,Kaylaville,mobile,202.122.126.216,"7609 Cynthia Square\nWest Brenda, NV 23016","7609 Cynthia Square\nWest Brenda, NV 23016",0,325,20,16,1,1,0,0,Adult,478.85,4.572337,0.890411,0,1,K,0.293773,1.2,-0.866025,0.5,1,95.77,5.0,0.0,0,0.0,0,804,0,0.054435
4,7362837c-7538-434e-8731-0df713f5f26d,de9d6351-b3a7-4bc7-9a55-8f013eb66928,77.45,2024-01-16 15:47:23,credit card,clothing,5,42,North Edwardborough,desktop,96.77.232.76,"2494 Robert Ramp Suite 313\nRobinsonport, AS 5...","2494 Robert Ramp Suite 313\nRobinsonport, AS 5...",0,116,15,16,1,1,0,0,Adult,387.25,4.362461,0.317808,0,1,N,0.661966,0.0,-0.707107,-0.707107,1,77.45,5.0,0.0,0,0.0,0,595,0,0.051243


### **Drop Irrelevant Features**

In [None]:
# Drop columns that are not used for modeling: unique identifiers, the raw IP and
# address strings, and the raw timestamp (calendar features were extracted from it).
# errors='ignore' keeps the cell re-runnable: a second run finds the columns already
# gone and simply leaves the frame unchanged instead of raising KeyError.
columns_to_drop = [
    'Transaction ID', 'Customer ID', 'IP Address', 'Transaction Date',
    'Shipping Address', 'Billing Address',
]
df = df.drop(columns=columns_to_drop, errors='ignore')


In [None]:
# Shape after dropping the identifier / address columns
print(df.shape)

(23634, 36)


In [None]:
# Column dtypes and non-null counts after feature engineering
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23634 entries, 0 to 23633
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype   
---  ------                              --------------  -----   
 0   Transaction Amount                  23634 non-null  float64 
 1   Payment Method                      23634 non-null  object  
 2   Product Category                    23634 non-null  object  
 3   Quantity                            23634 non-null  int64   
 4   Customer Age                        23634 non-null  int64   
 5   Customer Location                   23634 non-null  object  
 6   Device Used                         23634 non-null  object  
 7   Is Fraudulent                       23634 non-null  int64   
 8   Account Age Days                    23634 non-null  int64   
 9   Transaction Hour                    23634 non-null  int64   
 10  Transaction_Day                     23634 non-null  int32   
 11  Transaction_Month           

In [None]:
# Distinct values per column; a column with a single distinct value is constant
print(df.nunique())

Transaction Amount                    18375
Payment Method                            4
Product Category                          5
Quantity                                  5
Customer Age                             74
Customer Location                     14868
Device Used                               3
Is Fraudulent                             2
Account Age Days                        365
Transaction Hour                         24
Transaction_Day                          31
Transaction_Month                         4
Transaction_Weekday                       7
Is_Weekend                                2
Is_High_Amount                            2
Customer_Age_Group                        4
Total_Value                           21028
Log_Amount                            18372
Account_Age_Years                       365
Is_Night_Transaction                      2
Same_Shipping_Billing                     2
Location_First_Letter                    26
Transaction_Amount_Per_Account_A

In [None]:
df = df.loc[:, df.nunique() > 1]
# Columns with only one unique value are dropped: they give the model nothing to learn from.
# Constant columns also cause problems for the filter methods used later for feature selection.

In [None]:
# Confirm the reduced shape and the remaining per-column cardinalities
print(df.shape)
print(df.nunique())

(23634, 33)
Transaction Amount                    18375
Payment Method                            4
Product Category                          5
Quantity                                  5
Customer Age                             74
Customer Location                     14868
Device Used                               3
Is Fraudulent                             2
Account Age Days                        365
Transaction Hour                         24
Transaction_Day                          31
Transaction_Month                         4
Transaction_Weekday                       7
Is_Weekend                                2
Is_High_Amount                            2
Customer_Age_Group                        4
Total_Value                           21028
Log_Amount                            18372
Account_Age_Years                       365
Is_Night_Transaction                      2
Same_Shipping_Billing                     2
Location_First_Letter                    26
Transaction_Amount_P

 **Now that we have created enough features, it is time to perform feature selection and pick the top 10 features for model training.**

### **1. Filter Method**

In [None]:
# Before applying filter method it is required to convert all categorical columns into numerical columns ..

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Separate the target vector from the feature matrix.
y = df['Is Fraudulent']
X = df.drop(columns=['Is Fraudulent'])


In [None]:
# Feature matrix and target should have the same number of rows
print(X.shape, y.shape)

(23634, 32) (23634,)


In [None]:
# 1. One-hot encode categorical variables
X_encoded_onehot = pd.get_dummies(X, drop_first=True)
print(X_encoded_onehot.shape)

# One-hot encoding cannot be applied directly to every categorical column: some columns have more than 1000 unique values,
# so the number of columns increases drastically, which leads to the curse of dimensionality.

(23634, 14930)


### **Custom Encoding**

In [None]:
# One-hot encoding for low-cardinality columns (< 10)
# Frequency encoding for high-cardinality columns (>= 10)

def smart_encode(df, target_col=None, cardinality_threshold=10):
    """Encode the categorical columns of ``df`` according to their cardinality.

    Columns of dtype ``object`` or ``category`` are handled as follows:

    * fewer than ``cardinality_threshold`` distinct values: one-hot encoded with
      ``drop_first=True``; the original column is removed and the dummy columns
      are appended at the end of the frame;
    * otherwise: frequency encoded in place, i.e. each value is replaced by the
      number of rows in which it occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    target_col : str or None, default None
        Name of the target column. If it is present in ``df`` it is left
        untouched, even when it is categorical. If it is absent (the usual case
        when only the feature matrix is passed) it has no effect.
    cardinality_threshold : int, default 10
        Distinct-value count below which a column is one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    df_encoded = df.copy()
    categorical_cols = [
        col
        for col in df.select_dtypes(include=['object', 'category']).columns
        if col != target_col  # never encode the label itself
    ]

    one_hot_cols = []
    for col in categorical_cols:
        if df[col].nunique() < cardinality_threshold:
            # Low cardinality -> collect for a single one-hot pass below
            one_hot_cols.append(col)
        else:
            # High cardinality -> replace each value with its row count
            freq_map = df[col].value_counts().to_dict()
            df_encoded[col] = df[col].map(freq_map)

    if one_hot_cols:
        # One call instead of one concat per column; dummy columns are appended
        # in the order of `one_hot_cols`, prefixed with the source column name.
        df_encoded = pd.get_dummies(df_encoded, columns=one_hot_cols, drop_first=True)

    return df_encoded



In [None]:
# Encode the feature matrix with smart_encode (one-hot for low-cardinality columns,
# frequency encoding for the rest) and check the resulting shape.
X_smart_encoded = smart_encode(X, target_col='Is Fraudulent')
print(X_smart_encoded.shape)

(23634, 40)


In [None]:
# Preview the encoded feature matrix
X_smart_encoded.head()

Unnamed: 0,Transaction Amount,Quantity,Customer Age,Customer Location,Account Age Days,Transaction Hour,Transaction_Day,Transaction_Month,Transaction_Weekday,Is_Weekend,Is_High_Amount,Total_Value,Log_Amount,Account_Age_Years,Is_Night_Transaction,Same_Shipping_Billing,Location_First_Letter,Transaction_Amount_Per_Account_Age,Fraud_Risk_Score,Hour_sin,Hour_cos,Customer_Avg_Transaction_Amount,Customer_Avg_Quantity,Is_High_Value,Is_Weekend_Transaction,Days_Since_Account_Creation,Billing_Shipping_Mismatch,Device_Fraud_Rate,Payment Method_bank transfer,Payment Method_credit card,Payment Method_debit card,Product Category_electronics,Product Category_health & beauty,Product Category_home & garden,Product Category_toys & games,Device Used_mobile,Device Used_tablet,Customer_Age_Group_Adult,Customer_Age_Group_Mid-age,Customer_Age_Group_Senior
0,42.32,1,40,1,282,23,24,3,6,1,0,42.32,3.768614,0.772603,0,1,2052,0.149541,1.5,-0.258819,0.965926,42.32,1.0,0,1,693,0,0.051243,False,False,False,True,False,False,False,False,False,True,False,False
1,301.34,3,35,1,223,0,22,1,0,0,1,904.02,5.711552,0.610959,1,1,496,1.345268,3.5,0.0,1.0,301.34,3.0,0,0,696,0,0.049425,False,True,False,True,False,False,False,False,True,True,False,False
2,340.32,5,29,5,360,8,22,1,0,0,1,1701.6,5.83282,0.986301,0,0,3593,0.942715,2.0,0.866025,-0.5,340.32,5.0,0,0,833,1,0.051243,False,False,True,False,False,False,True,False,False,True,False,False
3,95.77,5,45,1,325,20,16,1,1,0,0,478.85,4.572337,0.890411,0,1,496,0.293773,1.2,-0.866025,0.5,95.77,5.0,0,0,804,0,0.054435,False,True,False,True,False,False,False,True,False,True,False,False
4,77.45,5,42,1,116,15,16,1,1,0,0,387.25,4.362461,0.317808,0,1,3593,0.661966,0.0,-0.707107,-0.707107,77.45,5.0,0,0,595,0,0.051243,False,True,False,False,False,False,False,False,False,True,False,False


In [None]:
# Distinct values per column after encoding
print(X_smart_encoded.nunique())

Transaction Amount                    18375
Quantity                                  5
Customer Age                             74
Customer Location                        23
Account Age Days                        365
Transaction Hour                         24
Transaction_Day                          31
Transaction_Month                         4
Transaction_Weekday                       7
Is_Weekend                                2
Is_High_Amount                            2
Total_Value                           21028
Log_Amount                            18372
Account_Age_Years                       365
Is_Night_Transaction                      2
Same_Shipping_Billing                     2
Location_First_Letter                    25
Transaction_Amount_Per_Account_Age    23522
Fraud_Risk_Score                          8
Hour_sin                                 22
Hour_cos                                 22
Customer_Avg_Transaction_Amount       18375
Customer_Avg_Quantity           

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from functools import partial

# Fit every selector on the TRAINING rows only.
# The evaluation cells further down split with stratify=y, test_size=0.2, random_state=42;
# the same arguments are used here, so the rows used for feature selection are exactly
# the rows those models are trained on and the held-out rows never influence the choice.
X_fs_train, y_fs_train = train_test_split(
    X_smart_encoded, y, stratify=y, test_size=0.2, random_state=42
)[::2]
fs_columns = X_smart_encoded.columns

# ===========================
# 1. Variance Threshold
# ===========================
# Any feature with variance below 0.01 is removed; with this method the number of
# output columns cannot be fixed in advance.
var_selector = VarianceThreshold(threshold=0.01)
var_selector.fit(X_fs_train)
selected_features_var = fs_columns[var_selector.get_support()].tolist()

# ===========================
# 2. Pearson Correlation
# ===========================
# Rank features by absolute correlation with the target and keep the top 10.
correlations = X_fs_train.corrwith(y_fs_train).abs()
selected_features_corr = correlations.sort_values(ascending=False).head(10).index.tolist()

# ===========================
# 3. Chi-Square Test
# ===========================
# chi2 requires non-negative values, so the features are min-max scaled first.
X_chi_scaled = MinMaxScaler().fit_transform(X_fs_train)
chi2_selector = SelectKBest(score_func=chi2, k=10)
chi2_selector.fit(X_chi_scaled, y_fs_train)
selected_features_chi2 = fs_columns[chi2_selector.get_support()].tolist()

# ===========================
# 4. ANOVA F-Test
# ===========================
f_selector = SelectKBest(score_func=f_classif, k=10)
f_selector.fit(X_fs_train, y_fs_train)
selected_features_f = fs_columns[f_selector.get_support()].tolist()

# ===========================
# 5. Mutual Information
# ===========================
# random_state is pinned so the selected set does not depend on how much of the
# global NumPy random stream earlier cells have consumed.
mi_selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=10)
mi_selector.fit(X_fs_train, y_fs_train)
selected_features_mi = fs_columns[mi_selector.get_support()].tolist()

# ===========================
# Combine results into a dictionary
# ===========================
filter_results = {
    'VarianceThreshold': selected_features_var,
    'PearsonCorrelation': selected_features_corr,
    'Chi2': selected_features_chi2,
    'ANOVA_F': selected_features_f,
    'Mutual_Info': selected_features_mi
}

# Display results
for method, features in filter_results.items():
    print(f"\nTop features by {method}:")
    print(features)



Top features by VarianceThreshold:
['Transaction Amount', 'Quantity', 'Customer Age', 'Customer Location', 'Account Age Days', 'Transaction Hour', 'Transaction_Day', 'Transaction_Month', 'Transaction_Weekday', 'Is_Weekend', 'Is_High_Amount', 'Total_Value', 'Log_Amount', 'Account_Age_Years', 'Is_Night_Transaction', 'Same_Shipping_Billing', 'Location_First_Letter', 'Transaction_Amount_Per_Account_Age', 'Fraud_Risk_Score', 'Hour_sin', 'Hour_cos', 'Customer_Avg_Transaction_Amount', 'Customer_Avg_Quantity', 'Is_High_Value', 'Is_Weekend_Transaction', 'Days_Since_Account_Creation', 'Billing_Shipping_Mismatch', 'Payment Method_bank transfer', 'Payment Method_credit card', 'Payment Method_debit card', 'Product Category_electronics', 'Product Category_health & beauty', 'Product Category_home & garden', 'Product Category_toys & games', 'Device Used_mobile', 'Device Used_tablet', 'Customer_Age_Group_Adult', 'Customer_Age_Group_Mid-age']

Top features by PearsonCorrelation:
['Transaction Amount', 

In [None]:
# Number of features kept by each filter method, one per line, in the order:
# variance threshold, Pearson correlation, chi-square, ANOVA F, mutual information.
print(
    *map(len, (
        selected_features_var,
        selected_features_corr,
        selected_features_chi2,
        selected_features_f,
        selected_features_mi,
    )),
    sep="\n",
)

38
10
10
10
10


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

**1. Variance Threshold**

In [None]:
# Feature subset under test: the columns that passed the variance threshold.
X = X_smart_encoded.loc[:, selected_features_var]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise -> oversample with SMOTE -> random forest, chained in one pipeline.
pipeline = ImbPipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Train on the training rows, then score the held-out rows
# (hard labels plus the predicted probability of class 1).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Report
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[4390   93]
 [ 178   66]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      4483
           1       0.42      0.27      0.33       244

    accuracy                           0.94      4727
   macro avg       0.69      0.62      0.65      4727
weighted avg       0.93      0.94      0.94      4727


ROC-AUC Score: 0.7904958806127337


### **Pearson's Correlation**

In [None]:
# Feature subset under test: the 10 columns with the highest absolute Pearson correlation.
X = X_smart_encoded.loc[:, selected_features_corr]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise -> oversample with SMOTE -> random forest, chained in one pipeline.
pipeline = ImbPipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Train on the training rows, then score the held-out rows
# (hard labels plus the predicted probability of class 1).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Report
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[4060  423]
 [ 144  100]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.91      0.93      4483
           1       0.19      0.41      0.26       244

    accuracy                           0.88      4727
   macro avg       0.58      0.66      0.60      4727
weighted avg       0.93      0.88      0.90      4727


ROC-AUC Score: 0.7513923272983913


### **Chi-Square Test**

In [None]:
# Feature subset under test: the 10 columns chosen by the chi-square test.
X = X_smart_encoded.loc[:, selected_features_chi2]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise -> oversample with SMOTE -> random forest, chained in one pipeline.
pipeline = ImbPipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Train on the training rows, then score the held-out rows
# (hard labels plus the predicted probability of class 1).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Report
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[4162  321]
 [ 157   87]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.95      4483
           1       0.21      0.36      0.27       244

    accuracy                           0.90      4727
   macro avg       0.59      0.64      0.61      4727
weighted avg       0.92      0.90      0.91      4727


ROC-AUC Score: 0.738559238361314


### **ANOVA F-Test**

In [None]:
# Feature subset under test: the 10 columns chosen by the ANOVA F-test.
X = X_smart_encoded.loc[:, selected_features_f]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise -> oversample with SMOTE -> random forest, chained in one pipeline.
pipeline = ImbPipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Train on the training rows, then score the held-out rows
# (hard labels plus the predicted probability of class 1).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Report
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[4067  416]
 [ 146   98]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.91      0.94      4483
           1       0.19      0.40      0.26       244

    accuracy                           0.88      4727
   macro avg       0.58      0.65      0.60      4727
weighted avg       0.93      0.88      0.90      4727


ROC-AUC Score: 0.7486826371392108


### **Mutual Information Test**

In [None]:
# Feature subset under test: the 10 columns with the highest mutual information.
X = X_smart_encoded.loc[:, selected_features_mi]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise -> oversample with SMOTE -> random forest, chained in one pipeline.
pipeline = ImbPipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Train on the training rows, then score the held-out rows
# (hard labels plus the predicted probability of class 1).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Report
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[4242  241]
 [ 158   86]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.95      0.96      4483
           1       0.26      0.35      0.30       244

    accuracy                           0.92      4727
   macro avg       0.61      0.65      0.63      4727
weighted avg       0.93      0.92      0.92      4727


ROC-AUC Score: 0.7620071088227657


### **Wrapper Method**

**We are done with the filter methods. Now we move on to wrapper methods, which rely on a model (rather than a purely statistical test) to select features.**

In [None]:
'''
1. Forward Selection (SFS)
2. Backward Elimination (SBS)
3. Recursive Feature Elimination (RFE)
4. Exhaustive Feature Selection (Brute Force)
5. Sequential Feature Selection (SFS & SBS)   - these two are exactly the same as 1 and 2


Because wrapper methods use ML models to select the best features, it is very important to do the train/test split before applying them,
and then to use that same train/test data for the main model's training and evaluation, because:

This mimics a real-world scenario: your model only has access to training data.
Prevents data leakage: the test set must remain unseen and untouched during any kind of training or preprocessing.
Maintains generalization integrity of model performance metrics.

'''

'''
A. from mlxtend.feature_selection import SequentialFeatureSelector - This one class provide 4 different types

1. Forward Selection (SFS) → forward=True, floating=False
2. Backward Elimination (SBS) → forward=False, floating=False
3. Floating Forward Selection (SFFS) → forward=True, floating=True
4. Floating Backward Selection (SBFS) → forward=False, floating=True


B. from sklearn.feature_selection import RFE

Eliminates features recursively based on model coefficients or feature importances.
Works only with estimators that expose .coef_ or .feature_importances_.

C. from sklearn.feature_selection import RFECV

Recursive Feature Elimination with Cross-Validation (RFECV)
Same as RFE, but automatically selects the best number of features using cross-validation.

Apart from these two (RFE & RFECV), every other method accepts any model (a regressor for regression problems, a classifier for classification problems).

D. from mlxtend.feature_selection import ExhaustiveFeatureSelector

Tries all possible combinations of features within a range.
Very slow for large feature sets (n_features > 20)


'''

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import pandas as pd

# Inputs prepared by earlier cells: the encoded feature frame and the target `y`.
X = X_smart_encoded

# Stratified hold-out split; every selector below is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Base estimator handed to every wrapper method.
model = LogisticRegression(max_iter=1000)

# Settings shared by the three mlxtend sequential selectors.
_sfs_common = dict(k_features=10, scoring='accuracy', cv=5)

# Build all selectors up front; nothing is fitted until the section below.
sfs_forward = SFS(model, forward=True, floating=False, **_sfs_common)    # 1. forward selection (greedy)
sfs_backward = SFS(model, forward=False, floating=False, **_sfs_common)  # 2. backward elimination
rfe = RFE(estimator=model, n_features_to_select=10)                      # 3. recursive feature elimination
# 4. Brute force over every 10-feature combination.
# ⚠️ Very slow for large numbers of features — use with caution.
efs = EFS(model,
          min_features=10,
          max_features=10,
          scoring='accuracy',
          print_progress=False,
          cv=3)
# 5. Floating forward selection: may drop a previously added feature again.
sfs_floating = SFS(model, forward=True, floating=True, **_sfs_common)

# Method name -> list of the 10 selected column names.
wrapper_results = {}

sfs_forward.fit(X_train, y_train)
wrapper_results['Forward_Selection'] = list(sfs_forward.k_feature_names_)

sfs_backward.fit(X_train, y_train)
wrapper_results['Backward_Elimination'] = list(sfs_backward.k_feature_names_)

rfe.fit(X_train, y_train)
rfe_selected = X.columns[rfe.get_support()]  # boolean mask -> column labels
wrapper_results['RFE'] = rfe_selected.tolist()

efs.fit(X_train, y_train)
wrapper_results['Exhaustive'] = list(efs.best_feature_names_)

sfs_floating.fit(X_train, y_train)
wrapper_results['SFS_Floating'] = list(sfs_floating.k_feature_names_)

# One column per method; wrapping each list in a Series lets pandas align them by position.
result_df = pd.DataFrame({method: pd.Series(columns) for method, columns in wrapper_results.items()})
print(result_df)


In [None]:
# Same comparison as the previous cell, but with an explicit StratifiedKFold strategy and a cross-validated accuracy for each selected subset.

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import pandas as pd
import numpy as np

# Assume X_smart_encoded and y are already defined
X = X_smart_encoded

# Hold out a test set BEFORE running any wrapper method. Fitting the selectors
# on the full data and then scoring on those same rows leaks information and
# inflates the reported accuracy. Same split parameters as the previous cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Estimator
model = LogisticRegression(max_iter=1000)
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Dictionaries to store results (method name -> value)
wrapper_results = {}   # selected feature names
cv_scores = {}         # mean CV accuracy on the training split
test_scores = {}       # accuracy on the untouched hold-out split


def _record_subset(method, features):
    """Store the feature subset chosen by `method` and score it.

    Fills `wrapper_results`, `cv_scores` and `test_scores` under the key
    `method`.

    Parameters
    ----------
    method : str
        Label of the wrapper method (used as the dictionary key).
    features : list of str
        Column names selected by that method.
    """
    wrapper_results[method] = features
    # CV on the training split only. Still somewhat optimistic, because the
    # selector searched these same rows.
    cv_scores[method] = cross_val_score(
        model, X_train[features], y_train, cv=cv_strategy, scoring='accuracy'
    ).mean()
    # Score on the hold-out split, which the selector never saw.
    # Fit a clone so the shared `model` object stays unfitted.
    holdout_model = clone(model).fit(X_train[features], y_train)
    test_scores[method] = holdout_model.score(X_test[features], y_test)


# 1. Forward Selection (greedy)
sfs_forward = SFS(model,
                  k_features=10,
                  forward=True,
                  floating=False,
                  scoring='accuracy',
                  cv=cv_strategy)
sfs_forward.fit(X_train, y_train)
forward_features = list(sfs_forward.k_feature_names_)
_record_subset('Forward_Selection', forward_features)

# 2. Backward Elimination
sfs_backward = SFS(model,
                   k_features=10,
                   forward=False,
                   floating=False,
                   scoring='accuracy',
                   cv=cv_strategy)
sfs_backward.fit(X_train, y_train)
backward_features = list(sfs_backward.k_feature_names_)
_record_subset('Backward_Elimination', backward_features)

# 3. Recursive Feature Elimination (RFE)
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X_train, y_train)
rfe_selected = X_train.columns[rfe.support_].tolist()
_record_subset('RFE', rfe_selected)

# 4. Exhaustive Feature Selector (Brute Force)
# ⚠️ VERY slow for many features — be cautious.
# cv=3 (instead of cv_strategy) is kept from the original cell.
efs = EFS(model,
          min_features=10,
          max_features=10,
          scoring='accuracy',
          print_progress=False,
          cv=3)
efs.fit(X_train, y_train)
exhaustive_features = list(efs.best_feature_names_)
_record_subset('Exhaustive', exhaustive_features)

# 5. Sequential Floating Forward Selection
sfs_floating = SFS(model,
                   k_features=10,
                   forward=True,
                   floating=True,
                   scoring='accuracy',
                   cv=cv_strategy)
sfs_floating.fit(X_train, y_train)
floating_features = list(sfs_floating.k_feature_names_)
_record_subset('SFS_Floating', floating_features)

# Convert feature subsets to DataFrame (one column per method)
result_df = pd.DataFrame({method: pd.Series(features) for method, features in wrapper_results.items()})

# Print feature comparison
print("\nSelected Features by Method:")
print(result_df)

# Print cross-validated performance (training split)
print("\nCross-validated Accuracy:")
for method, acc in cv_scores.items():
    print(f"{method}: {acc:.4f}")

# Print performance on the hold-out split
print("\nHeld-out Test Accuracy:")
for method, acc in test_scores.items():
    print(f"{method}: {acc:.4f}")


In [None]:
# RFECV automatically selects the best number of features.
# Relies on `model`, `X_train`, `y_train` and `wrapper_results` from the cells above.

from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

# Define cross-validation strategy
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# RFECV automatically selects the optimal number of features
rfecv = RFECV(estimator=model,
              step=1,
              cv=cv_strategy,
              scoring='accuracy',  # or another metric like 'f1', 'roc_auc'
              min_features_to_select=1)

# Fit RFECV on training data
rfecv.fit(X_train, y_train)

# Get selected features
rfecv_selected = X_train.columns[rfecv.support_]
wrapper_results['RFECV'] = rfecv_selected.tolist()

# (Optional) View the optimal number of features
print(f"Optimal number of features: {rfecv.n_features_}")

# Mean CV score for each candidate subset size.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# read `cv_results_` when it exists and fall back only on old versions.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

# (Optional) Plot performance vs number of features.
# The x-axis 1..n is valid because step=1 and min_features_to_select=1 above.
plt.figure(figsize=(8, 5))
plt.title('RFECV - Accuracy vs Number of Selected Features')
plt.xlabel('Number of Features Selected')
plt.ylabel('Cross-Validation Accuracy')
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.grid(True)
plt.show()
