**Keep only useful numeric features based on:**
-   (A) Two-sample tests (group 0 vs. group 1, H0: means are equal): Welch's t-test for continuous columns, two-proportion z-test for binary (0/1) columns
-   (B) VIF analysis to reduce multicollinearity

In [57]:
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.proportion import proportions_ztest


from pathlib import Path
import sys
# Project root = parent of the directory the notebook runs from
# (use .parents[1] instead of .parent if the notebook lives one level deeper).
notebook_dir = Path.cwd().resolve()
PROJECT_ROOT = notebook_dir.parent
# Put the project root first on the import path so project-local modules resolve.
sys.path.insert(0, f"{PROJECT_ROOT}")

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #
# `config` stays None when loading fails; the cells below index into it,
# so a failure reported here must be fixed before continuing.

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
except yaml.YAMLError as exc:
    # A present-but-malformed file is a different problem from a missing one:
    # report it as such instead of claiming the file was not found.
    print(f"Yaml configuration file could not be parsed: {exc}")

In [58]:
# 0) Load the dataset from the CSV path listed in the YAML config (input_data -> file1)
input_csv_path = config['input_data']['file1']
df = pd.read_csv(input_csv_path)

In [59]:
target_col = "Fraud_Label"
assert target_col in df.columns, "Target column Fraud_Label not found."

# Quick overview: dimensions, column names, and class balance of the target
print(f"Data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
class_balance = df[[target_col]].value_counts(normalize=True)
print(class_balance)

Data shape: (50000, 21)
Columns: ['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type', 'Timestamp', 'Account_Balance', 'Device_Type', 'Location', 'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age', 'Transaction_Distance', 'Authentication_Method', 'Risk_Score', 'Is_Weekend', 'Fraud_Label']
Fraud_Label
0              0.67866
1              0.32134
Name: proportion, dtype: float64


In [60]:
# =========================
# Step 1 — Select numeric columns (X_num)
# =========================
# Candidate features are all numeric columns; the target column is excluded.
numeric_cols_all = [
    name
    for name in df.select_dtypes(include=[np.number]).columns
    if name != target_col
]

print("\nNumeric candidates:", numeric_cols_all)

# Median-impute every numeric candidate that contains NaNs so the tests below won't fail.
# Note: this writes the imputed values back into `df`.
cols_with_missing = [name for name in numeric_cols_all if df[name].isna().any()]
for name in cols_with_missing:
    df[name] = df[name].fillna(df[name].median())

# Partition the rows by target class (0 and 1)
group0 = df.loc[df[target_col].eq(0)]
group1 = df.loc[df[target_col].eq(1)]


Numeric candidates: ['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Age', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend']


In [61]:
# =========================
# Step 2 — Two-sample significance tests (group 0 vs. group 1)
# H0: mean(group0) = mean(group1)   (for 0/1 columns: equal proportions)
# If we REJECT H0 (p < alpha), we KEEP the column (groups differ)
# Otherwise, we DROP the column
#   - binary 0/1 columns -> two-proportion z-test (two-sided)
#   - all other columns  -> Welch's t-test (two-sided, unequal variances)
# =========================
alpha = 0.05  # significance level
keep_after_ttest = []
ttest_results = []  # will store: [feature, test_type, mean0, mean1, p_value, decision]

for col in numeric_cols_all:
    # Take the column values per group and drop NaNs just in case
    x0 = pd.to_numeric(group0[col], errors="coerce").dropna()
    x1 = pd.to_numeric(group1[col], errors="coerce").dropna()

    # --- Group means are reported for inspection only (not required for the tests) ---
    mean0 = x0.mean() if len(x0) > 0 else np.nan
    mean1 = x1.mean() if len(x1) > 0 else np.nan

    # --- A column whose observed values are all in {0, 1} is treated as binary ---
    unique_vals = pd.Series(pd.concat([x0, x1], axis=0).unique())
    is_binary = unique_vals.dropna().isin([0, 1]).all()

    # Defaults for the "cannot be tested" cases; overwritten when a test actually runs
    p_value = np.nan
    reject = False

    if is_binary:
        # ----- Two-proportions z-test for binary columns -----
        test_type = "prop_ztest"
        # successes = count of 1s; nobs = total observations in each group
        count = np.array([(x0 == 1).sum(), (x1 == 1).sum()])
        nobs = np.array([len(x0), len(x1)])

        if (nobs < 2).any():
            # too few observations in a group → skip
            decision = "DROP (not enough data)"
        elif count.sum() == 0 or count.sum() == nobs.sum():
            # Column is constant (all 0s or all 1s): the pooled variance is zero and the
            # z statistic would be 0/0. Mirror the zero-variance guard of the Welch branch.
            decision = "DROP (not enough variation/data)"
        else:
            # Two-proportion z-test (two-sided)
            stat, p_value = proportions_ztest(count, nobs)
            reject = (p_value < alpha)
            decision = "KEEP" if reject else "DROP"
    else:
        # ----- Welch's two-sample t-test for continuous columns -----
        test_type = "welch_ttest"
        # With too few values or zero variance in a group the t-test is unstable
        too_small = len(x0) < 5 or len(x1) < 5
        if too_small or x0.std(ddof=1) == 0 or x1.std(ddof=1) == 0:
            decision = "DROP (not enough variation/data)"
        else:
            # Welch's t-test: equal_var=False drops the equal-variance assumption.
            # No `alternative` is passed, so the test is two-sided.
            t_stat, p_value = stats.ttest_ind(x0, x1, equal_var=False)
            reject = (p_value < alpha)
            decision = "KEEP" if reject else "DROP"

    # Record the outcome once, regardless of which test was used
    ttest_results.append([col, test_type, mean0, mean1, p_value, decision])
    if reject:
        keep_after_ttest.append(col)

# Show t-test summary
ttest_df = pd.DataFrame(ttest_results, columns=["feature", "test_type", "mean_group0", "mean_group1", "p_value", "decision"])
print("\nT-test results (first 20):")
print(ttest_df.sort_values("p_value", na_position="last").head(20))
print("\nKept after t-test:", keep_after_ttest)

# If nothing passed (rare), fall back to using all numeric cols to avoid empty set
if len(keep_after_ttest) == 0:
    print("\n[Note] No columns passed the t-test threshold; using all numeric columns as fallback.")
    keep_after_ttest = numeric_cols_all.copy()


T-test results (first 20):
                         feature    test_type   mean_group0   mean_group1  \
6    Failed_Transaction_Count_7d  welch_ttest      1.507353      3.051472   
9                     Risk_Score  welch_ttest      0.425158      0.662904   
4        Daily_Transaction_Count  welch_ttest      7.504877      7.443767   
1                Account_Balance  welch_ttest  50356.472852  50162.264626   
7                       Card_Age  welch_ttest    120.148056    119.687123   
2                IP_Address_Flag   prop_ztest      0.049745      0.051161   
0             Transaction_Amount  welch_ttest     99.281907     99.683678   
3   Previous_Fraudulent_Activity   prop_ztest      0.098547      0.098089   
5      Avg_Transaction_Amount_7d  welch_ttest    255.203530    255.416370   
8           Transaction_Distance  welch_ttest   2499.278762   2498.922109   
10                    Is_Weekend   prop_ztest      0.299620      0.299683   

     p_value decision  
6   0.000000     KEEP  

In [62]:
# =========================
# Step 3 — VIF analysis (on remaining columns)
# We do NOT include the target in the VIF input.
# Steps:
#   - Build X with the kept columns
#   - Add a constant (intercept) for VIF calculation
#   - Compute VIF for each feature
#   - While some feature has VIF above the threshold (e.g., > 5 or > 10; we use 5):
#       drop the single worst feature and recompute.
#     Dropping one at a time matters: two collinear features both show a high VIF,
#     but removing one of them is enough, so a one-pass drop would lose both.
# =========================
vif_threshold = 5.0  # common thresholds are 5 or 10; we choose 5 to be stricter


def _vif_table(features):
    """Compute the VIF of every column of `features`, plus the added intercept.

    Parameters
    ----------
    features : pandas.DataFrame
        Numeric feature columns without missing values.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``VIF``; one row per design-matrix column,
        including the constant column added for the intercept.
    """
    design = sm.add_constant(features, has_constant="add")
    rows = [
        [name, variance_inflation_factor(design.values, idx)]
        for idx, name in enumerate(design.columns)
    ]
    return pd.DataFrame(rows, columns=["feature", "VIF"])


# Build design matrix with the kept columns
X_vif = df[keep_after_ttest].copy()

# If any new NaNs appeared, fill them to avoid VIF errors
for col in X_vif.columns:
    if X_vif[col].isna().any():
        X_vif[col] = X_vif[col].fillna(X_vif[col].median())

# Initial VIF table over all candidate features
vif_df = _vif_table(X_vif)
print("\nVIF table (including constant):")
print(vif_df)

# Filter to features only (exclude constant)
vif_features_df = vif_df[vif_df["feature"] != "const"].copy()

# Iteratively remove the highest-VIF feature until none exceeds the threshold.
# The loop never removes the last remaining feature, so the result is not emptied here.
high_vif = []
remaining = list(X_vif.columns)
current_vif = vif_features_df
while len(remaining) > 1:
    # NaN VIFs compare False against the threshold, so such features are left in place
    offenders = current_vif[current_vif["VIF"] > vif_threshold]
    if offenders.empty:
        break
    worst_feature = offenders.loc[offenders["VIF"].idxmax(), "feature"]
    high_vif.append(worst_feature)
    remaining.remove(worst_feature)
    refreshed = _vif_table(X_vif[remaining])
    current_vif = refreshed[refreshed["feature"] != "const"]

print("\nHigh VIF features (> {}):".format(vif_threshold), high_vif)

# Keep the original column order, minus the features pruned above
final_kept = [c for c in keep_after_ttest if c not in high_vif]

print("\nFinal kept numeric features after t-test and VIF:")
print(final_kept)


VIF table (including constant):
                       feature       VIF
0                        const  6.040628
1  Failed_Transaction_Count_7d  1.000001
2                   Risk_Score  1.000001

High VIF features (> 5.0): []

Final kept numeric features after t-test and VIF:
['Failed_Transaction_Count_7d', 'Risk_Score']


In [63]:
# =========================
# Step 4 — Summary / Next step
# =========================
# 'final_kept' is our cleaned numeric feature list to use it in KNN/scaling.
# Normalise it to a plain list first: this cell rebinds `final_kept` to a Series at the
# end, and `Series + [target_col]` would fail if the cell were run a second time.
kept_cols = list(final_kept)
selected_df = df[kept_cols + [target_col]].copy().reset_index(drop=True)
print("\nSelected data shape:", selected_df.shape)

# Save the final df with kept columns + target_col to a CSV for reuse in other notebooks
dataset_path = config['output_data']['file1']
selected_df.to_csv(dataset_path, index=False, sep=",", encoding="utf-8")

print("Saved selected dataset to:", dataset_path)
display(selected_df.head())

# turn list -> Series with a name so 02-notebook can read the column by name
final_kept = pd.Series(kept_cols, name="kept_numeric_features")

# Save the kept column names to a CSV for reuse in other notebooks
features_path = config['output_data']['file2']
final_kept.to_csv(features_path, index=False, sep=",", encoding="utf-8")

print("Saved kept numeric features to:", features_path)
display(final_kept.head())


Selected data shape: (50000, 3)
Saved kept numeric features to: ../data/clean/selected_fraud_dataset.csv


Unnamed: 0,Failed_Transaction_Count_7d,Risk_Score,Fraud_Label
0,3,0.8494,0
1,4,0.0959,1
2,4,0.84,1
3,4,0.7935,1
4,4,0.3819,1


Saved kept numeric features to: ../data/clean/selected_numeric_features.csv


0    Failed_Transaction_Count_7d
1                     Risk_Score
Name: kept_numeric_features, dtype: object