**02_knn_with_scaling_and_onehot.ipynb**
- Goal: KNN with simple feature engineering (scaling numeric features + one-hot encoding categorical features)
- Load CSV
- Build numeric and categorical feature lists
- Simple imputation (median/mode)
- StandardScaler for numeric
- OneHotEncoder for categoricals (no Pipeline; we do it step by step to keep it simple)
- Train/test split
- Fit KNN and evaluate

In [28]:
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from pathlib import Path
import sys
# Point to the project root: the parent of the current working directory (use .parents[1] instead of .parent if the notebook is one level deeper)
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

# <-- Imports custom preprocessing functions from the 'lib.feature_engineering' module <-- #

from lib.feature_engineering import (engineer_features)

# <-- Loads YAML configuration to dynamically reference CSV input/output files. <-- #
# `config` stays None when loading fails.

config = None  # <-- Initialize config
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a genuinely missing file is reported as "not found".
    print("Yaml configuration file not found!")
except yaml.YAMLError as exc:
    # A malformed file is a different problem; report it as such instead of
    # mislabelling it. Anything else (e.g. Ctrl-C) is no longer swallowed.
    print(f"Yaml configuration file could not be parsed: {exc}")

In [29]:
# 1) Load the dataset
# Fail early with a readable message if the configuration was not loaded;
# otherwise the lookups below raise an opaque TypeError on None.
if config is None:
    raise RuntimeError(
        "Configuration was not loaded; check that ../config.yaml exists and is valid YAML."
    )

# file1 is the transactions table; file2 is read later in the notebook for its
# "kept_numeric_features" column.
df_fraud_dataset = pd.read_csv(config['input_data']['file1'])
kept_numeric_cols_df = pd.read_csv(config['output_data']['file2'])

print("\ndf_fraud_dataset Shape:", df_fraud_dataset.shape)
print("Columns:", df_fraud_dataset.columns.tolist())
display(df_fraud_dataset.head(3))


df_fraud_dataset Shape: (50000, 21)
Columns: ['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type', 'Timestamp', 'Account_Balance', 'Device_Type', 'Location', 'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age', 'Transaction_Distance', 'Authentication_Method', 'Risk_Score', 'Is_Weekend', 'Fraud_Label']


Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1


In [30]:
# 2) Target
# Name of the label column; stop immediately if the dataset does not have it.
target_col = "Fraud_Label"
if target_col not in df_fraud_dataset.columns:
    raise AssertionError("Target column Fraud_Label not found.")

In [31]:
# 3) Optional: if Timestamp exists, create hour/day-of-week (simple time features)
if "Timestamp" in df_fraud_dataset.columns:
    # Unparseable timestamps become NaT (errors="coerce"), so the derived
    # features are missing for those rows rather than raising.
    df_fraud_dataset["Timestamp"] = pd.to_datetime(df_fraud_dataset["Timestamp"], errors="coerce")
    df_fraud_dataset["tx_hour"] = df_fraud_dataset["Timestamp"].dt.hour
    df_fraud_dataset["tx_dow"] = df_fraud_dataset["Timestamp"].dt.day_name()

    # The preview lives inside the guard: the derived columns only exist when
    # Timestamp does, so displaying them unconditionally would raise KeyError.
    display(df_fraud_dataset[['Timestamp', "tx_hour", "tx_dow"]])

Unnamed: 0,Timestamp,tx_hour,tx_dow
0,2023-08-14 19:30:00,19,Monday
1,2023-06-07 04:01:00,4,Wednesday
2,2023-06-20 15:25:00,15,Tuesday
3,2023-12-07 00:31:00,0,Thursday
4,2023-11-11 23:44:00,23,Saturday
...,...,...,...
49995,2023-01-29 18:38:00,18,Sunday
49996,2023-05-09 08:55:00,8,Tuesday
49997,2023-01-30 19:32:00,19,Monday
49998,2023-03-09 19:47:00,19,Thursday


In [32]:
# Numeric features come from the saved "kept_numeric_features" column.
kept_numeric_cols_list = kept_numeric_cols_df["kept_numeric_features"].to_list()
numeric_preview = kept_numeric_cols_list[:10]

print("Numeric columns to use from filter_numeric_ttest_vif:", numeric_preview, " ... total:", len(kept_numeric_cols_list))

# Every object-dtype column is listed as a categorical candidate.
object_columns = df_fraud_dataset.select_dtypes(include=["object"]).columns
cat_cols_list = object_columns.to_list()
print("\nCategorical columns to use:", cat_cols_list[:10], " ... total:", len(cat_cols_list))

Numeric columns to use from filter_numeric_ttest_vif: ['Failed_Transaction_Count_7d', 'Risk_Score']  ... total: 2

Categorical columns to use: ['Transaction_ID', 'User_ID', 'Transaction_Type', 'Device_Type', 'Location', 'Merchant_Category', 'Card_Type', 'Authentication_Method', 'tx_dow']  ... total: 9


In [33]:
# 4) Choose numeric and categorical features (keep it small and clear)
# Candidates are filtered against the columns actually present in the frame.
candidate_numeric = kept_numeric_cols_list
candidate_categorical = [
    "Transaction_Type",
    "Device_Type",
    "Location",
    "Card_Type",
    "Authentication_Method",
]

available_columns = df_fraud_dataset.columns
num_cols = list(filter(lambda name: name in available_columns, candidate_numeric))
cat_cols = list(filter(lambda name: name in available_columns, candidate_categorical))

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['Failed_Transaction_Count_7d', 'Risk_Score']
Categorical columns: ['Transaction_Type', 'Device_Type', 'Location', 'Card_Type', 'Authentication_Method']


In [34]:
# 5) Drop rows where target is missing (just to be safe)
# Keep only rows with a non-null label, then build the integer target vector.
has_target = df_fraud_dataset[target_col].notna()
df_fraud_dataset = df_fraud_dataset.loc[has_target].copy()
y = df_fraud_dataset[target_col].astype(int)

In [35]:
# 6) Build numeric matrix
# NOTE(review): the imputer and scaler below are fitted on every row, before the
# train/test split that happens later in the notebook, so test-set statistics
# influence the training features. Consider splitting first and fitting on the
# training rows only.
if len(num_cols) > 0:
    X_num = df_fraud_dataset[num_cols].copy()

    # Simple numeric imputation: fill missing values with median
    num_imputer = SimpleImputer(strategy="median")
    X_num_imputed = num_imputer.fit_transform(X_num)

    # Scale numeric features (KNN cares about distances, so scaling helps)
    scaler = StandardScaler()
    X_num_scaled = scaler.fit_transform(X_num_imputed)
else:
    # Mirror the categorical cell: with no numeric columns selected, use an
    # empty (n_rows, 0) block so the later np.hstack still works.
    X_num_scaled = np.empty((len(df_fraud_dataset), 0))

In [36]:
# 7) Build categorical matrix
# Use OneHotEncoder to convert categories to 0/1 columns.
# NOTE(review): like the numeric step, the imputer and encoder are fitted on all
# rows before the train/test split that happens later in the notebook.
if len(cat_cols) > 0:
    # Keep missing values as real NaN while imputing. Casting to str first would
    # turn them into the literal text "nan", which the imputer does not treat as
    # missing, so the most-frequent fill would never be applied.
    X_cat = df_fraud_dataset[cat_cols].astype(object)
    X_cat = X_cat.where(X_cat.notna(), np.nan)
    cat_imputer = SimpleImputer(strategy="most_frequent")
    # After imputation, cast every value to str (kept as an object array) so the
    # encoder sees one uniform type per column.
    X_cat_imputed = cat_imputer.fit_transform(X_cat).astype(str).astype(object)

    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat_ohe = ohe.fit_transform(X_cat_imputed)
else:
    # If there are no categorical columns available, just create empty array
    X_cat_ohe = np.empty((len(df_fraud_dataset), 0))

In [37]:
# 8) Combine numeric and categorical
# Column-wise concatenation: scaled numeric block first, one-hot block second.
feature_blocks = (X_num_scaled, X_cat_ohe)
X_all = np.concatenate(feature_blocks, axis=1)
print("Final feature matrix shape:", X_all.shape)

Final feature matrix shape: (50000, 22)


In [38]:
# 9) (Optional) small stratified sample for speed if your dataset is very big
# Here we keep it simple and train on everything; uncomment if needed.
# The snippet draws the same fraction of rows from each target class, so the
# class balance is approximately preserved. It uses positional indices, so it
# must run after X_all and y are built and before the train/test split.
# from sklearn.utils import resample  # (not used by the snippet below)
# # Example: take 6000 rows keeping class balance approximately
# sample_size = 6000
# if len(y) > sample_size:
#     strat_idx = (df_fraud_dataset[[target_col]]
#                  .assign(idx=np.arange(len(df_fraud_dataset)))
#                  .groupby(target_col, group_keys=False)
#                  .apply(lambda x: x.sample(frac=sample_size/len(df_fraud_dataset), random_state=42)))["idx"].values
#     X_all = X_all[strat_idx]
#     y = y.iloc[strat_idx].reset_index(drop=True)

In [39]:
# 10) Train/Test split (stratify keeps the same fraud ratio)
# 80/20 split with a fixed seed so the partition is reproducible.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X_all, y, **split_options)

print("Train:", X_train.shape, " Test:", X_test.shape)
print("Fraud rate train:", y_train.mean(), " Fraud rate test:", y_test.mean())

Train: (40000, 22)  Test: (10000, 22)
Fraud rate train: 0.32135  Fraud rate test: 0.3213


In [40]:
# 11) KNN model (start with k=25; try 5, 10, 15, 25, etc.)
n_neighbors = 25  # number of neighbours that vote on each prediction
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
# Last expression of the cell, so the fitted estimator is displayed.
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,25
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [41]:
# 12) Predictions and evaluation
y_pred = knn.predict(X_test)

# Compute every metric first, then print them in a fixed order.
acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", round(acc, 4))
print("\nClassification report:\n", report_text)
print("Confusion matrix:\n", confusion)

Accuracy: 0.9798

Classification report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      6787
           1       0.98      0.96      0.97      3213

    accuracy                           0.98     10000
   macro avg       0.98      0.97      0.98     10000
weighted avg       0.98      0.98      0.98     10000

Confusion matrix:
 [[6718   69]
 [ 133 3080]]
