**02_knn_with_scaling_and_onehot.ipynb**
- Goal: KNN with simple feature engineering (scaling numeric features + one-hot encoding categorical features)
- Load CSV
- Build numeric and categorical feature lists
- Simple imputation (median/mode)
- StandardScaler for numeric
- OneHotEncoder for categoricals (no Pipeline, we do it step-by-step to keep it simple)
- Train/test split
- Fit KNN and evaluate

In [1]:
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from pathlib import Path
import sys
# Point to the project root so that `lib` can be imported.
# The notebook is assumed to live one directory below the root; if it is
# nested deeper, use `.parents[1]`, `.parents[2]`, ... instead of `.parent`.
PROJECT_ROOT = Path.cwd().resolve().parent
# Only add the root once: re-running this cell must not keep stacking
# duplicate entries at the front of sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# <-- Imports the custom feature-engineering helper from the project's 'lib.feature_engineering' module <-- #

from lib.feature_engineering import (engineer_features)

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

config = None  # <-- Stays None when the configuration file is missing
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported here. Malformed YAML, permission errors
    # and interrupts are allowed to propagate so they are not mislabeled as
    # "not found" and silently swallowed.
    print("Yaml configuration file not found!")

In [2]:
# 1) Read the transactions CSV; its location comes from the YAML config
#    (key: input_data -> file1), so no path is hard-coded here.
csv_path = config['input_data']['file1']
df_fraud_dataset = pd.read_csv(csv_path)

# Quick sanity check of what was loaded: dimensions, column names, first rows
print("Shape:", df_fraud_dataset.shape)
print("Columns:", df_fraud_dataset.columns.tolist())
print(df_fraud_dataset.head(3))

Shape: (50000, 21)
Columns: ['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Type', 'Timestamp', 'Account_Balance', 'Device_Type', 'Location', 'Merchant_Category', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Type', 'Card_Age', 'Transaction_Distance', 'Authentication_Method', 'Risk_Score', 'Is_Weekend', 'Fraud_Label']
  Transaction_ID    User_ID  Transaction_Amount Transaction_Type  \
0      TXN_33553  USER_1834               39.79              POS   
1       TXN_9427  USER_7875                1.19    Bank Transfer   
2        TXN_199  USER_2734               28.96           Online   

             Timestamp  Account_Balance Device_Type  Location  \
0  2023-08-14 19:30:00         93213.17      Laptop    Sydney   
1  2023-06-07 04:01:00         75725.25      Mobile  New York   
2  2023-06-20 15:25:00          1588.96      Tablet    Mumbai   

  Merchant_Category  IP_Address

In [3]:
# 2) Target
# Name of the label column. Fail fast if the loaded CSV does not contain it.
target_col = "Fraud_Label"
# An explicit check instead of `assert`: assert statements are removed when
# Python runs with -O, which would silently skip this validation. The exception
# type and message are the same as before.
if target_col not in df_fraud_dataset.columns:
    raise AssertionError("Target column Fraud_Label not found.")

In [4]:
# 3) Optional time features: when a Timestamp column is present, parse it once
#    and derive the hour of day and the day of week from the parsed values.
#    Unparseable timestamps are coerced to NaT rather than raising.
if "Timestamp" in df_fraud_dataset.columns:
    parsed_ts = pd.to_datetime(df_fraud_dataset["Timestamp"], errors="coerce")
    df_fraud_dataset["Timestamp"] = parsed_ts
    df_fraud_dataset["tx_hour"] = parsed_ts.dt.hour
    df_fraud_dataset["tx_dow"] = parsed_ts.dt.dayofweek

In [5]:
# 4) Feature selection: list the columns we would like to use, then keep only
#    those that actually exist in the loaded frame (original order preserved).
candidate_numeric = [
    "Transaction_Amount",
    "Account_Balance",
    "IP_Address_Flag",
    "Previous_Fraudulent_Activity",
    "Daily_Transaction_Count",
    "Avg_Transaction_Amount_7d",
    "Failed_Transaction_Count_7d",
    "Card_Age",
    "Transaction_Distance",
    "Risk_Score",
    "Is_Weekend",
    # Derived from Timestamp; present only if that column existed
    "tx_hour",
    "tx_dow",
]
candidate_categorical = [
    "Transaction_Type",
    "Device_Type",
    "Card_Type",
    "Authentication_Method",
]


def _keep_existing(candidates):
    """Return the candidate names that are columns of df_fraud_dataset, in the given order."""
    return [name for name in candidates if name in df_fraud_dataset.columns]


num_cols = _keep_existing(candidate_numeric)
cat_cols = _keep_existing(candidate_categorical)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

Numeric columns: ['Transaction_Amount', 'Account_Balance', 'IP_Address_Flag', 'Previous_Fraudulent_Activity', 'Daily_Transaction_Count', 'Avg_Transaction_Amount_7d', 'Failed_Transaction_Count_7d', 'Card_Age', 'Transaction_Distance', 'Risk_Score', 'Is_Weekend', 'tx_hour', 'tx_dow']
Categorical columns: ['Transaction_Type', 'Device_Type', 'Card_Type', 'Authentication_Method']


In [6]:
# 5) Keep only rows that have a label, then extract the label as integers.
#    The filtered frame is copied so later column assignments do not touch a view.
df_fraud_dataset = df_fraud_dataset[df_fraud_dataset[target_col].notna()].copy()
y = df_fraud_dataset[target_col].astype(int)

In [7]:
# 6) Numeric block: select -> impute -> standardise
X_num = df_fraud_dataset.loc[:, num_cols].copy()

# Missing numeric values are replaced by the column median.
num_imputer = SimpleImputer(strategy="median").fit(X_num)
X_num_imputed = num_imputer.transform(X_num)

# KNN is distance-based, so every numeric feature is standardised to keep
# large-valued columns from dominating the distance.
# NOTE(review): the imputer and the scaler are fitted on all rows here, while
# the train/test split only happens later in the notebook, so statistics of the
# test rows influence the training features. Consider splitting first and
# fitting these transformers on the training rows only.
scaler = StandardScaler().fit(X_num_imputed)
X_num_scaled = scaler.transform(X_num_imputed)

In [8]:
# 7) Build categorical matrix
# Impute missing categories with the most frequent value, then use
# OneHotEncoder to convert categories to 0/1 columns.
if len(cat_cols) > 0:
    # Keep genuine missing values as NaN for the imputer. Casting to str
    # *before* imputing would turn NaN into the literal text "nan"; the imputer
    # would then see no missing values and "nan" would end up as its own
    # one-hot category.
    X_cat = df_fraud_dataset[cat_cols].astype(object)
    X_cat = X_cat.where(X_cat.notna(), np.nan)

    cat_imputer = SimpleImputer(strategy="most_frequent")
    # Cast to str only after imputation so the encoder sees one uniform type.
    X_cat_imputed = cat_imputer.fit_transform(X_cat).astype(str)

    # handle_unknown="ignore": categories not seen during fit encode as all zeros.
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_cat_ohe = ohe.fit_transform(X_cat_imputed)
else:
    # If there are no categorical columns available, use an empty (n_rows, 0)
    # array so the later horizontal stacking still works.
    X_cat_ohe = np.empty((len(df_fraud_dataset), 0))

In [9]:
# 8) Final feature matrix: scaled numeric columns followed by the one-hot
#    columns, joined column-wise (both blocks have one row per transaction).
X_all = np.concatenate([X_num_scaled, X_cat_ohe], axis=1)
print("Final feature matrix shape:", X_all.shape)

Final feature matrix shape: (50000, 28)


In [10]:
# 9) (Optional) small stratified sample for speed if your dataset is very big
# Here we keep it simple and train on everything; uncomment if needed:
# from sklearn.utils import resample
# # Example: take about 6000 rows while keeping the class balance approximately
# sample_size = 6000
# if len(y) > sample_size:
#     strat_idx = (df_fraud_dataset[[target_col]]
#                  .assign(idx=np.arange(len(df_fraud_dataset)))
#                  .groupby(target_col, group_keys=False)
#                  .apply(lambda x: x.sample(frac=sample_size/len(df_fraud_dataset), random_state=42)))["idx"].values
#     X_all = X_all[strat_idx]
#     y = y.iloc[strat_idx].reset_index(drop=True)

In [11]:
# 10) Hold out 20% of the rows for testing. Stratifying on y keeps the fraud
#     ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Report the sizes and confirm the class balance survived the split
print("Train:", X_train.shape, " Test:", X_test.shape)
print("Fraud rate train:", y_train.mean(), " Fraud rate test:", y_test.mean())

Train: (40000, 28)  Test: (10000, 28)
Fraud rate train: 0.32135  Fraud rate test: 0.3213


In [12]:
# 11) Fit a k-nearest-neighbours classifier on the training rows.
#     k=25 is just a starting point; other values (5, 10, 15, ...) are worth trying.
knn = KNeighborsClassifier(n_neighbors=25).fit(X_train, y_train)
knn

0,1,2
,n_neighbors,25
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [13]:
# 12) Predict on the held-out rows, then summarise with accuracy, the
#     per-class precision/recall report and the confusion matrix.
y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", round(acc, 4))
print("\nClassification report:\n", report_text)
print("Confusion matrix:\n", conf_mat)

Accuracy: 0.8548

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.98      0.90      6787
           1       0.93      0.59      0.72      3213

    accuracy                           0.85     10000
   macro avg       0.88      0.79      0.81     10000
weighted avg       0.87      0.85      0.84     10000

Confusion matrix:
 [[6642  145]
 [1307 1906]]
