In [None]:
# This script reads the data, checks it, cleans it, builds features,
# trains four models, and prints how well each one works.

# 0. Project path setup 
import sys
from pathlib import Path

# Tell Python where to find our lib/ modules
PROJECT_ROOT = Path.cwd().resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))

# 1. Standard imports
import yaml                  # to read config files
import pandas as pd          # for tables of data

# 2. Pipeline modules
from lib.validate_input import validate_schema, validate_types, check_nulls_and_duplicates
   # functions to check columns, data types, missing values, duplicates
from lib.clean_data import clean_data
   # function to fill missing values and convert timestamps
from lib.feature_engineering import engineer_features
   # function to create new numeric features and encode categoricals

# 3. ML imports
from sklearn.model_selection import train_test_split
   # split data into train and test sets
from sklearn.linear_model import LogisticRegression
   # simple linear classifier
from sklearn.tree import DecisionTreeClassifier
   # tree-based classifier
from sklearn.ensemble import RandomForestClassifier
   # ensemble of trees
from sklearn.neighbors import KNeighborsClassifier
   # nearest-neighbor classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
   # metrics to measure model performance

# 4. Load config + raw data
# Resolve the config through PROJECT_ROOT (set in step 0 to the parent of the
# working directory) so the project location is defined in one place, and
# read it with an explicit encoding so the platform default does not matter.
with open(PROJECT_ROOT / "config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
# The path of the raw transaction CSV comes from the config.
df_raw = pd.read_csv(config['input_data']['file1'])

# 5. Validate schema
# Checks the raw frame for the required columns before any processing.
validate_schema(df_raw)

# 6. Clean data
# clean_data is expected to fill nulls and convert 'Timestamp' to datetime;
# see lib/clean_data.py for the exact rules.
df_cleaned = clean_data(df_raw)

# 7. Validate types after cleaning
# Any reported issues are printed; the run continues either way.
type_issues = validate_types(df_cleaned)
if type_issues:
    print("Type issues after cleaning:", type_issues)

# 8. Check nulls & duplicates
# This step only reports counts; it does not modify df_cleaned.
nulls, dupes = check_nulls_and_duplicates(df_cleaned)
print("Null counts:\n", nulls)
print("Duplicate rows:", dupes)

# 9. Feature engineering
# engineer_features returns the model-ready frame (new features, encoded
# categoricals); see lib/feature_engineering.py for details.
df_model_ready = engineer_features(df_cleaned)

# 10. Define X, y
# Accept either label name, but fail with a clear message when neither is
# present instead of an opaque "not found in axis" error from drop().
target = next(
    (col for col in ('Fraud_Label', 'is_fraud') if col in df_model_ready.columns),
    None,
)
if target is None:
    raise KeyError(
        "No target column found: expected 'Fraud_Label' or 'is_fraud' "
        "in the engineered data"
    )
y = df_model_ready[target]
# Drop the label and any leftover raw 'Timestamp' column in one step;
# errors='ignore' covers the case where 'Timestamp' was already removed.
X = df_model_ready.drop(columns=[target, 'Timestamp'], errors='ignore')

# 11. Split train/test
# Hold out 20% for testing. stratify=y keeps the fraud/non-fraud ratio the
# same in both splits, so precision/recall are measured on a representative
# share of positive cases.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# 12. Define models
# max_iter is raised for LogisticRegression; the tree-based models are
# seeded so repeated runs produce the same results.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'DecisionTree':       DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RandomForest':       RandomForestClassifier(random_state=RANDOM_STATE),
    'KNN':                KNeighborsClassifier(),
}

# 13. Train & evaluate
# Every model is fitted on the training split and scored on the held-out
# test split with the same set of metrics.
# NOTE(review): a perfect 1.0 on the test split is more likely target
# leakage (a feature that encodes the label) than genuine skill - inspect
# the features before trusting such a model.
metric_fns = {
    'Accuracy':  accuracy_score,
    'Precision': precision_score,
    'Recall':    recall_score,
    'F1':        f1_score,
}
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    scores = {label: fn(y_test, preds) for label, fn in metric_fns.items()}
    results.append({'Model': name, **scores})

# 14. Display results
# One row per classifier, one column per metric.
print(pd.DataFrame(results))

Null counts:
 Transaction_ID                  0
User_ID                         0
Transaction_Amount              0
Transaction_Type                0
Timestamp                       0
Account_Balance                 0
Device_Type                     0
Location                        0
Merchant_Category               0
IP_Address_Flag                 0
Previous_Fraudulent_Activity    0
Daily_Transaction_Count         0
Avg_Transaction_Amount_7d       0
Failed_Transaction_Count_7d     0
Card_Type                       0
Card_Age                        0
Transaction_Distance            0
Authentication_Method           0
Risk_Score                      0
Is_Weekend                      0
Fraud_Label                     0
dtype: int64
Duplicate rows: 0
                Model  Accuracy  Precision    Recall        F1
0  LogisticRegression    0.8104   0.733543  0.648607  0.688465
1        DecisionTree    1.0000   1.000000  1.000000  1.000000
2        RandomForest    1.0000   1.000000  1.000000  1.000000

# Results Summary

## Nulls and Duplicates

All columns report zero missing values and there are no duplicate rows.

- The cleaning step filled all nulls, and the duplicate check found no exact duplicate rows in the cleaned data.

---

## Model Performance

| Model              | Accuracy | Precision | Recall  | F1      |
|--------------------|---------:|----------:|--------:|--------:|
| LogisticRegression |   0.8104 |   0.7335  | 0.6486  | 0.6885 |
| DecisionTree       |   1.0000 |   1.0000  | 1.0000  | 1.0000 |
| RandomForest       |   1.0000 |   1.0000  | 1.0000  | 1.0000 |
| KNN                |   0.7919 |   0.7661  | 0.5121  | 0.6138 |

---

## Interpretation

- LogisticRegression  
  - Correctly labels about 81% of test transactions.  
  - Precision 0.73 means 73% of the transactions it flags as fraud are actually fraudulent.  
  - Recall 0.65 means it catches 65% of the actual fraud cases.

- DecisionTree and RandomForest  
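  - Perfect scores (100%) on the held-out test set are a red flag rather than proof of a great model.  
  - Overfitting to the training data would show up as lower test scores; perfect test scores more likely point to target leakage (a feature that directly encodes the fraud label).  
  - Until leakage is ruled out, these models may fail to generalize to new, unseen data.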

- KNN  
  - Accuracy near 79% and high precision (0.77).  
  - Recall 0.51 means it misses almost half of real frauds.