In [None]:
"""
lib/validate_input.py

Project Goal:
Build a fraud-scoring service that flags suspicious transactions with high accuracy.
To train reliable classifiers, we need clean, complete data:
  - All features and the target must be present.
  - Date and numeric columns must use correct types.
  - No missing values or duplicate records should slip into training.

Why validate first?
In supervised learning, models learn patterns from labeled examples.
If columns are missing or mistyped, downstream steps (cleaning, feature creation,
model training) will fail or produce misleading results.
Early checks enforce the “contract” between our raw input and the pipeline:
  • Schema consistency for reproducible workflows 
  • Correct dtypes for time-based features and numeric scaling 
  • Awareness of nulls/duplicates so cleaning can handle them explicitly

This module offers three functions:
  - validate_schema: confirms required columns exist
  - validate_types: flags a Timestamp column that is not stored as a datetime dtype
  - check_nulls_and_duplicates: reports counts for nulls and duplicates
"""

import pandas as pd

# Columns that must be present in the raw input: the model features plus the
# Fraud_Label target. validate_schema() raises if any of them is absent.
REQUIRED_COLUMNS = [
    "Transaction_Amount",
    "Transaction_Type",
    "Timestamp",
    "Location",
    "Merchant_Category",
    "Risk_Score",
    "Fraud_Label",
]

def validate_schema(df: pd.DataFrame) -> bool:
    """
    Verify that every name in REQUIRED_COLUMNS is a column of ``df``.

    Args:
        df: Raw input frame to check.

    Returns:
        True when no required column is absent.

    Raises:
        ValueError: If one or more required columns are absent; the message
            lists them in REQUIRED_COLUMNS order.
    """
    available = df.columns
    absent = [name for name in REQUIRED_COLUMNS if name not in available]
    if not absent:
        return True
    raise ValueError(f"Missing required columns: {absent}")

def validate_types(df: pd.DataFrame) -> dict:
    """
    Check that the Timestamp column is stored as a datetime dtype.

    Correct dtypes are essential for time-based feature engineering downstream.

    Args:
        df: Frame to inspect. A frame without a Timestamp column is not
            reported here (column presence is validate_schema's job).

    Returns:
        A dict mapping column name to a description of the problem. It is
        empty when no issue was found. The description names the dtype that
        was actually found, e.g. ``'Expected datetime, got float64'``.
    """
    issues = {}
    if 'Timestamp' not in df.columns:
        return issues

    column = df['Timestamp']
    if not pd.api.types.is_datetime64_any_dtype(column):
        # Report the real dtype instead of always claiming "object", so the
        # message stays accurate for numeric or other non-datetime columns.
        issues['Timestamp'] = f'Expected datetime, got {column.dtype}'
    return issues

def check_nulls_and_duplicates(df: pd.DataFrame) -> tuple:
    """
    Report how many values are missing per column and how many rows repeat.

    The counts tell the cleaning step how much imputation and de-duplication
    it will have to do.

    Args:
        df: Frame to inspect.

    Returns:
        ``(null_counts, duplicate_count)`` where ``null_counts`` is a Series
        indexed by column name and ``duplicate_count`` is the number of rows
        that exactly repeat an earlier row.
    """
    missing_per_column = df.isna().sum()
    # duplicated() marks every repeat after the first occurrence.
    repeated_rows = df.duplicated()
    return missing_per_column, repeated_rows.sum()

In [None]:
"""
lib/clean_data.py

Project Goal:
After confirming our raw data has the right columns and types,
we need to fill every gap and ensure correct formats before models train.
In supervised learning, missing values and wrong types break feature
creation and model fitting. This module:

  1. Fills numeric gaps with each column’s average.
  2. Fills categorical gaps with the most common value.
  3. Converts Timestamp text into real datetimes and forward‐fills any missing times.
  4. Removes exact duplicate rows.

The output is a complete, type-safe DataFrame ready for feature engineering.
"""

import pandas as pd

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of ``df`` with nulls filled and Timestamp typed.

    Steps, in order:
      1. Numeric columns: nulls are replaced by the column mean.
      2. Object (text) columns other than Timestamp: nulls are replaced by
         the column's most frequent value.
      3. Timestamp (if present): converted with ``pd.to_datetime`` (values
         that cannot be parsed become NaT), then forward-filled.
      4. Exact duplicate rows are dropped.

    Args:
        df: Raw frame. It is not modified; all work happens on a copy.

    Returns:
        The cleaned DataFrame. A missing Timestamp in the very first row has
        no earlier value to copy from and therefore stays NaT.
    """
    # Work on a copy so the caller's DataFrame is never written to.
    cleaned = df.copy()

    # 1. Fill numeric columns with their mean.
    # NOTE(review): a numeric Fraud_Label column is mean-filled here too --
    # confirm that labels are never null in the raw data.
    numeric_cols = cleaned.select_dtypes(include='number').columns
    if not numeric_cols.empty:
        cleaned[numeric_cols] = cleaned[numeric_cols].fillna(
            cleaned[numeric_cols].mean()
        )

    # 2. Fill categorical columns with their most frequent value.
    # Timestamp is excluded: while it is still text it would otherwise get
    # its gaps filled with the modal string, bypassing the forward-fill below.
    categorical_cols = [
        col
        for col in cleaned.select_dtypes(include='object').columns
        if col != 'Timestamp'
    ]
    if categorical_cols:
        modes = cleaned[categorical_cols].mode()
        # mode() has no rows when every value is null; nothing to fill with.
        if not modes.empty:
            cleaned[categorical_cols] = cleaned[categorical_cols].fillna(
                modes.iloc[0]
            )

    # 3. Fix Timestamp: convert, then carry the last known time forward.
    if 'Timestamp' in cleaned.columns:
        cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'], errors='coerce')
        cleaned['Timestamp'] = cleaned['Timestamp'].ffill()

    # 4. Drop any exact duplicates.
    return cleaned.drop_duplicates()


In [None]:
"""
lib/feature_engineering.py

Project Goal:
Turn cleaned transaction data into a set of numeric inputs that our fraud models can learn from.
Good features help:
  • Distance-based and linear models by scaling numbers
  • Tree-based models by encoding categories
  • All models by capturing user behavior in derived metrics

This module follows three steps:
  1. Selection   – drop ID or free-text columns that carry no predictive signal  
  2. Transformation – encode categorical fields and standardize numeric ones  
  3. Creation    – build new features like total spend or time gaps between transactions  
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a model-ready feature frame from cleaned transaction data.

    Steps, in order:
      1. Drop Transaction_ID, User_ID and Free_Text_Description if present.
      2. One-hot encode every remaining object (text) column, dropping the
         first level of each.
      3. Standardize numeric columns with StandardScaler. Columns named
         'fraud_label' or 'is_fraud' (case-insensitive) are left unscaled.
      4. Add derived features:
           - TotalSpend: row-wise sum of every column whose name contains
             'amount', added only when there is more than one such column.
             Because it is computed after step 3, it sums the scaled values.
           - TimeSinceLast: seconds between each row and the previous row
             after sorting by Timestamp (0 for the first row). The gap is
             taken across the whole frame, not per user.

    Args:
        df: Cleaned input. It is not modified. If a Timestamp column is
            present it must already have a datetime dtype.

    Returns:
        A new DataFrame. When Timestamp was present the rows are ordered by
        it (original index labels are kept) and the Timestamp column itself
        is removed.
    """
    # Copy first so no later step writes into the caller's DataFrame.
    features = df.copy()

    # 1. Drop ID or free-text columns.
    features = features.drop(
        columns=['Transaction_ID', 'User_ID', 'Free_Text_Description'],
        errors='ignore',
    )

    # 2. One-hot encode each remaining text column.
    categorical_cols = features.select_dtypes(include='object').columns.tolist()
    if categorical_cols:
        features = pd.get_dummies(features, columns=categorical_cols, drop_first=True)

    # 3. Scale numeric features for distance-based and linear models,
    #    leaving the label column untouched.
    label_names = {'fraud_label', 'is_fraud'}
    numeric_cols = [
        c
        for c in features.select_dtypes(include='number').columns
        if str(c).lower() not in label_names
    ]
    # A frame with no rows gives the scaler nothing to fit, so skip it.
    if numeric_cols and len(features) > 0:
        scaler = StandardScaler()
        features[numeric_cols] = scaler.fit_transform(features[numeric_cols])

    # 4. Create derived features.
    # TotalSpend: sum of all amount-related columns.
    amount_cols = [c for c in features.columns if 'amount' in str(c).lower()]
    if len(amount_cols) > 1:
        features['TotalSpend'] = features[amount_cols].sum(axis=1)

    # TimeSinceLast: gap in seconds between consecutive transactions.
    if 'Timestamp' in features.columns:
        # Stable sort keeps the input order of rows sharing a timestamp, so
        # the output is deterministic.
        features = features.sort_values(by='Timestamp', kind='mergesort')
        features['TimeSinceLast'] = (
            features['Timestamp'].diff().dt.total_seconds().fillna(0)
        )
        features = features.drop('Timestamp', axis=1)

    return features

In [4]:
"""
test_feature_engineering.py

This test covers the data preparation step that:
  • Encodes text categories as one-hot flags  
  • Scales all numeric inputs to mean 0 and standard deviation 1  
  • Builds two new features:
      – TotalSpend: sum of all “amount” columns  
      – TimeSinceLast: seconds since the previous transaction  
  • Drops the original Timestamp once the interval is computed

We feed sample transactions into engineer_features() and inspect the output.
"""

import pandas as pd
import sys

# allow import from src/feature_engineering.py
sys.path.append('../')  
from src.feature_engineering import engineer_features

# Sample input: two text columns to be one-hot encoded, five numeric columns
# (four of them with 'amount' in the name, so TotalSpend applies), and a
# datetime Timestamp column for the TimeSinceLast feature.
data = {
    'merchant_type':     ['grocery', 'electronics', 'grocery'],
    'location':          ['NY', 'CA', 'TX'],
    'amount':            [120.5, 560.0, 75.25],
    'time_gap':          [30, 120, 15],
    'amount1':           [50, 200, 25],
    'amount2':           [70.5, 360, 50.25],
    # constant column: every value is zero
    'amount3':           [0, 0, 0],
    # already in ascending order, 2 and 3 minutes apart
    'Timestamp': pd.to_datetime([
        '2025-10-14 10:00:00',
        '2025-10-14 10:02:00',
        '2025-10-14 10:05:00'
    ])
}

df_test = pd.DataFrame(data)

# Run the feature engineering pipeline
df_transformed = engineer_features(df_test)

# Inspect resulting columns and first rows
print(df_transformed.head())

# Expected outcome:
# 1) merchant_type -> one-hot: merchant_type_grocery
#    (the first level, 'electronics', is dropped by drop_first=True)
# 2) location -> one-hot: location_NY, location_TX (first level 'CA' dropped)
# 3) Numeric columns (amount, time_gap, amount1, amount2, amount3) are scaled
# 4) New TotalSpend column = sum of scaled amount* columns
#    NOTE(review): the captured output below shows no TotalSpend column --
#    re-run this cell to confirm it matches the current engineer_features.
# 5) New TimeSinceLast column = [0.0, 120.0, 180.0] (seconds)
# 6) Original Timestamp is dropped

     amount  time_gap   amount1   amount2  amount3  merchant_type_grocery  \
0 -0.601093 -0.539164 -0.539164 -0.634337      0.0                   True   
1  1.409156  1.401826  1.401826  1.411797      0.0                  False   
2 -0.808064 -0.862662 -0.862662 -0.777460      0.0                   True   

   location_NY  location_TX  TimeSinceLast  
0         True        False            0.0  
1        False        False          120.0  
2        False         True          180.0  
