In [1]:
import os
os.chdir('../')
%pwd

'c:\\Users\\karthikeya\\Fraud_Detection'

In [5]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('artifacts/train_data.parquet')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6271196,619,,386385.08,,4669568.85,5055953.92,C1977099364,,119649.98,0,0
1888243,164,CASH_IN,212458.78,C83569848,234635.0,447093.78,C1690589535,806037.88,593579.1,0,0
5549957,382,PAYMENT,19967.6,C852995095,3634.0,0.0,M1695416333,0.0,0.0,0,0
2025342,180,CASH_OUT,527616.51,C61761046,180216.0,0.0,C577654587,92157.1,619773.61,0,0
682342,36,TRANSFER,206067.85,C758004147,0.0,0.0,C2143015292,2131494.48,2337562.32,0,0


In [3]:
numerical_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest','isFraud', 'isFlaggedFraud']
discrete_columns = ['step','type']

In [6]:
for column in numerical_columns:
    df[column] = df[column].replace('', np.nan).astype(float)

In [7]:
df.isnull().sum()

step                   0
type                   0
amount                 0
nameOrig               0
oldbalanceOrg          0
newbalanceOrig    213457
nameDest               0
oldbalanceDest    163045
newbalanceDest         0
isFraud                0
isFlaggedFraud         0
dtype: int64

In [8]:
df[df.isna().any(axis=1)]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6271196,619,,386385.08,,4669568.85,5055953.92,C1977099364,,119649.98,0.0,0.0
3599993,263,PAYMENT,5292.90,C192639813,257.00,,M1055957644,0.00,0.00,0.0,0.0
1429393,139,,5650.96,,0.00,0.00,M1756208398,,0.00,0.0,0.0
2696603,211,TRANSFER,1166173.36,C1461753973,0.00,,C1633373750,7563328.41,8729501.77,0.0,0.0
5554203,383,CASH_OUT,351.22,C1237102715,0.00,,C1370603699,1001295.95,1001647.17,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2795513,216,,326122.58,,0.00,0.00,C1350354405,,1199862.46,0.0,0.0
5147581,357,PAYMENT,11221.57,C335512392,56127.00,,M1714473611,0.00,0.00,0.0,0.0
3275709,251,CASH_OUT,266104.05,C968963698,11142.00,,C1760225299,103740.54,369844.60,0.0,0.0
3344769,253,,153029.85,,0.00,0.00,C1385873548,,2085289.46,0.0,0.0


In [9]:
fraud_trans_count = df['isFraud'].value_counts().iloc[1]
genuine_trans_count = df['isFraud'].value_counts().iloc[0]
ratio =fraud_trans_count/genuine_trans_count
print(f'Ratio of fraud to genuine transactions is :{ratio:.6f}')

Ratio of fraud to genuine transactions is :0.001297


- Ratio of fraud to genuine transactions is : 0.001297

In [10]:
df.groupby(['type'])['isFraud'].value_counts()

type      isFraud
          0.0         162815
          1.0            230
CASH_IN   0.0        1083737
CASH_OUT  0.0        1729849
          1.0           3208
DEBIT     0.0          32139
PAYMENT   0.0        1665536
TRANSFER  0.0         409427
          1.0           3155
Name: count, dtype: int64

In [11]:
df['type'] = df['type'].map(lambda x: x if x!='' else 'UNKNOWN')

In [12]:
df[['type','isFraud']].groupby(['type']).value_counts()

type      isFraud
CASH_IN   0.0        1083737
CASH_OUT  0.0        1729849
          1.0           3208
DEBIT     0.0          32139
PAYMENT   0.0        1665536
TRANSFER  0.0         409427
          1.0           3155
UNKNOWN   0.0         162815
          1.0            230
Name: count, dtype: int64

###  
- Out of all types, CASHOUT, TRANSFER type transactions have highest fraud transcations.
- CASH_IN, DEBIT and PAYMENT transactions do not have any fraudlent transactions.

In [13]:
df['errorbalanceOrig'] = df.newbalanceOrig + df.amount - df.oldbalanceOrg
df['errorbalanceDest'] = df.oldbalanceDest + df.amount - df.newbalanceDest

In [14]:
fraud = df[df['isFraud']==1]

nonfraud = df[df['isFraud']==0]

In [15]:
from scipy.stats import mannwhitneyu

stat, p = mannwhitneyu(fraud['amount'], nonfraud['amount'])
print(f"Mann-Whitney U Test - P-value: {p}")


Mann-Whitney U Test - P-value: 0.0


In [16]:
from abc import ABC, abstractmethod
from typing import List

from src.logger import logger


In [3]:
tr_df = pd.read_parquet('artifacts/train_data.parquet')

In [4]:
# Step:1 -> Define Abstract Base Class for Missing Value Handling Strategy

class MissingValueHandlingStrategy(ABC):
    """
    Interface for Missinng Value Handling
    """
    @abstractmethod
    def handle(self, df:pd.DataFrame):
        """
        Abstract method to handle missing values in the DataFrame.

        Parameters:
        df (pd.DataFrame): The input DataFrame containing missing values.

        Returns:
        pd.DataFrame: The DataFrame with missing values handled.
        """
        pass


# Step:2 -> Define Concrete Strategy For  Handling Missing Values

class DropMissingValuesStrategy(MissingValueHandlingStrategy):
    def __init__(self, axis=0):
        """
        Initializes the DropMissingValuesStrategy with specific parameters.

        Parameters:
        axis (int): 0 to drop rows with missing values, 1 to drop columns with missing values.
        """
        self.axis = axis

    def handle(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drops rows or columns with missing values based on the axis.

        Parameters:
        df (pd.DataFrame): The input DataFrame containing missing values.

        Returns:
        pd.DataFrame: The DataFrame with missing values dropped.
        """
        try:
            logger.info(f"Dropping missing values with axis={self.axis}")
            df_cleaned = df.dropna(axis=self.axis)
            logger.info("Missing values dropped.")
            return df_cleaned
        except Exception as e:
            logger.error(f"Error in dropping missing values: {str(e)}")
            raise


    
# Step:3 -> Define Context Class for Handling Missing Values

class MissingValueHandler:
    def __init__(self, strategy: MissingValueHandlingStrategy ):
        self.strategy = strategy
        """
        Initializes the MissingValueHandler with a specific missing value handling strategy.

        Parameters: Strategy (MissingValueHandlingStrategy): The strategy to be used for handling missing values.
        """
        pass

    def set_strategy(self, strategy:MissingValueHandlingStrategy):
        """
        Sets a strategy for handling missing values
        Parameters:
            Strategy(MissingValueHandlingStrategy) : The strategy to be used for handling missing values.
        """
        self.strategy = strategy

    def handle_missing_values(self, df:pd.DataFrame)->pd.DataFrame:
        """
        Executes the missing value handling using the current strategy.

        Parameters:
        df (pd.DataFrame): The input DataFrame containing missing values.

        Returns:
        pd.DataFrame: The DataFrame with missing values handled.
        """
        logger.info("Executing missing value handling strategy.")
        return self.strategy.handle(df)



In [9]:
# Step: 1 -> Define abstract base class for feature engineering
class FeatureEngineeringStrategy(ABC):

    @abstractmethod
    def feature_engineer(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Abstract method for feature engineering
        """
        pass

# Step: 2 -> Define concrete strategy for feature engineering

# Concrete strategy to drop columns
class DropColumnsStrategy(FeatureEngineeringStrategy):

    def __init__(self, columns: List):
        """
        Initiates DropColumnsStrategy
        Parameters:
            columns (List) : List of columns (features) to be dropped.
        """
        self.columns = columns

    def feature_engineer(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drops the columns from DataFrame given the list of columns
        Parameters:
            df (pd.DataFrame): DataFrame from which columns need to be dropped
        Returns:
            df (pd.DataFrame) Returns DataFrame after dropping columns
        """
        try:
            logger.info(f"Dropping columns: {self.columns}")
            df = df.drop(columns=self.columns, axis=1)
            logger.info("Columns dropped successfully.")
            return df
        except Exception as e:
            logger.error(f"Error in dropping columns: {str(e)}")
            raise

# Concrete strategy to create new columns
class CreateColumnsStrategy(FeatureEngineeringStrategy):

    def __init__(self):
        """
        Initiates CreateColumnsStrategy
        """
        pass

    def feature_engineer(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to create two new columns "errorbalanceOrig", "errorbalanceDest" from the given DataFrame
        Params:
            df (pd.DataFrame): DataFrame to create new columns
        Returns:
            df (pd.DataFrame): DataFrame with two added columns
        """
        try:
            logger.info("Adding two new features: 'errorbalanceOrig' and 'errorbalanceDest'")
            df['errorbalanceOrig'] = df.newbalanceOrig + df.amount - df.oldbalanceOrg
            df['errorbalanceDest'] = df.oldbalanceDest + df.amount - df.newbalanceDest
            logger.info("New features added successfully.")
            return df
        except Exception as e:
            logger.error(f"Error in creating new features: {str(e)}")
            raise

# Concrete strategy for filling NaN in columns of object type
class FillObjectColumsWithNaN(FeatureEngineeringStrategy):

    def __init__(self, columns: List):
        """
        Initiates the FillObjectColumnsWithNaN strategy
        Params:
            columns (List): Columns of object type with empty string to be replaced with NaN.
        """
        self.columns = columns

    def feature_engineer(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Method to fill NaN in the empty rows of object type columns and convert them to float dtype
        Params:
            df (pd.DataFrame): DataFrame in which empty rows of object type columns are present.
        Returns:
            Returns DataFrame with empty rows replaced with NaN and dtype converted to float
        """
        try:
            logger.info(f"Filling NaN for columns: {self.columns}")
            for column in self.columns:
                df[column] = df[column].replace('', np.nan).astype(float)
            logger.info("NaN values filled and dtype conversion completed.")
            return df
        except Exception as e:
            logger.error(f"Error in filling NaN and converting to float dtype: {str(e)}")
            raise

# Step: 3 -> Define context class for usage of the strategy
class EngineerFeatures:
    def __init__(self, strategy: FeatureEngineeringStrategy = None):
        """
        Initiates the context class to define a specific feature engineering strategy.
        Optionally, a strategy can be set at initialization.
        
        Params:
            strategy (FeatureEngineeringStrategy): Initial strategy for feature engineering (default: None)
        """
        self.strategy = strategy
        if strategy:
            logger.info(f"Feature engineering strategy set during initialization: {strategy.__class__.__name__}")

    def set_strategy(self, strategy: FeatureEngineeringStrategy):
        """
        Sets the type of strategy required for feature engineering.
        
        Params:
            strategy (FeatureEngineeringStrategy): Strategy object type
        """
        logger.info(f"Setting feature engineering strategy: {strategy.__class__.__name__}")
        self.strategy = strategy

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Executes the feature engineering using the current strategy.
        
        Params:
            df (pd.DataFrame): DataFrame on which feature engineering will be applied
            
        Returns:
            df_feature_engineered (pd.DataFrame): DataFrame after feature engineering
        """
        if not self.strategy:
            logger.error("Feature engineering strategy is not set.")
            raise ValueError("Strategy not set. Use `set_strategy` or provide a strategy during initialization.")
        try:
            logger.info("Applying feature engineering strategy...")
            df = self.strategy.feature_engineer(df)
            logger.info("Feature engineering completed successfully.")
            return df
        except Exception as e:
            logger.error(f"Error in feature engineering process: {str(e)}")
            raise


In [23]:
column_dropper = EngineerFeatures(strategy=DropColumnsStrategy(columns=['step', 'type', 'isFlaggedFraud', "nameDest","nameOrig"]))
df_dropped= column_dropper.engineer_features(tr_df)

engineer_features = EngineerFeatures(strategy=FillObjectColumsWithNaN(columns=df_dropped.columns))
df_transformed = engineer_features.engineer_features(df_dropped)

missing_value_handler = MissingValueHandler(DropMissingValuesStrategy(axis=0))
df_cleaned = missing_value_handler.handle_missing_values(df_transformed)

add_features = EngineerFeatures(strategy=CreateColumnsStrategy())
df_new = add_features.engineer_features(df_cleaned)
df_new

[2025-01-13 13:09:22,978, INFO, 2364742296, Feature engineering strategy set during initialization: DropColumnsStrategy]
[2025-01-13 13:09:22,980, INFO, 2364742296, Applying feature engineering strategy...]
[2025-01-13 13:09:22,981, INFO, 2364742296, Dropping columns: ['step', 'type', 'isFlaggedFraud', 'nameDest', 'nameOrig']]
[2025-01-13 13:09:23,307, INFO, 2364742296, Columns dropped successfully.]
[2025-01-13 13:09:23,308, INFO, 2364742296, Feature engineering completed successfully.]
[2025-01-13 13:09:23,309, INFO, 2364742296, Feature engineering strategy set during initialization: FillObjectColumsWithNaN]
[2025-01-13 13:09:23,310, INFO, 2364742296, Applying feature engineering strategy...]
[2025-01-13 13:09:23,310, INFO, 2364742296, Filling NaN for columns: Index(['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
       'newbalanceDest', 'isFraud'],
      dtype='object')]
[2025-01-13 13:09:30,911, INFO, 2364742296, NaN values filled and dtype conversion completed.]
[2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['errorbalanceOrig'] = df.newbalanceOrig + df.amount - df.oldbalanceOrg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['errorbalanceDest'] = df.oldbalanceDest + df.amount - df.newbalanceDest


Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,errorbalanceOrig,errorbalanceDest
1888243,212458.78,234635.00,447093.78,806037.88,593579.10,0.0,424917.56,4.249176e+05
5549957,19967.60,3634.00,0.00,0.00,0.00,0.0,16333.60,1.996760e+04
2025342,527616.51,180216.00,0.00,92157.10,619773.61,0.0,347400.51,0.000000e+00
682342,206067.85,0.00,0.00,2131494.48,2337562.32,0.0,206067.85,1.000000e-02
4453388,141.42,174.00,32.58,0.00,0.00,0.0,0.00,1.414200e+02
...,...,...,...,...,...,...,...,...
1570006,129715.85,5054252.83,5183968.68,246692.94,116977.09,0.0,259431.70,2.594317e+05
2234489,2459.70,0.00,0.00,0.00,0.00,0.0,2459.70,2.459700e+03
4926484,10579.16,59279.00,48699.84,322754.16,333333.32,0.0,0.00,-5.820766e-11
4304572,73020.76,20289.00,0.00,256102.84,329123.61,0.0,52731.76,-1.000000e-02
