### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

### Load the Dataset

In [2]:
df = pd.read_csv('Fraud.csv')  

### Performing EDA (Exploratory Data Analysis)

In [3]:
print(df.info())
print(df.describe())
print(df['isFraud'].value_counts()) 
df = df.drop(columns=['nameOrig', 'nameDest','type'])
# Handling  missing values
df.fillna(df.median(), inplace=True)  

# Compute IQR only on non-fraud transactions
Q1 = df[df['isFraud'] == 0].quantile(0.25)
Q3 = df[df['isFraud'] == 0].quantile(0.75)
IQR = Q3 - Q1

# Apply IQR filtering only to non-fraud cases
non_fraud_filtered = df[df['isFraud'] == 0][~((df[df['isFraud'] == 0] < (Q1 - 1.5 * IQR)) | (df[df['isFraud'] == 0] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Keep all fraud cases
fraud_cases = df[df['isFraud'] == 1]

# Combine them back together
df = pd.concat([non_fraud_filtered, fraud_cases])

# Reset index after filtering
df = df.reset_index(drop=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  6.362620e+06  6.362620e+06   6.362620e+06    6.362620e+06   
mean   2.433972e+02  1.798619e+05   8.338831e+05    8.551137e+05   
std    1.423320e+02  6.038582e+05   2.888243e+06    2.924049e+06   
min    1.000000e+00  0.000000e+00   0.000000e+00    0.000000e+00   
25%    1.560000e+02  1.338957e+04   0.000000e+00    0.000000e+00   
50%    2.390000e+02  7.487194e+04

In [4]:
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,1,11668.14,41554.0,29885.86,0.0,0.0,0,0
3,1,7817.71,53860.0,46042.29,0.0,0.0,0,0
4,1,7107.77,183195.0,176087.23,0.0,0.0,0,0


### Performing Feature Engineering

In [5]:
df['transaction_amount'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['transaction_amount'] = df['transaction_amount'].clip(lower=0)  
df['log_transaction_amount'] = np.log1p(df['transaction_amount'])  
df['balance_change_ratio'] = df['transaction_amount'] / (df['oldbalanceOrg'] + 1)
df['is_destination_new'] = ((df['oldbalanceDest'] == 0) & (df['newbalanceDest'] == 0)).astype(int)
df['dest_balance_change'] = df['newbalanceDest'] - df['oldbalanceDest']
df['is_origin_empty'] = (df['oldbalanceOrg'] == 0).astype(int)
df['origin_balance_change'] = df['newbalanceOrig'] - df['oldbalanceOrg']

In [6]:
df.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,transaction_amount,log_transaction_amount,balance_change_ratio,is_destination_new,dest_balance_change,is_origin_empty,origin_balance_change
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,9839.64,9.194276,0.057834,1,0.0,0,-9839.64
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,1864.28,7.531166,0.087731,1,0.0,0,-1864.28
2,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,11668.14,9.364703,0.280788,1,0.0,0,-11668.14
3,1,7817.71,53860.0,46042.29,0.0,0.0,0,0,7817.71,8.964275,0.145146,1,0.0,0,-7817.71
4,1,7107.77,183195.0,176087.23,0.0,0.0,0,0,7107.77,8.869085,0.038799,1,0.0,0,-7107.77


In [7]:
# Checking Multicollinearity using VIF
import statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_num = df.select_dtypes(include=[np.number])  
vif_data = pd.DataFrame()
vif_data["Feature"] = X_num.columns
vif_data["VIF"] = [variance_inflation_factor(X_num.values, i) for i in range(X_num.shape[1])]
print(vif_data)

  vif = 1. / (1. - r_squared_i)


                   Feature           VIF
0                     step  3.823407e+00
1                   amount  9.490302e+00
2            oldbalanceOrg  7.370850e+07
3           newbalanceOrig  3.199700e+09
4           oldbalanceDest           inf
5           newbalanceDest           inf
6                  isFraud  1.362792e+00
7           isFlaggedFraud  1.049718e+00
8       transaction_amount  2.620349e+01
9   log_transaction_amount  7.329943e+00
10    balance_change_ratio  4.854259e+00
11      is_destination_new  3.170059e+00
12     dest_balance_change           inf
13         is_origin_empty  3.630650e+00
14   origin_balance_change  1.115802e+10


In [8]:
print(df['isFraud'].value_counts())  # Should show both 0 and 1

isFraud
0    4319031
1       8213
Name: count, dtype: int64


### Splitting the Data

#### Select Features and Target

In [9]:
X = df.drop(columns=['isFraud'])
y = df['isFraud']

In [10]:
print("Number of fraud cases:", df[df['isFraud'] == 1].shape[0])
print("Number of non-fraud cases:", df[df['isFraud'] == 0].shape[0])


Number of fraud cases: 8213
Number of non-fraud cases: 4319031


#### Splitting into Train and Test sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#### Applying SMOTEENN

In [12]:
smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

### Training the model

In [13]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [50, 100],  # Reduce the number of trees
    'max_depth': [10, 20, None],  
    'min_samples_split': [5, 10],  # Increase min_samples_split to reduce overfitting
    'min_samples_leaf': [2, 4],  
    'bootstrap': [True]
}

rf = RandomForestClassifier(random_state=42)
rf_search = RandomizedSearchCV(rf, param_grid, cv=2, n_iter=5, scoring='f1', n_jobs=-1, random_state=42)
rf_search.fit(X_resampled, y_resampled)

best_rf = rf_search.best_estimator_
print(f"Best Parameters: {rf_search.best_params_}")


Best Parameters: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}


In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred_probs = best_rf.predict_proba(X_test)[:, 1]  # Getting fraud probabilities

y_pred_adjusted = (y_pred_probs > 0.3).astype(int)

# Recalculating metrics
precision_adj = precision_score(y_test, y_pred_adjusted)
recall_adj = recall_score(y_test, y_pred_adjusted)
f1_adj = f1_score(y_test, y_pred_adjusted)

print(f"Adjusted Precision: {precision_adj:.4f}")
print(f"Adjusted Recall: {recall_adj:.4f}")
print(f"Adjusted F1 Score: {f1_adj:.4f}")

Adjusted Precision: 0.5986
Adjusted Recall: 0.9781
Adjusted F1 Score: 0.7427


### These scores suggest that the model is very effective at detecting fraud cases (high recall of 0.9781). It's catching 97.81% of actual fraud cases.

# 🔎Q & 🅰️

## 📍 Q.1. Process of Data Cleaning?

### • For Missing Values – Used .fillna(df.median(), inplace=True) to replace missing values with the median.
### • For Outlier treatment – Used the Interquartile Range (IQR) method to remove extreme outliers.
### • For Multi-Collinearity Check – Used Variance Inflation Factor (VIF) to detect multicollinearity and Features with a VIF > 10 were considered for removal to avoid redundancy

## 📍 Q.2. Elaborate the Model.

### The model is based on a Random Forest Classifier, which is an ensemble learning method that builds multiple decision trees and combines their outputs to improve accuracy.
### • Workflow:
### 1. Data Preprocessing – Handled missing values, outliers, and created new meaningful features.

### 2. Feature Engineering – Derived features like balance_change_ratio, is_destination_new, etc.

### 3.Resampling the Data – Used SMOTE to handle class imbalance.

### 4. Hyperparameter Tuning – Used RandomizedSearchCV to optimize model parameters.

### 5. Evaluation Metrics – Measured Precision, Recall, F1-score, and AUC-ROC.



## 📍 Q.3. Which variables are selected for the Model?

### - Selected transaction-related features relevant to fraud detection.
### - Removed highly correlated features to avoid redundancy.
### - Dropped features with high collinearity.
### - Kept only the most influential predictors.

## 📍 Q.4. Performance of the Model using the Best Set of Tools.

### Metrics Used:

### ➢ Precision: 0.5986

### ➢ Recall: 0.9781

### ➢ F1 Score: 0.7427

## 📍 Q.5. What Are the Key Factors That Predict Fraudulent Customers?

### 1. Transaction Amount – Large transactions often indicate fraud.

### 2. Balance Change Ratio – Fraudsters tend to deplete an account’s balance in a single transaction.

### 3. Is Destination New? – Many fraudulent transactions are sent to newly created accounts.

### 4. Origin Balance Change – A large change in the sender’s balance suggests fraudulent behavior.

## 📍 Q.6. Do These Factors Make Sense? If Yes, How? If Not, How Not?

### Yes, these factors make sense because they align with common fraud patterns:

### ➢ Fraudsters often make high-value transactions.

### ➢ They may create new recipient accounts to receive stolen funds.

### ➢ Sudden draining of an account is a red flag for fraud.

## 📍 Q.7. What Kind of Prevention Should Be Adopted While the Company Updates Its Infrastructure?

### 1. Multi-Factor Authentication (MFA) – Reduce unauthorized access.

### 2. Transaction Limits & Alerts – Notify users for high-value or unusual transactions.

### 3. Behavioral Analytics – Track user behavior for deviations from normal activity.

## 📍 Q.8. Assuming These Actions Have Been Implemented, How would They Work?

### Monitor Key Performance Indicators (KPIs):

### 1. Reduction in Fraud Cases – Compare fraud cases before and after implementation.

### 2. Customer Complaints – Monitor feedback related to fraud detection accuracy.
