## Load & Explore the Data

In [168]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score,confusion_matrix,classification_report,f1_score

# Load the data
transactions = pd.read_csv('dataset/transactions.csv')
transactions.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,8,CASH_OUT,158007.12,C424875646,0.0,0.0,C1298177219,474016.32,1618631.97,0
1,236,CASH_OUT,457948.3,C1342616552,0.0,0.0,C1323169990,2720411.37,3178359.67,0
2,37,CASH_IN,153602.99,C900876541,11160428.67,11314031.67,C608741097,3274930.56,3121327.56,0
3,331,CASH_OUT,49555.14,C177696810,10865.0,0.0,C462716348,0.0,49555.14,0
4,250,CASH_OUT,29648.02,C788941490,0.0,0.0,C1971700992,56933.09,86581.1,0


## Transactions Info

In [169]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199999 entries, 0 to 199998
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            199999 non-null  int64  
 1   type            199999 non-null  object 
 2   amount          199999 non-null  float64
 3   nameOrig        199999 non-null  object 
 4   oldbalanceOrg   199999 non-null  float64
 5   newbalanceOrig  199999 non-null  float64
 6   nameDest        199999 non-null  object 
 7   oldbalanceDest  199999 non-null  float64
 8   newbalanceDest  199999 non-null  float64
 9   isFraud         199999 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


# Summary statistics on amount column

In [170]:
transactions['amount'].describe()

count    1.999990e+05
mean     1.802425e+05
std      6.255482e+05
min      0.000000e+00
25%      1.338746e+04
50%      7.426695e+04
75%      2.086376e+05
max      5.204280e+07
Name: amount, dtype: float64

## Checking for null Features

In [171]:
transactions.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

## Feature Engineering

In [172]:
# creating isPayment field
transactions['isPayment'] = [1 if x in ['PAYMENT','DEBIT'] else 0 for x in transactions['type']]

# Creatin isMovement field
transactions['isMovement'] = [1 if x in ['CASH_OUT','TRANSFER'] else 0 for x in transactions['type']]

# Creating accountDiff field
transactions['accountDiff'] = abs(transactions['oldbalanceDest'] - transactions['oldbalanceOrg'])

transactions[['isPayment','isMovement','accountDiff']]

Unnamed: 0,isPayment,isMovement,accountDiff
0,0,1,474016.32
1,0,1,2720411.37
2,0,0,7885498.11
3,0,1,10865.00
4,0,1,56933.09
...,...,...,...
199994,1,0,69376.00
199995,0,1,40423.00
199996,0,0,2588672.49
199997,0,0,1429025.02


## Model Training

In [173]:
# features and label variables
features = transactions[['amount','isPayment','isMovement','accountDiff']] # x

label = transactions['isFraud'] # y

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    label, 
                                                    test_size=0.3,random_state=42)


## Normalizing the features variables

In [174]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

print(X_train_scaled.shape,y_train.shape)

(139999, 4) (139999,)


# Evaluating models

In [175]:
print("\nLogistic Regression Results:")
log_reg = LogisticRegression()

kf=KFold(n_splits=5,shuffle=True ,random_state=42)

cv_results=cross_val_score(log_reg,features,label,cv=kf)
log_reg.fit(X_train_scaled, y_train)
print("\nModel Coefficients:")
print(log_reg.coef_)
print("\nCross Val Score:")
print(cv_results)




Logistic Regression Results:

Model Coefficients:
[[ 0.27085886 -0.86017198  2.1629665  -0.93256293]]

Cross Val Score:
[0.9986     0.99845    0.998475   0.99885    0.99842496]


# Testing and Predicting

In [177]:
# # New transaction data
# transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
# transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
# transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])
# your_transaction = np.array([6472.54, 1.0, 0.0, 55901.23])

# # Combining and predicting
# sample_transactions = np.stack((transaction1, transaction2, transaction3, your_transaction))
# sample_transactions_scaled = scaler.transform(sample_transactions)

# print("\nFraud Predictions for New Transactions:")
# for i, tx in enumerate(sample_transactions_scaled):
#     proba = log_reg.predict_proba(tx.reshape(1, -1))[0] 
#     print(f"Transaction {i+1}:")
#     print(f"  Fraud Probability: {proba[1]:.4f}")
#     print(f"  Prediction: {'Fraud' if proba[1] > 0.5 else 'Legitimate'}")
#     print("-"*40)
# New transaction data
sample_transactions = np.array([
    [123456.78, 0.0, 1.0, 54670.1],
    [98765.43, 1.0, 0.0, 8524.75],
    [543678.31, 1.0, 0.0, 510025.5],
    [6472.54, 1.0, 0.0, 55901.23]
])

# Normalize new data
sample_transactions_scaled = scaler.transform(sample_transactions)

# Predict fraud
print("Fraud Predictions:", log_reg.predict(sample_transactions_scaled))
print("Fraud Probabilities:", log_reg.predict_proba(sample_transactions_scaled))

Fraud Predictions: [0 0 0 0]
Fraud Probabilities: [[9.96528861e-01 3.47113881e-03]
 [9.99992733e-01 7.26678035e-06]
 [9.99992107e-01 7.89307951e-06]
 [9.99993107e-01 6.89346538e-06]]




---

## 📌 To-Do

* Add confusion matrix and classification report
* Try other models: Random Forest, XGBoost
* Handle class imbalance with oversampling / weighting

---