## Importing necessary libraries.

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

## Loading the dataset.

In [2]:
df = pd.read_csv("fraud.csv")

## Analysing the data

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [5]:
#Checking if there there exits any null values.
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

It seems the dataset does not have any null values.

In [6]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


The size of the data it really big (around 500MB). This would take more time for normal GPU to respond.  

In [7]:
df['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [9]:
legit = len(df[df.isFraud == 0])
fraud = len(df[df.isFraud == 1])

In [10]:
print("Number of Legit transactions: ", legit)
print("Number of Fraud transactions: ", fraud)

Number of Legit transactions:  6354407
Number of Fraud transactions:  8213


In [11]:
legitPercentage = (legit / (fraud + legit)) * 100
fraudPercentage = (fraud / (fraud + legit)) * 100

In [12]:
print("Percentage of Legit transactions: {:.4f} %".format(legitPercentage))
print("Percentage of Fraud transactions: {:.4f} %".format(fraudPercentage))

Percentage of Legit transactions: 99.8709 %
Percentage of Fraud transactions: 0.1291 %


Here, the last four cell we were calculating the number of fraud and legit transactions. The results shows that data is unbalanced where the percentage of legit transaction is around 99.8% and fraud transaction is 0.12%.
So, the two model that I have selected for this data are Random Forest and Logistic Regression though this models take so much of time to process but will give the best results according to me.

In [13]:
#creating a copy of original dataset to train and test models

new_df=df.copy()
new_df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Label Encoding

In [14]:
objList = new_df.select_dtypes(include = "object").columns
print (objList)

Index(['type', 'nameOrig', 'nameDest'], dtype='object')


In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for feat in objList:
    new_df[feat] = le.fit_transform(new_df[feat].astype(str))

In [16]:
print (new_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            int32  
 2   amount          float64
 3   nameOrig        int32  
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        int32  
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int32(3), int64(3)
memory usage: 461.2 MB
None


In [17]:
!pip install statsmodels



## Multicolinearity

In [18]:
# Import library for VIF

from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(df):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = df.columns
    vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]

    return(vif)

calc_vif(new_df)

Unnamed: 0,variables,VIF
0,step,2.79161
1,type,4.467405
2,amount,4.149312
3,nameOrig,2.764234
4,oldbalanceOrg,576.803777
5,newbalanceOrig,582.709128
6,nameDest,3.300975
7,oldbalanceDest,73.349937
8,newbalanceDest,85.005614
9,isFraud,1.195305


In [19]:
new_df['Actual_amount_orig'] = new_df.apply(lambda x: x['oldbalanceOrg'] - x['newbalanceOrig'],axis=1)
new_df['Actual_amount_dest'] = new_df.apply(lambda x: x['oldbalanceDest'] - x['newbalanceDest'],axis=1)
new_df['TransactionPath'] = new_df.apply(lambda x: x['nameOrig'] + x['nameDest'],axis=1)

#Dropping columns
new_df = new_df.drop(['oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','step','nameOrig','nameDest'],axis=1)

calc_vif(new_df)

Unnamed: 0,variables,VIF
0,type,2.687803
1,amount,3.818902
2,isFraud,1.184479
3,isFlaggedFraud,1.002546
4,Actual_amount_orig,1.30791
5,Actual_amount_dest,3.754335
6,TransactionPath,2.677167


## Model Building

In [20]:
from sklearn.preprocessing import StandardScaler

## Normalizing Amount

In [21]:
scaler = StandardScaler()
new_df["NormalizedAmount"] = scaler.fit_transform(new_df["amount"].values.reshape(-1, 1))
new_df.drop(["amount"], inplace= True, axis= 1)

Y = new_df["isFraud"]
X = new_df.drop(["isFraud"], axis= 1)

## Train-Test Split

In [22]:
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, test_size= 0.3, random_state= 42)

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (4453834, 6)
Shape of X_test:  (1908786, 6)


## Model Training 

In [23]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators= 100)
random_forest.fit(X_train, Y_train)

Y_pred_rf = random_forest.predict(X_test)
random_forest_score = random_forest.score(X_test, Y_test) * 100

In [24]:
#Logistic Regression

logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, Y_train)

Y_pred_lr = logistic_regression.predict(X_test)
logistic_regression_score = logistic_regression.score(X_test, Y_test) * 100

In [30]:
print("Random Forest Score: ", random_forest_score)
print("Logistic Regression Score: ", logistic_regression_score)

Random Forest Score:  99.95892677335227
Logistic Regression Score:  99.8032781045125


## Evaluation

In [26]:
rf_accuracy = accuracy_score(Y_test, Y_pred_rf)
rf_precision = precision_score(Y_test, Y_pred_rf)
rf_recall = recall_score(Y_test, Y_pred_rf)
rf_f1_score = f1_score(Y_test, Y_pred_rf)
rf_roc_auc_score = roc_auc_score(Y_test, Y_pred_rf)

In [27]:
print("Random Forest Evaluation Metrics:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1_score)
print("ROC AUC Score:", rf_roc_auc_score)

Random Forest Evaluation Metrics:
Accuracy: 0.9995892677335227
Precision: 0.9645469893078221
Recall: 0.7039014373716632
F1 Score: 0.8138651471984806
ROC AUC Score: 0.8519341949711536


In [28]:
lr_accuracy = accuracy_score(Y_test, Y_pred_lr)
lr_precision = precision_score(Y_test, Y_pred_lr)
lr_recall = recall_score(Y_test, Y_pred_lr)
lr_f1_score = f1_score(Y_test, Y_pred_lr)
lr_roc_auc_score = roc_auc_score(Y_test, Y_pred_lr)

In [29]:
print("\nLogistic Regression Evaluation Metrics:")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1 Score:", lr_f1_score)
print("ROC AUC Score:", lr_roc_auc_score)


Logistic Regression Evaluation Metrics:
Accuracy: 0.998032781045125
Precision: 0.29127134724857684
Recall: 0.3782340862422998
F1 Score: 0.329104877613007
ROC AUC Score: 0.6885292709847491


Here Random Forest looks good.

## Conclusion

The models used here are Random forest and Logistic Regression which both seem to have equal accuracy, but the precision of random forest is more. In models like fraud detection precision is very important to consider as it highly important to have correct information of fraud transactions then actually having legit transaction. The model training for this kind of problem is maininly to find the nature of fraud and to have been able to find the required outcome. And the most importantly, the selection of this is because it works pretty good with this kind of Unbalanced data.

How did you select variables to be included in the model?

Using the VIF values and correlation heatmap. We just need to check if there are any two attributes highly correlated to each other and then drop the one which is less correlated to the isFraud Attribute.

What are the key factors that predict fraudulent customer?

1. Unusual transaction patterns, including large amounts, high frequency, and abnormal timing, can indicate fraudulent behavior.
2. Account activity and history, such as sudden balance changes or past fraudulent incidents, provide valuable insights into potential fraud risks.
3. Behavioral biometrics and device information, such as IP address, geolocation, and user behavior patterns, help detect anomalies and identify potential fraud attempts.

What kind of prevention should be adopted while company update its infrastructure?

1. Implement multi-factor authentication (MFA) to add an extra layer of security for user authentication.
2. Regularly update and patch software and systems to address known vulnerabilities and prevent exploitation by attackers.
3. Conduct regular security awareness training for employees to educate them about common threats and best practices for safeguarding sensitive information.
4. Employ encryption techniques to protect data both in transit and at rest, reducing the risk of unauthorized access or data breaches.
5. Implement intrusion detection and prevention systems (IDPS) to monitor network traffic and detect suspicious activity, allowing for timely response and mitigation of potential threats.

Assuming these actions have been implemented, how would you determine if they work?

1. Track reduction in unauthorized access attempts.
2. Measure decrease in security vulnerabilities.
3. Assess increase in employee security awareness.