In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install imbalanced-learn


In [None]:
#Import Python libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the fraud-detection CSV into a DataFrame named `data`.
FRAUD_CSV_PATH = "/kaggle/input/financial-dataset-for-fraud-detection-in-a-comapny/Fraud.csv"
data = pd.read_csv(FRAUD_CSV_PATH)

# Analyzing the data

In [None]:
# Dimensions of the raw dataset as a (rows, columns) tuple
data.shape

In [None]:
# Preview the first five rows of the raw data
data.head(n=5)

In [None]:
# Concise summary of the DataFrame (columns, dtypes, non-null counts)
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) as reported by DataFrame.describe()
data.describe()

In [None]:
# Column labels (feature names) of the dataset
data.columns

# Data Preprocessing

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:
# Count of missing values per column
data.isna().sum()

In [None]:
# How many rows fall into each isFraud class
data["isFraud"].value_counts()

In [None]:
# How many rows fall into each isFlaggedFraud class
data["isFlaggedFraud"].value_counts()

In [None]:
# Recipients whose name starts with "M" are merchants, and there is no
# information for them, so rows with such a recipient are dropped.
dest_is_merchant = data["nameDest"].str.startswith("M")
new_data = data.loc[~dest_is_merchant]
new_data

# Exploratory Data Analysis

In [None]:
# Summary of the categorical (object-dtype) columns only
new_data.describe(include=[object])

In [None]:
# Convert the categorical (string) columns to integer codes.
from sklearn.preprocessing import LabelEncoder

# Work on an explicit copy so the assignments below do not write into a slice of `data`.
new_data = new_data.copy()

# One fitted encoder per column, so each column's codes can later be mapped back
# with label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in ['type', 'nameOrig', 'nameDest']:
    label_encoders[col] = LabelEncoder()
    # Assign the whole column (new_data[col] = ...) so it is replaced together with
    # its dtype. Under pandas >= 2.0, `new_data.loc[:, col] = ...` writes into the
    # existing object-dtype column in place, leaving the encoded values as `object`.
    new_data[col] = label_encoders[col].fit_transform(new_data[col])

# Kept for backward compatibility: `label` was previously the single encoder whose
# final fit was on 'nameDest'.
label = label_encoders['nameDest']

In [None]:
# Preview the first rows of new_data after the label-encoding step
new_data.head()

In [None]:
# Summary of new_data (columns, dtypes, non-null counts) to check the result of the encoding step
new_data.info()

In [None]:
# Heatmap of the pairwise correlation matrix, each cell annotated with its value
corr_matrix = new_data.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Boxplots for visualizing outliers: one horizontal boxplot per column, stacked vertically.
boxplot_columns = ['step', 'type', 'amount', 'nameOrig',
                   'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest',
                   'newbalanceDest', 'isFraud', 'isFlaggedFraud']
features = new_data[boxplot_columns]

# Size the subplot grid from the number of columns instead of a hard-coded 20 rows,
# so the figure has exactly one panel per column and no empty slots.
fig, axes = plt.subplots(len(boxplot_columns), 1, figsize=(20, 20), squeeze=False)
for ax, column in zip(axes.ravel(), boxplot_columns):
    sns.boxplot(x=features[column], ax=ax)
    ax.set_xlabel(column, fontsize=20)

# Keep the large x-axis labels from overlapping the panel below them.
fig.tight_layout()
plt.show()

# Building the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Predictor columns go into X; the isFraud label is the target y
feature_columns = ['isFlaggedFraud','amount','oldbalanceOrg','newbalanceOrig','step','type','nameOrig']
X = new_data[feature_columns]
y = new_data['isFraud']

In [None]:
# Using random undersampling to handle the imbalanced data:
# random records are removed from the majority class.
# random_state=42 fixes the random draw so the same rows are kept on every run.
# NOTE(review): with the sampler's default strategy this is expected to equalise the
# class counts — confirm against the imbalanced-learn documentation.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [None]:
# Training, validation and test data sets.
# Split the undersampled data (X_resampled / y_resampled), not the raw X / y:
# otherwise the undersampling result is never used and the model is fit on the
# original, imbalanced class distribution.
# stratify keeps the class proportions the same in every split.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)
# Second split: 20% of the remaining 80% becomes validation (64% / 16% / 20% overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
# NOTE(review): validation and test rows come from the resampled data, so the metrics
# reflect the resampled class mix rather than the original fraud rate. Consider
# holding out the test set before resampling if real-world precision matters.

In [None]:
# Create the random-forest classifier and fit it on the training split.
# random_state is fixed (matching the seed used for the splits) so that the fitted
# forest, and therefore every metric reported below, is reproducible across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predicted labels for the validation split (X_val); compared with y_val in the evaluation cell
y_val_pred = model.predict(X_val)

In [None]:
# Predicted labels for the held-out test split (X_test); compared with y_test in the evaluation cell
y_pred = model.predict(X_test)

In [None]:
# Accuracy on the validation split
accuracy_val = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", accuracy_val)

# Test-split performance: compute all four scores first, then report them in order
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

for metric_caption, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_caption, metric_value)