In [1]:

# load libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# load the dataset 

Transaction =pd.read_csv("transactions_modified.csv", delimiter=";")
Transaction.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isPayment,isMovement,accountDiff
0,206,CASH_OUT,62927.08,C473782114,0.0,0.0,C2096898696,649420.67,712347.75,0,0,1,649420.67
1,380,PAYMENT,32851.57,C1915112886,0.0,0.0,M916879292,0.0,0.0,0,1,0,0.0
2,570,CASH_OUT,1131750.38,C1396198422,1131750.38,0.0,C1612235515,313070.53,1444820.92,1,0,1,818679.85
3,184,CASH_OUT,60519.74,C982551468,60519.74,0.0,C1378644910,54295.32,182654.5,1,0,1,6224.42
4,162,CASH_IN,46716.01,C1759889425,7668050.6,7714766.61,C2059152908,2125468.75,2078752.75,0,0,0,5542581.85


In [3]:
Transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            1000 non-null   int64  
 1   type            1000 non-null   object 
 2   amount          1000 non-null   float64
 3   nameOrig        1000 non-null   object 
 4   oldbalanceOrg   1000 non-null   float64
 5   newbalanceOrig  1000 non-null   float64
 6   nameDest        1000 non-null   object 
 7   oldbalanceDest  1000 non-null   float64
 8   newbalanceDest  1000 non-null   float64
 9   isFraud         1000 non-null   int64  
 10  isPayment       1000 non-null   int64  
 11  isMovement      1000 non-null   int64  
 12  accountDiff     1000 non-null   float64
dtypes: float64(6), int64(4), object(3)
memory usage: 101.7+ KB


In this data set, we have 1000 entries and 13 columns and also 3 types datatype floate, int and object.

In [4]:
Transaction.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isPayment,isMovement,accountDiff
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,280.664,537308.0,1049284.0,637614.6,1028848.0,1302326.0,0.282,0.22,0.605,1744828.0
std,167.174593,1423692.0,3226500.0,2717351.0,2678541.0,3038042.0,0.450198,0.414454,0.489095,3792962.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,164.0,29337.05,128.75,0.0,0.0,0.0,0.0,0.0,0.0,42736.75
50%,261.0,126530.5,43714.0,0.0,38819.27,195657.2,0.0,0.0,1.0,338381.6
75%,373.25,301037.8,408091.4,37956.63,774716.7,1185959.0,1.0,0.0,1.0,1759913.0
max,741.0,10000000.0,50399050.0,40399050.0,30856510.0,31839620.0,1.0,1.0,1.0,50399050.0


Above table shows the mean, standard deviation, minimum value and maximum value in between respective column.

In [5]:
# find out missing values in the data set

Transaction.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isPayment         0
isMovement        0
accountDiff       0
dtype: int64

here is no missing values as per above information.

In [6]:
# Create isPayment field

Transaction['isPayment']=0
Transaction['isPayment'][Transaction['type'].isin(['PAYMENT','DEBIT'])]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Transaction['isPayment'][Transaction['type'].isin(['PAYMENT','DEBIT'])]=1


In [7]:
# Create isMovement field

Transaction['isMovement']=0
Transaction['isMovement'][Transaction['type'].isin(['CASH_OUT','TRANSFER'])]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Transaction['isMovement'][Transaction['type'].isin(['CASH_OUT','TRANSFER'])]=1


In [8]:
# Create accountDiff field

Transaction['accountDiff']=abs(Transaction['oldbalanceOrg']-Transaction['oldbalanceDest'])

In [9]:
Transaction

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isPayment,isMovement,accountDiff
0,206,CASH_OUT,62927.08,C473782114,0.00,0.00,C2096898696,649420.67,712347.75,0,0,1,649420.67
1,380,PAYMENT,32851.57,C1915112886,0.00,0.00,M916879292,0.00,0.00,0,1,0,0.00
2,570,CASH_OUT,1131750.38,C1396198422,1131750.38,0.00,C1612235515,313070.53,1444820.92,1,0,1,818679.85
3,184,CASH_OUT,60519.74,C982551468,60519.74,0.00,C1378644910,54295.32,182654.50,1,0,1,6224.42
4,162,CASH_IN,46716.01,C1759889425,7668050.60,7714766.61,C2059152908,2125468.75,2078752.75,0,0,0,5542581.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,298,CASH_OUT,173833.78,C1112634506,16302.00,0.00,C996800768,316909.31,490743.09,0,0,1,300607.31
996,58,TRANSFER,561948.38,C856217790,561948.38,0.00,C1278181974,0.00,0.00,1,0,1,561948.38
997,72,CASH_OUT,622235.32,C615309889,622235.32,0.00,C755984599,3377968.96,4000204.28,1,0,1,2755733.64
998,178,CASH_OUT,119604.13,C42162938,30678.00,0.00,C540527919,22457787.17,22577391.30,0,0,1,22427109.17


In [10]:
# Create features and label 
features= Transaction[['amount', 'isPayment', 'isMovement','accountDiff']]
label=Transaction['isFraud']

In [11]:
# test train split model 

x_train, x_test, y_train, y_test = train_test_split(features, label, test_size=0.3, random_state=23)

In [12]:
# Normalize the features variables
scaler= StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [13]:
# Fit the model to the training data 
model=LogisticRegression()
model.fit(x_train, y_train)


In [14]:
# score the model on the training data

print(model.score(x_train, y_train))


0.8514285714285714


In [15]:
# print model coefficients

print(model.coef_)

[[ 2.83793645 -0.66541684  2.09425633 -1.94591828]]


In [16]:
# predict the label
x_train_prediction=model.predict(x_train)
y_pred=model.predict(x_test)

In [17]:
# find out the accuracy function of prediction train data

train_data_accuracy=accuracy_score(y_train, x_train_prediction)
train_data_accuracy

0.8514285714285714

In [18]:
# find out the accuracy function of prediction test data

accuracy_pred=accuracy_score(y_pred, y_test)
print(accuracy_pred)

0.8233333333333334
