In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

%matplotlib inline

In [2]:
# Read the transaction table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/Fraud.csv")
df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [18]:
# Store both fraud indicator columns with object dtype instead of integers.
# Presumably this is so the object-based summaries and selectors used later
# (describe(include="object"), select_dtypes(include=["object"])) pick them up.
for label_col in ("isFraud", "isFlaggedFraud"):
    df[label_col] = df[label_col].astype("object")

In [19]:
# Print the index range, per-column dtypes and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         object 
 10  isFlaggedFraud  object 
dtypes: float64(5), int64(1), object(5)
memory usage: 534.0+ MB


In [20]:
# Remove rows that are exact duplicates across every column (first occurrence
# is kept); the original index labels are not renumbered.
df = df.drop_duplicates()

In [21]:
# Count distinct values per column to gauge the cardinality of each feature.
df.nunique()

step                  743
type                    5
amount            5316900
nameOrig          6353307
oldbalanceOrg     1845844
newbalanceOrig    2682586
nameDest          2722362
oldbalanceDest    3614697
newbalanceDest    3555499
isFraud                 2
isFlaggedFraud          2
dtype: int64

In [22]:
# (rows, columns) of the frame after de-duplication.
df.shape

(6362620, 11)

In [23]:
# Number of missing values in each column.
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [24]:
# Number of missing values in each column.
# NOTE(review): DataFrame.isnull is an alias of DataFrame.isna, so this cell
# repeats the check made in the previous cell; one of the two can be removed.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [25]:
# Render floats with two fixed decimals in all subsequent pandas output.
pd.options.display.float_format = "{:.2f}".format

In [26]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.4,179861.9,833883.1,855113.67,1100701.67,1224996.4
std,142.33,603858.23,2888242.67,2924048.5,3399180.11,3674128.94
min,1.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.66,214661.44
75%,335.0,208721.48,107315.18,144258.41,943036.71,1111909.25
max,743.0,92445516.64,59585040.37,49585040.37,356015889.35,356179278.92


In [31]:
# Summary of the object-dtype columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include="object")

Unnamed: 0,type,nameOrig,nameDest,isFraud,isFlaggedFraud
count,6362620,6362620,6362620,6362620,6362620
unique,5,6353307,2722362,2,2
top,CASH_OUT,C1530544995,C1286084959,0,0
freq,2237500,3,113,6354407,6362604


In [33]:
# How many transactions there are of each transaction type.
df["type"].value_counts()

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [34]:
# Fraud / non-fraud counts within each transaction type.
# In the output below, isFraud == 1 appears only for CASH_OUT and TRANSFER.
df.groupby("type")["isFraud"].value_counts()

type      isFraud
CASH_IN   0          1399284
CASH_OUT  0          2233384
          1             4116
DEBIT     0            41432
PAYMENT   0          2151495
TRANSFER  0           528812
          1             4097
Name: count, dtype: int64

In [35]:
# Cross-tabulate each account-identifier column against the fraud label.
# Each table has one row per distinct account ID and one column per isFraud
# value; they feed the chi-square independence tests in the following cells.
# (chi2_contingency is imported in the notebook's top import cell.)
#
# The df.nunique() output above reports about 6.35M distinct nameOrig and
# 2.72M distinct nameDest values over about 6.36M rows, so both tables are
# very large and very sparse.
# NOTE(review): chi-square tests assume reasonably large expected cell
# counts; confirm the downstream tests are meaningful at this cardinality.
contingency_nameOrig = pd.crosstab(df["nameOrig"], df["isFraud"])
contingency_nameDest = pd.crosstab(df["nameDest"], df["isFraud"])

In [43]:
# Chi-square test of independence on the nameOrig x isFraud contingency table.
statistic, p_value, dof, expected = chi2_contingency(contingency_nameOrig)

# Report the decision at the 0.05 significance threshold.
print(
    "Reject the null hypothesis: There is a significant association between nameOrig and isFraud."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis: There is no significant association between nameOrig and isFraud."
)

Fail to reject the null hypothesis: There is no significant association between nameOrig and isFraud.


In [44]:
# Chi-square test of independence on the nameDest x isFraud contingency table.
statistic, p_value, dof, expected = chi2_contingency(contingency_nameDest)

# Report the decision at the 0.05 significance threshold.
print(
    "Reject the null hypothesis: There is a significant association between nameDest and isFraud."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis: There is no significant association between nameDest and isFraud."
)

Reject the null hypothesis: There is a significant association between nameDest and isFraud.


In [51]:
# Discard the originator ID column; the test above did not find a
# significant association between it and isFraud.
df = df.drop(columns="nameOrig")

In [52]:
# Number of transactions received by each destination account.
nameDest_val_counts = df["nameDest"].value_counts()
# Destination IDs that occur more than once.
frequent_vals = nameDest_val_counts.loc[nameDest_val_counts.gt(1)].index
# Keep the ID for repeated destinations; collapse one-off destinations into "other".
df["dest_group"] = df["nameDest"].mask(~df["nameDest"].isin(frequent_vals), "other")
# Resulting number of distinct groups (including "other").
df["dest_group"].nunique(dropna=False)

459659

In [54]:
# The raw destination ID is superseded by dest_group, so remove it.
df = df.drop(columns="nameDest")

In [55]:
# Preview the first 20 rows to check the new dest_group column.
df.head(20)

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,dest_group
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0,other
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0,other
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0,C553264065
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0,C38997010
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0,other
5,1,PAYMENT,7817.71,53860.0,46042.29,0.0,0.0,0,0,other
6,1,PAYMENT,7107.77,183195.0,176087.23,0.0,0.0,0,0,other
7,1,PAYMENT,7861.64,176087.23,168225.59,0.0,0.0,0,0,other
8,1,PAYMENT,4024.36,2671.0,0.0,0.0,0.0,0,0,other
9,1,DEBIT,5337.77,41720.0,36382.23,41898.0,40348.79,0,0,C195600860


In [56]:
# Partition the column names by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
# NOTE(review): isFraud and isFlaggedFraud were cast to object earlier, so
# they land in categorical_vals alongside the real features; confirm that
# the target label is excluded before these lists are used for modelling.
numerical_vals = list(df.select_dtypes(include=["int64", "float64"]).columns)
categorical_vals = list(df.select_dtypes(include=["object"]).columns)