In [50]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Load the raw transaction records from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='fraudTrain.csv')

In [4]:
# Peek at the first five records to see the column layout.
df.head(n=5)

Unnamed: 0,ID,Time,Card Number,merchant,category,Amount,firstName,lastName,trans_num,is_fraud
0,0,1/1/2019 0:00,2700000000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,0b242abb623afc578575680df30655b9,0.0
1,1,1/1/2019 0:00,630000000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,1f76529f8574734946361c461b024d99,0.0
2,2,1/1/2019 0:00,38900000000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,a1a22d70485983eac12b5b88dad1cf95,0.0
3,3,1/1/2019 0:01,3530000000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,6b849c168bdad6f867558c3793159a81,0.0
4,4,1/1/2019 0:03,376000000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,a41d7549acf90789359a9aa5346dcb46,0.0


# Exploratory Data Analysis

In [5]:
# Number of missing values in each column.
df.isna().sum()

ID             0
Time           0
Card Number    0
merchant       0
category       0
Amount         1
firstName      1
lastName       1
trans_num      1
is_fraud       1
dtype: int64

In [6]:
# Column dtypes as inferred by read_csv for the freshly loaded frame.
df.dtypes

ID               int64
Time            object
Card Number    float64
merchant        object
category        object
Amount         float64
firstName       object
lastName        object
trans_num       object
is_fraud       float64
dtype: object

In [7]:
# Flag each row that has at least one missing field, then pull those rows out.
has_missing_value = df.isna().any(axis='columns')
null_rows = df.loc[has_missing_value]

In [85]:
# Display the rows that contain missing values.
# NOTE(review): the only such row has a category of 'kids_' and every later
# field empty, which looks like a record cut off mid-line — confirm the
# CSV was not truncated during download/export.
null_rows

Unnamed: 0,ID,Time,Card Number,merchant,category,Amount,firstName,lastName,trans_num,is_fraud
1002103,1002103,2/15/2020 16:43,2270000000000000.0,fraud_Brown Inc,kids_,,,,,


In [32]:
# Discard every row that has at least one missing value.
# Rebinding `df` instead of passing inplace=True follows the pandas
# recommendation: it keeps the step chainable and avoids mutating an object
# that an earlier cell may still reference.
df = df.dropna(axis=0, how='any')

In [33]:
# (rows, columns) remaining after the incomplete rows were dropped.
df.shape

(1002103, 10)

In [34]:
# Share of each class in percent — the data set is heavily imbalanced,
# with fraud making up well under 1% of the transactions.
df['is_fraud'].value_counts().div(len(df)).mul(100)

is_fraud
0.0    99.429899
1.0     0.570101
Name: count, dtype: float64

In [35]:
# Number of transactions per merchant category, most frequent first.
df['category'].value_counts()

category
gas_transport     101728
grocery_pos        95341
home               95175
shopping_pos       90140
kids_pets          87321
shopping_net       75411
entertainment      72707
food_dining        70862
personal_care      70309
health_fitness     66267
misc_pos           61658
misc_net           48756
grocery_net        35061
travel             31367
Name: count, dtype: int64

# Data Preprocessing

#### Removing ID column

In [36]:
# Remove the 'ID' column; in the rows displayed above it simply repeats the
# row index, so it carries no extra information.
# errors='ignore' makes the cell safe to re-run: with the original
# inplace drop, a second execution raised KeyError because 'ID' was
# already gone.
df = df.drop(columns='ID', errors='ignore')

In [38]:
# Confirm the 'ID' column is gone by looking at the first five rows.
df.head(n=5)

Unnamed: 0,Time,Card Number,merchant,category,Amount,firstName,lastName,trans_num,is_fraud
0,1/1/2019 0:00,2700000000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,0b242abb623afc578575680df30655b9,0.0
1,1/1/2019 0:00,630000000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,1f76529f8574734946361c461b024d99,0.0
2,1/1/2019 0:00,38900000000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,a1a22d70485983eac12b5b88dad1cf95,0.0
3,1/1/2019 0:01,3530000000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,6b849c168bdad6f867558c3793159a81,0.0
4,1/1/2019 0:03,376000000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,a41d7549acf90789359a9aa5346dcb46,0.0


In [46]:
# Parse the raw 'Time' strings into proper datetimes.
# The values are month/day/year hour:minute (e.g. '2/15/2020 16:43'), which
# is ambiguous for days <= 12. Passing the format explicitly pins the
# month-first reading instead of relying on pandas' format inference, and
# makes any row that deviates from this layout fail loudly.
df['Time_timestamp'] = pd.to_datetime(df['Time'], format='%m/%d/%Y %H:%M')

In [72]:
# Average transaction amount.
# Series.mean() ignores NaN by default, which is exactly what the previous
# sum() / value_counts().sum() expression computed — but without building a
# full frequency table just to count the non-null rows.
df['Amount'].mean()

70.25615704174125

In [76]:
# Inspect the leading rows, now including the parsed 'Time_timestamp' column.
df.head(11)

Unnamed: 0,Time,Card Number,merchant,category,Amount,firstName,lastName,trans_num,is_fraud,Time_timestamp
0,1/1/2019 0:00,2700000000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,0b242abb623afc578575680df30655b9,0.0,2019-01-01 00:00:00
1,1/1/2019 0:00,630000000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,1f76529f8574734946361c461b024d99,0.0,2019-01-01 00:00:00
2,1/1/2019 0:00,38900000000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,a1a22d70485983eac12b5b88dad1cf95,0.0,2019-01-01 00:00:00
3,1/1/2019 0:01,3530000000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,6b849c168bdad6f867558c3793159a81,0.0,2019-01-01 00:01:00
4,1/1/2019 0:03,376000000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,a41d7549acf90789359a9aa5346dcb46,0.0,2019-01-01 00:03:00
5,1/1/2019 0:04,4770000000000000.0,"fraud_Stroman, Hudson and Erdman",gas_transport,94.63,Jennifer,Conner,189a841a0a8ba03058526bcfe566aab5,0.0,2019-01-01 00:04:00
6,1/1/2019 0:04,30100000000000.0,fraud_Rowe-Vandervort,grocery_net,44.54,Kelsey,Richards,83ec1cc84142af6e2acf10c44949e720,0.0,2019-01-01 00:04:00
7,1/1/2019 0:05,6010000000000000.0,fraud_Corwin-Collins,gas_transport,71.65,Steven,Williams,6d294ed2cc447d2c71c7171a3d54967c,0.0,2019-01-01 00:05:00
8,1/1/2019 0:05,4920000000000000.0,fraud_Herzog Ltd,misc_pos,4.27,Heather,Chase,fc28024ce480f8ef21a32d64c93a29f5,0.0,2019-01-01 00:05:00
9,1/1/2019 0:06,2720000000000000.0,"fraud_Schoen, Kuphal and Nitzsche",grocery_pos,198.39,Melissa,Aguilar,3b9014ea8fb80bd65de0b1463b00b00e,0.0,2019-01-01 00:06:00


In [81]:
# Split the parsed timestamp into calendar components.
# Grab the datetime accessor once and reuse it for every derived column.
timestamp_accessor = df['Time_timestamp'].dt
df['Time_timestamp:year'] = timestamp_accessor.year
df['Time_timestamp:month'] = timestamp_accessor.month
df['Time_timestamp:day'] = timestamp_accessor.day
df['Time_timestamp:dayofweek'] = timestamp_accessor.dayofweek  # Monday == 0
df['Time_timestamp:hour'] = timestamp_accessor.hour

# Show the source column side by side with the derived components.
timestamp_preview_columns = [
    'Time_timestamp',
    'Time_timestamp:year',
    'Time_timestamp:month',
    'Time_timestamp:day',
    'Time_timestamp:dayofweek',
    'Time_timestamp:hour',
]
df[timestamp_preview_columns].head()

Unnamed: 0,Time_timestamp,Time_timestamp:year,Time_timestamp:month,Time_timestamp:day,Time_timestamp:dayofweek,Time_timestamp:hour
0,2019-01-01 00:00:00,2019,1,1,1,0
1,2019-01-01 00:00:00,2019,1,1,1,0
2,2019-01-01 00:00:00,2019,1,1,1,0
3,2019-01-01 00:01:00,2019,1,1,1,0
4,2019-01-01 00:03:00,2019,1,1,1,0


In [83]:
# Re-check the dtypes now that the parsed timestamp and its derived
# year/month/day/dayofweek/hour columns have been added.
df.dtypes

Time                                object
Card Number                        float64
merchant                            object
category                            object
Amount                             float64
firstName                           object
lastName                            object
trans_num                           object
is_fraud                           float64
Time_timestamp              datetime64[ns]
Time_timestamp:year                  int32
Time_timestamp:month                 int32
Time_timestamp:day                   int32
Time_timestamp:dayofweek             int32
Time_timestamp:hour                  int32
dtype: object