In [3]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [4]:
# Load the training transactions into a DataFrame and preview the first rows.
# NOTE(review): the preview output lists 'Unnamed' columns, which look like a
# saved row index — confirm against the CSV.
data = pd.read_csv('fraudTrain.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Parse the transaction timestamp and the date of birth into datetime64
# columns so the `.dt` accessor can be used on them in later cells.
for datetime_col in ('trans_date_trans_time', 'dob'):
    data[datetime_col] = pd.to_datetime(data[datetime_col])

In [6]:
# Show every column's dtype (the two converted columns should be datetime64[ns]).
data.dtypes

Unnamed: 0                        int64
trans_date_trans_time    datetime64[ns]
cc_num                            int64
merchant                         object
category                         object
amt                             float64
first                            object
last                             object
gender                           object
street                           object
city                             object
state                            object
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                              object
dob                      datetime64[ns]
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
dtype: object

In [7]:
# List the column names available for feature engineering.
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [8]:
# Number of rows (one per transaction) in the loaded frame.
len(data)

1296675

In [9]:
# Count rows per label value to see how imbalanced the target is.
data['is_fraud'].value_counts()

0    1289169
1       7506
Name: is_fraud, dtype: int64

In [11]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [12]:
# Feature engineering: cardholder identity and per-card activity.

# Case-insensitive "first last" key identifying the cardholder.
first_lower = data['first'].str.lower()
last_lower = data['last'].str.lower()
data['first_last'] = first_lower + ' ' + last_lower

# 1-based running index of each transaction within its card, in row order.
# NOTE(review): this is only a chronological count if rows are time-sorted — confirm.
per_card = data.groupby('cc_num')
data['cc_transaction_count'] = per_card.cumcount() + 1

# Number of distinct card numbers seen for each cardholder name.
cards_per_holder = data.groupby('first_last')['cc_num']
data['num_credit_cards'] = cards_per_holder.transform('nunique')

In [None]:
# Drop the leftover index column and the raw name parts (already combined
# into `first_last`).
data = data.drop(columns=['Unnamed: 0', 'first', 'last'])


def _txns_in_trailing_hour(timestamps):
    """Count one card's transactions in the 60 minutes ending at each transaction.

    Parameters
    ----------
    timestamps : pd.Series
        Transaction datetimes belonging to a single card (one groupby group).

    Returns
    -------
    pd.Series
        Integer counts aligned to ``timestamps.index``. Each count includes
        the transaction itself, so the minimum value is 1.
    """
    # Time-offset rolling windows require a monotonic index, so order the
    # timestamps first and map the counts back to the original rows after.
    ordered = timestamps.sort_values()
    ones = pd.Series(1.0, index=pd.DatetimeIndex(ordered.to_numpy()))
    in_window = ones.rolling('60min').count().to_numpy()
    return (
        pd.Series(in_window, index=ordered.index)
        .reindex(timestamps.index)
        .astype(int)
    )


# transaction frequencies for different time periods
data['trans_hour'] = data['trans_date_trans_time'].dt.hour
data['trans_dayofweek'] = data['trans_date_trans_time'].dt.dayofweek

# Previously this was a never-resetting cumulative sum of "gap to previous
# transaction < 1h" flags, which grows over a card's whole history instead of
# measuring hourly activity. It is now a trailing one-hour count per card.
data['hourly_txn_count'] = (
    data.groupby('cc_num')['trans_date_trans_time']
    .transform(_txns_in_trailing_hour)
)

# Number of transactions the card made on the same calendar date.
data['daily_txn_count'] = (
    data.groupby(['cc_num', data['trans_date_trans_time'].dt.date])['trans_date_trans_time']
    .transform('count')
)

In [None]:
# Per-card view of the transaction amount, shared by the features below.
amt_by_card = data.groupby('cc_num')['amt']

# Rolling mean over a window of up to 5 rows per card; the window includes
# the current transaction, and min_periods=1 avoids NaN for early rows.
data['rolling_mean_amt'] = amt_by_card.transform(
    lambda amounts: amounts.rolling(5, min_periods=1).mean()
)

# Absolute distance between the current amount and that rolling mean.
data['amt_deviation'] = (data['amt'] - data['rolling_mean_amt']).abs()

# Z-score of the amount relative to all of the same card's amounts.
data['amt_zscore'] = amt_by_card.transform(zscore)


In [None]:
# Card number and zip code are stored as integers; cast them to strings so
# they are treated as identifiers rather than numeric quantities.
for id_col in ('cc_num', 'zip'):
    data[id_col] = data[id_col].astype(str)


In [None]:
# Break the transaction timestamp into calendar components
# (trans_year, trans_month, trans_day).
for part in ('year', 'month', 'day'):
    data[f'trans_{part}'] = getattr(data['trans_date_trans_time'].dt, part)

# Keep only the birth year of the cardholder.
data['dob_yr'] = data['dob'].dt.year

# Remove columns that are not carried forward.
unused_columns = ['dob', 'street', 'zip', 'unix_time']
data = data.drop(columns=unused_columns)

In [None]:
# Normalise merchant names: lower-case them and strip a leading 'fraud_'
# prefix. The lower-casing was previously computed and thrown away because
# its result was never assigned; it is now part of the stored value.
data['merchant'] = (
    data['merchant']
    .str.lower()
    .str.replace(r'^fraud_', '', regex=True)
)

# Number of distinct merchants after normalisation.
data['merchant'].nunique()

Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff

In [None]:
# Split the transactions by label for the side-by-side comparisons below.
label = data['is_fraud']
notfraud = data[label == 0]
isfraud = data[label == 1]

In [None]:
# Summary statistics for the fraudulent transactions only.
isfraud.describe()

In [None]:
# Summary statistics for the non-fraudulent transactions, for comparison.
notfraud.describe()

In [None]:
# Ten cities with the most transactions, computed separately per class.
fraud_count = isfraud['city'].value_counts().sort_values(ascending=False).head(10)
notfraud_count = notfraud['city'].value_counts().sort_values(ascending=False).head(10)

# (counts, palette, title) for the left and right panels.
panels = [
    (fraud_count, 'rocket_r', "Top 10 cities (Fraud)"),
    (notfraud_count, 'crest', "Top 10 cities (Non-Fraud)"),
]

plt.figure(figsize=(10, 5))

for position, (counts, palette, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.xticks(rotation=45)
    plt.title(title)
    # The bars put the city on the x-axis and the count on the y-axis; the
    # two labels were previously swapped.
    plt.xlabel("City Name")
    plt.ylabel("Transaction Count")

plt.tight_layout()
plt.show()

In [None]:
# Ten merchants with the most transactions, computed separately per class.
fraud_count = isfraud['merchant'].value_counts().sort_values(ascending=False).head(10)
notfraud_count = notfraud['merchant'].value_counts().sort_values(ascending=False).head(10)

# (counts, palette, title) for the left and right panels.
panels = [
    (fraud_count, 'rocket_r', "Top 10 merchants (Fraud)"),
    (notfraud_count, 'crest', "Top 10 merchants (Non-Fraud)"),
]

plt.figure(figsize=(10, 5))

for position, (counts, palette, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.xticks(rotation=45)
    # The bars put the merchant on the x-axis and the count on the y-axis;
    # the two labels were previously swapped.
    plt.xlabel("Merchant")
    plt.ylabel("Transaction Count")
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:

# Ten states with the most transactions, computed separately per class.
fraud_count = isfraud['state'].value_counts().sort_values(ascending=False).head(10)
notfraud_count = notfraud['state'].value_counts().sort_values(ascending=False).head(10)

# (counts, palette, title) for the left and right panels.
panels = [
    (fraud_count, 'rocket_r', "Top 10 states (Fraud)"),
    (notfraud_count, 'crest', "Top 10 states (Non-Fraud)"),
]

plt.figure(figsize=(10, 5))

for position, (counts, palette, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.xticks(rotation=45)
    # The bars put the state on the x-axis and the count on the y-axis; the
    # two labels were previously swapped.
    plt.xlabel("State")
    plt.ylabel("Transaction Count")
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
# Transactions per hour of day, computed separately per class.
fraud_count = isfraud['trans_hour'].value_counts().sort_values(ascending=False)
notfraud_count = notfraud['trans_hour'].value_counts().sort_values(ascending=False)

# (counts, palette, title) for the left and right panels.
panels = [
    (fraud_count, 'rocket_r', "Transaction Hours (Fraud)"),
    (notfraud_count, 'crest', "Transaction Hours (Non-Fraud)"),
]

plt.figure(figsize=(10, 5))

for position, (counts, palette, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.xticks(rotation=45)
    plt.xlabel("Transaction Hour")
    plt.ylabel("Transaction Count")
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:

# Ten cardholder job titles with the most transactions, per class.
fraud_count = isfraud['job'].value_counts().sort_values(ascending=False).head(10)
notfraud_count = notfraud['job'].value_counts().sort_values(ascending=False).head(10)

# (counts, palette, title) for the left and right panels.
panels = [
    (fraud_count, 'rocket_r', "Top 10 Jobs (Fraud)"),
    (notfraud_count, 'crest', "Top 10 Jobs (Non-Fraud)"),
]

plt.figure(figsize=(10, 5))

for position, (counts, palette, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.xticks(rotation=45)
    plt.xlabel("Job title")
    plt.ylabel("Transaction Count")
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
# Ten transaction categories with the most transactions, per class.
fraud_count = isfraud['category'].value_counts().sort_values(ascending=False).head(10)
notfraud_count = notfraud['category'].value_counts().sort_values(ascending=False).head(10)

# (counts, palette, title) for the left and right panels.
panels = [
    (fraud_count, 'rocket_r', "Top 10 Transaction Categories (Fraud)"),
    (notfraud_count, 'crest', "Top 10 Transaction Categories (Non-Fraud)"),
]

plt.figure(figsize=(10, 5))

for position, (counts, palette, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.xticks(rotation=45)
    plt.xlabel("Category of Transaction")
    plt.ylabel("Transaction Count")
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
# New column: seconds elapsed between each transaction and the earliest
# transaction in the dataset.
first_txn_time = data['trans_date_trans_time'].min()
elapsed = data['trans_date_trans_time'] - first_txn_time
data['TimeInSeconds'] = elapsed.dt.total_seconds()

# Elapsed-time arrays per class, used by the density plot.
notfraud_t = data.loc[data['is_fraud'] == 0, 'TimeInSeconds'].values
isfraud_t = data.loc[data['is_fraud'] == 1, 'TimeInSeconds'].values

In [None]:

# One series per class, in the same order as the legend labels.
hist_data = [notfraud_t, isfraud_t]
labels = ['Not Fraud', 'Fraud']

# Density curves only: the histogram bars and the rug are switched off.
fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=labels,
    show_hist=False,
    show_rug=False,
)

# Title and axis labels.
layout_options = {
    'title': 'Credit Card Transactions Time Density Plot',
    'xaxis_title': 'Time Since First Transaction [s]',
    'yaxis_title': 'Density',
}
fig.update_layout(**layout_options)

# Render the figure.
fig.show()

In [None]:
# Box plot of the transaction amount per class; showfliers=False hides the
# outlier points so the boxes stay readable.
sns.boxplot(
    data=data,
    x="is_fraud",
    y="amt",
    hue="is_fraud",
    palette="PRGn",
    showfliers=False,
)

In [None]:
# Overlaid density curves of the per-card running transaction index for
# fraudulent and non-fraudulent transactions.
plt.figure(figsize=(10, 5))

# `fill=True` replaces the deprecated `shade=True` argument of kdeplot.
sns.kdeplot(isfraud['cc_transaction_count'], fill=True, color='red', label="Fraud")
sns.kdeplot(notfraud['cc_transaction_count'], fill=True, color='blue', label="Non-Fraud")

plt.xlabel("Number of Transactions")
plt.ylabel("Density")
plt.title("Density Plot of Transaction Counts (Fraud vs Non-Fraud)")
plt.legend()
plt.show()

In [None]:
# Overlaid density curves of the rolling mean amount for fraudulent and
# non-fraudulent transactions.
plt.figure(figsize=(10, 5))

# `fill=True` replaces the deprecated `shade=True` argument of kdeplot.
sns.kdeplot(isfraud['rolling_mean_amt'], fill=True, color='red', label="Fraud")
sns.kdeplot(notfraud['rolling_mean_amt'], fill=True, color='blue', label="Non-Fraud")

# The x-axis shows the rolling mean amount; the previous label
# ("Number of Transactions") was carried over from the transaction-count plot.
plt.xlabel("Rolling Mean Amount")
plt.ylabel("Density")
plt.title("Density Plot of Rolling Mean Amount (Fraud vs Non-Fraud)")
plt.legend()
plt.show()

In [None]:
# Persist the engineered frame (including the columns added in the cells
# above) to a pickle file in the working directory.
data.to_pickle('fraud_data_2.pkl')