In [100]:
import pandas as pd
from pathlib import Path
import plotly.express as px

In [12]:
# Both CSV files are expected in a `data` directory that is a sibling of the
# current working directory.
data_dir = Path('.').resolve().parent / 'data'
train_data_path = data_dir / 'fraudTrain.csv'
test_data_path = data_dir / 'fraudTest.csv'

# The first CSV column is used as the row index of each frame.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_data_path, test_data_path)
)

# EDA

## dataset structures

same features?

In [21]:
# Order-insensitive comparison of the column labels of the two splits.
train_columns = sorted(train_data.columns)
test_columns = sorted(test_data.columns)
train_columns == test_columns

True

same dtypes?

In [25]:
# Compare the dtype of every column across the two splits.
# Both dtype Series are sorted by column label first so the check does not
# depend on column order: `==` between two Series raises ValueError when their
# indexes are not identically ordered, and the column check above only
# compares the sorted names. `Series.equals` returns a plain bool and yields
# False (instead of raising) if the labels differ.
train_data.dtypes.sort_index().equals(test_data.dtypes.sort_index())

True

Any fully duplicated rows?

In [31]:
# One flag per split: True when at least one row is an exact duplicate of another.
tuple(frame.duplicated().any() for frame in (train_data, test_data))

(np.False_, np.False_)

Any missing (null) values?

In [36]:
# One flag per split: True when any cell in the frame is missing.
tuple(frame.isna().to_numpy().any() for frame in (train_data, test_data))

(np.False_, np.False_)

## features

### trans_date_trans_time

In [89]:
# Convert the transaction timestamp column to datetimes in both splits.
# The spec holds a callable so the same dict can be applied to each frame:
# `DataFrame.assign` calls it with the frame being assigned to.
dict_assign = {
    "trans_date_trans_time": lambda frame: pd.to_datetime(
        frame['trans_date_trans_time'], yearfirst=True
    )
}

train_data = train_data.assign(**dict_assign)
test_data = test_data.assign(**dict_assign)

per day:

In [122]:
# Number of transactions per calendar day in each split.
train_daily = (
    train_data['trans_date_trans_time'].dt.date
    .value_counts()
    .sort_index()
    .rename("train_count")
)
test_daily = (
    test_data['trans_date_trans_time'].dt.date
    .value_counts()
    .sort_index()
    .rename("test_count")
)

# The outer join keeps days that occur in only one split (the other side is NaN).
daily_counts = (
    train_daily.to_frame()
    .join(test_daily, how='outer')
    .rename_axis("date")
)

# Title and axis/legend labels let the figure stand on its own when skimmed.
px.bar(
    daily_counts,
    title="Transactions per day (train vs. test)",
    labels={"value": "transactions", "variable": "split"},
)

per month:

In [123]:
# Number of transactions per calendar month ("YYYY-MM") in each split.
train_monthly = (
    train_data['trans_date_trans_time'].dt.strftime('%Y-%m')
    .value_counts()
    .sort_index()
    .rename("train_count")
)
test_monthly = (
    test_data['trans_date_trans_time'].dt.strftime('%Y-%m')
    .value_counts()
    .sort_index()
    .rename("test_count")
)

# The outer join keeps months that occur in only one split (the other side is NaN).
monthly_counts = (
    train_monthly.to_frame()
    .join(test_monthly, how='outer')
    .rename_axis("month")
)

# Title and axis/legend labels let the figure stand on its own when skimmed.
px.bar(
    monthly_counts,
    title="Transactions per month (train vs. test)",
    labels={"value": "transactions", "variable": "split"},
)

### merchant

Do all merchant names contain "fraud"?

In [66]:
# For each split, check whether every merchant name contains "fraud"
# (case-insensitive, plain substring match).
# The vectorised `.str` accessor replaces the per-row Python lambda, and
# `na=False` makes a missing name count as "no match" instead of raising.
tuple(
    frame['merchant'].str.lower().str.contains('fraud', regex=False, na=False).all()
    for frame in (train_data, test_data)
)

(np.True_, np.True_)

What percentage of transactions belong to a merchant that appears more than once?

In [86]:
# Percentage of rows per split whose merchant value occurs in at least one
# other row (keep=False flags every member of a duplicated group).
tuple(
    f"{frame['merchant'].duplicated(keep=False).sum() / frame.shape[0] * 100}%"
    for frame in (train_data, test_data)
)

('100.0%', '100.0%')

top 10 merchants based on number of transactions:

In [133]:
# Share of training transactions for the ten most frequent merchants;
# value_counts returns the counts sorted in descending order.
top_merchant_share = train_data['merchant'].value_counts(normalize=True).head(10)
px.bar(top_merchant_share, orientation='h')

### category

In [87]:
# Preview the transaction category column of the training split.
train_data.loc[:, 'category']

0               misc_net
1            grocery_pos
2          entertainment
3          gas_transport
4               misc_pos
               ...      
1296670    entertainment
1296671      food_dining
1296672      food_dining
1296673      food_dining
1296674      food_dining
Name: category, Length: 1296675, dtype: object

In [110]:
# Display the training frame; the notebook renders a truncated preview of its rows and columns.
train_data

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.2620,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,Hatch,...,37.7175,-112.4777,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0
1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.70,Jeffrey,White,M,8617 Holmes Terrace Suite 651,Tuscarora,...,39.2667,-77.5101,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0
1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,High Rolls Mountain Park,...,32.9396,-105.8189,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0
1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Joseph,Murray,M,42933 Ryan Underpass,Manderson,...,43.3526,-102.5411,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.788940,-103.241160,0
