# 1. Imports

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime
from geopy.distance import geodesic
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

In [2]:
train = pd.read_csv('Data/fraudTrain.csv')

In [3]:
test = pd.read_csv('Data/fraudTest.csv')

# 2. Data Sanity Check

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
train.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [4]:
train.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [6]:
len(train)

1296675

# 3. Data Exploration

In [64]:
train["is_fraud"].value_counts()

0    1289169
1       7506
Name: is_fraud, dtype: int64

In [40]:
train[train['amt'] > 1000]

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,cc_length
232,2019-01-01 02:54:10,4311368326621416041,fraud_Kassulke PLC,shopping_net,1055.47,Phillip,Delacruz,M,26678 Lisa Locks Apt. 904,Iliff,...,-103.0968,648,Community arts worker,1969-09-21,adca67d0a40b26fb8afeccbae31b7828,1325386450,39.940564,-103.528231,0,19
511,2019-01-01 06:43:54,4509142395811241,fraud_McGlynn-Heathcote,misc_net,1636.87,Margaret,Williams,F,165 Jerry Meadows Suite 460,Surrency,...,-82.1982,1324,"Engineer, technical sales",1926-07-12,9b13fb1ce565b55afe4106ecdee9346b,1325400234,31.608415,-83.145823,0,16
723,2019-01-01 09:23:55,4265776278887457,"fraud_Rippin, Kub and Mann",misc_net,1047.52,Christine,Best,F,68248 Deanna Land,Enola,...,-92.2123,969,"Physicist, medical",1954-01-05,bb5654f0f82ace48aa6e4f45db7c64c5,1325409835,34.887449,-92.623326,0,16
824,2019-01-01 10:29:29,630469040731,fraud_Auer-West,shopping_net,1433.54,Meredith,Ayala,F,7107 Henderson Station,Cascade Locks,...,-121.8686,1288,Barrister,1936-05-01,cdf73aea45a42592972ad9668315b8c2,1325413769,45.873314,-121.589038,0,12
1480,2019-01-01 16:16:10,3528407217576457,fraud_Denesik and Sons,shopping_pos,1025.38,Patricia,Leach,F,71309 Martinez Stravenue,Kingsport,...,-82.4834,87124,Warden/ranger,1987-02-14,5cebc54b90d0a21e72fa18f6429a11d4,1325434570,37.421987,-81.873400,0,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1294390,2020-06-20 16:32:01,6564459919350820,"fraud_Mosciski, Ziemann and Farrell",shopping_net,1006.48,Douglas,Willis,M,619 Jeremy Garden Apt. 681,Benton,...,-90.3508,1306,Public relations officer,1958-09-10,726a693a52a8b3de32101c828feee042,1371745921,41.585645,-90.692572,1,16
1294714,2020-06-20 19:21:26,3596217206093829,fraud_Fisher Inc,shopping_net,2090.14,Sara,Ramirez,F,23843 Scott Island,Birmingham,...,-91.9534,888,Camera operator,1988-03-25,cc02b74a8f15809cabf60616cd5eae5a,1371756086,40.282456,-91.566151,0,16
1295108,2020-06-20 22:29:00,4986925034905735,fraud_Kassulke PLC,shopping_net,1064.44,Erika,Gonzalez,F,907 Courtney Via Apt. 896,Irvine,...,-83.9862,13061,"Editor, magazine features",1959-06-18,843c2d9939cd8bed31ca120ab175ad2b,1371767340,38.220812,-84.138702,1,16
1295255,2020-06-20 23:29:52,3560725013359375,fraud_Fisher-Schowalter,shopping_net,1063.03,Brooke,Smith,F,63542 Luna Brook Apt. 012,Notrees,...,-102.7413,23,Cytogeneticist,1969-09-15,88d038dce3add03666ab117a9d7225e6,1371770992,30.971658,-102.494584,1,16


# 4. Feature Extraction

In [7]:
train.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [4]:
train.drop(['Unnamed: 0', 'first', 'last', 'street', 'zip', 'trans_num', 'unix_time'], inplace=True, axis=1)

In [5]:
def calculate_distance(row):
    house_coords = (row['lat'], row['long'])
    merchant_coords = (row['merch_lat'], row['merch_long'])
    distance = geodesic(house_coords, merchant_coords).kilometers
    return distance

def calculate_age(row):
    dob = datetime.strptime(row['dob'], '%Y-%m-%d')
    trans_date = datetime.strptime(row['trans_date_trans_time'], '%Y-%m-%d %H:%M:%S')
    years_diff = trans_date.year - dob.year
    if ((trans_date.month, trans_date.day) < (dob.month, dob.day)):
        years_diff -= 1
    return years_diff

In [6]:
train['cc_length'] = train['cc_num'].apply(lambda x: len(str(x)))
train['year'] = train['trans_date_trans_time'].apply(lambda x: x[:4])
train['month'] = train['trans_date_trans_time'].apply(lambda x: x[5:7])
train['day'] = train['trans_date_trans_time'].apply(lambda x: x[8:10])
train['hour'] = train['trans_date_trans_time'].apply(lambda x: x[11:13])
train['day_of_week'] = train['trans_date_trans_time'].apply(lambda x: datetime.strptime(x[:10], "%Y-%m-%d").strftime("%A"))
train['transaction_over_1k'] = train['amt'].apply(lambda x: x > 1000)
train['distance'] = train.apply(calculate_distance, axis=1)
train['age'] = train.apply(calculate_age, axis=1)

In [9]:
columns_to_scale = ['age', 'distance', 'amt']
scaler = MinMaxScaler()
train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])

## Original Data 1M

In [11]:
categorical_cols = ['merchant', 'category', 'gender', 'state', 'job', 'year', 'month', 'day', 'hour', 'day_of_week']
train_encoded = pd.get_dummies(train, columns=categorical_cols, drop_first=False)

In [12]:
train_encoded.drop(['trans_date_trans_time', 'cc_num', 'city', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long'], axis=1, inplace=True)

In [13]:
train_encoded.to_parquet('Data/fraudTrain_fullCleaned.parquet', index=False)

## SMOTE 100k

In [14]:
fraud_df = train_encoded[train_encoded['is_fraud'] == 1]
non_fraud_df = train_encoded[train_encoded['is_fraud'] == 0]
non_fraud_sample = non_fraud_df.sample(n=50000, random_state=42)
sampled_train = pd.concat([fraud_df, non_fraud_sample])

In [15]:
X = sampled_train.drop('is_fraud', axis=1)
y = sampled_train['is_fraud']
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
train_balanced = pd.DataFrame(X_resampled, columns=X.columns)
train_balanced['is_fraud'] = y_resampled

  train_balanced['is_fraud'] = y_resampled


In [16]:
train_balanced.to_parquet('Data/fraudTrain_sampledCleaned.parquet', index=False)

## Test

In [17]:
test.drop(['Unnamed: 0', 'first', 'last', 'street', 'zip', 'trans_num', 'unix_time'], inplace=True, axis=1)
test['cc_length'] = test['cc_num'].apply(lambda x: len(str(x)))
test['year'] = test['trans_date_trans_time'].apply(lambda x: x[:4])
test['month'] = test['trans_date_trans_time'].apply(lambda x: x[5:7])
test['day'] = test['trans_date_trans_time'].apply(lambda x: x[8:10])
test['hour'] = test['trans_date_trans_time'].apply(lambda x: x[11:13])
test['day_of_week'] = test['trans_date_trans_time'].apply(lambda x: datetime.strptime(x[:10], "%Y-%m-%d").strftime("%A"))
test['transaction_over_1k'] = test['amt'].apply(lambda x: x > 1000)
test['distance'] = test.apply(calculate_distance, axis=1)
test['age'] = test.apply(calculate_age, axis=1)
test[columns_to_scale] = scaler.transform(test[columns_to_scale])

In [18]:
test_encoded = pd.get_dummies(test, columns=categorical_cols, drop_first=False)
test_encoded.drop(['trans_date_trans_time', 'cc_num', 'city', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long'], axis=1, inplace=True)

In [None]:
test_encoded.to_parquet('Data/fraudTest_cleaned.parquet', index=False)