In [2]:

### System
import os
import joblib

### Set seed
# Seed both the stdlib and the NumPy global generators. scikit-learn and
# imbalanced-learn fall back to NumPy's global RandomState whenever no
# `random_state` is passed, so seeding `random` alone would leave them
# non-deterministic. NumPy is imported here (it is imported again with the
# other main libraries below) so the seed is set inside this block.
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

### Mains
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import seaborn as sns
%matplotlib inline 


### Models:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

### Ensemble Models:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier

### Embeddings:
from sklearn.manifold import TSNE

### Data Splits
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

### Pipelines
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline, make_pipeline 
from sklearn.utils import shuffle

# Sampling Methods
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler

### Metrics:
import sklearn.metrics as skm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

### Common datasets
import tensorflow as tf
import tensorflow_datasets as tfds

2024-04-05 20:45:37.209861: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-05 20:45:37.211130: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-05 20:45:37.238951: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-05 20:45:37.239433: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Read the "train" worksheet of the Excel workbook into the working DataFrame.
df = pd.read_excel(
    io="data.csv.xlsx",
    sheet_name="train",
)

In [9]:
# Summarize the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181822 entries, 0 to 181821
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   transDate       181822 non-null  datetime64[ns]
 1   creditCardNum   181822 non-null  int64         
 2   business        181822 non-null  object        
 3   category        181822 non-null  object        
 4   amount          181822 non-null  float64       
 5   firstName       181822 non-null  object        
 6   lastName        181822 non-null  object        
 7   gender          181822 non-null  object        
 8   street          181822 non-null  object        
 9   city            181822 non-null  object        
 10  state           181822 non-null  object        
 11  zip             181822 non-null  int64         
 12  latitude        181822 non-null  float64       
 13  longitude       181822 non-null  float64       
 14  cityPop         181822 non-null  int

In [None]:
# Drop the cardholder first- and last-name columns.
# NOTE(review): presumably removed because they are personal identifiers
# rather than useful features; confirm.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=["firstName", "lastName"], errors="ignore")

In [10]:
# Descriptive statistics (count, mean, min, quartiles, max, std) for the
# numeric and datetime columns.
df.describe()

Unnamed: 0,transDate,creditCardNum,amount,zip,latitude,longitude,cityPop,dateOfBirth,unixTime,merchLatitude,merchLongitude,isFraud
count,181822,181822.0,181822.0,181822.0,181822.0,181822.0,181822.0,181822,181822.0,181822.0,181822.0,181822.0
mean,2019-09-25 01:50:12.451688704,4.166126e+17,70.844358,48736.463723,38.521137,-90.167556,89103.8,1973-08-27 03:33:31.122966432,1348538000.0,38.519817,-90.168233,0.007348
min,2019-01-01 04:28:00,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1924-10-30 00:00:00,1325392000.0,19.034282,-166.657174,0.0
25%,2019-08-11 17:28:08,180042900000000.0,9.67,26237.0,34.5906,-96.7869,743.0,1962-06-04 00:00:00,1344706000.0,34.716436,-96.856711,0.0
50%,2019-10-07 16:17:32,3521417000000000.0,47.49,48088.0,39.3465,-87.4581,2457.0,1975-09-11 00:00:00,1349627000.0,39.358796,-87.39603,0.0
75%,2019-10-27 21:51:47.750000128,4642255000000000.0,83.47,72011.0,41.8467,-80.158,20478.0,1987-02-13 00:00:00,1351375000.0,41.933726,-80.23108,0.0
max,2019-12-15 13:49:26,4.992346e+18,10926.44,99783.0,65.6899,-67.9503,2906700.0,2005-01-29 00:00:00,1355579000.0,66.653465,-66.977475,1.0
std,,1.307939e+18,146.108136,26858.467937,5.066262,13.710668,302821.5,,3868345.0,5.101595,13.721601,0.085404


In [19]:
# Print the (rows, columns) shape, then render the first five rows as the
# cell's output.
print(f"{df.shape}")
df.head(n=5)

(181822, 22)


Unnamed: 0,transDate,creditCardNum,business,category,amount,firstName,lastName,gender,street,city,...,latitude,longitude,cityPop,job,dateOfBirth,transNum,unixTime,merchLatitude,merchLongitude,isFraud
0,2019-01-01 04:28:00,4119762878330980,fraud_Welch Inc,misc_net,4.1,Melinda,Gutierrez,F,1497 West Gateway,Eureka,...,40.7152,-89.2706,6713,Archivist,1945-08-19,3fc1791ff1db94ce9540262d7eb973e4,1325392080,40.020043,-89.693412,0
1,2019-01-01 04:28:15,36078114201167,fraud_Auer-West,shopping_net,2.67,Christopher,Horn,M,956 Sanchez Highway,Mallie,...,37.2692,-82.9161,798,Facilities manager,1926-06-26,edbf65b8606b53fc48168c2daa5933e5,1325392095,37.622362,-82.676007,0
2,2019-01-01 04:29:14,4449530933957320,fraud_Balistreri-Nader,misc_pos,22.88,Felicia,Mckee,F,84079 Thomas Burgs,Tomales,...,38.2427,-122.9145,337,Occupational psychologist,1954-07-05,2fcbde131cff833751dde53da54da6da,1325392154,39.060101,-121.969417,0
3,2019-01-01 04:29:18,4302475216404890,"fraud_Boehm, Predovic and Reinger",misc_pos,9.59,Daniel,Cain,M,384 Newman Forks Apt. 370,Belmond,...,42.8511,-93.62,3032,Community pharmacist,1964-08-08,bcae564f9a1a299dfbd1efd917499968,1325392158,43.280848,-94.340312,0
4,2019-01-01 04:32:09,377234009633447,fraud_Hudson-Ratke,grocery_pos,129.06,Theresa,Blackwell,F,43576 Kristina Islands,Shenandoah Junction,...,39.3716,-77.8229,1925,Systems developer,1966-02-14,5e42e35c255edb55a192b3f1f0600936,1325392329,39.21163,-78.409044,0


In [6]:
# Count missing values per column (`isna` is the canonical name for `isnull`).
df.isna().sum()

transDate         0
creditCardNum     0
business          0
category          0
amount            0
firstName         0
lastName          0
gender            0
street            0
city              0
state             0
zip               0
latitude          0
longitude         0
cityPop           0
job               0
dateOfBirth       0
transNum          0
unixTime          0
merchLatitude     0
merchLongitude    0
isFraud           0
dtype: int64

In [24]:
# Partition the transactions by the isFraud label (1 -> fraud, 0 -> normal).
fraud = df.loc[df["isFraud"].eq(1)]
normal = df.loc[df["isFraud"].eq(0)]

# Report the class sizes and the fraud share of all transactions.
print(
    f"Num Total txns: {df.shape[0]}",
    f"Num fraud txns: {fraud.shape[0]}",
    f"Num normal txns: {normal.shape[0]}",
    f"% of fraud of all: {fraud.shape[0] / df.shape[0] * 100:.4f}%",
    sep="\n",
)


Num Total txns: 181822
Num fraud txns: 1336
Num normal txns: 180486
% of fraud of all: 0.7348%
