In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [107]:
# Read the raw train / test transaction tables from the main data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Data/Main_Data/fraudTrain.csv", "Data/Main_Data/fraudTest.csv")
)

In [108]:
# Display the column labels of the freshly loaded training frame.
train.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [109]:
# Name, address, job and transaction-id columns are removed from both splits
# before any encoding is done. Each label appears exactly once in the list.
to_drop = ['first', 'last', 'street', 'city', 'job', 'trans_num']
train = train.drop(columns=to_drop)
test = test.drop(columns=to_drop)

In [110]:
# Convert the date-of-birth and transaction-timestamp columns to datetimes
# so that the `.dt` accessor can be used on them in the following cells.
train = train.assign(
    dob=pd.to_datetime(train['dob']),
    trans_date_trans_time=pd.to_datetime(train['trans_date_trans_time']),
)
test = test.assign(
    dob=pd.to_datetime(test['dob']),
    trans_date_trans_time=pd.to_datetime(test['trans_date_trans_time']),
)

In [111]:
# Age is approximated as (2023 - birth year); month and day of birth are ignored.
# NOTE(review): 2023 is a fixed reference year, not the year of the transaction.
train, test = (
    df.assign(age=2023 - df['dob'].dt.year.astype(int))
    for df in (train, test)
)

In [112]:
# The raw date of birth is no longer needed once `age` has been derived.
train, test = train.drop(columns='dob'), test.drop(columns='dob')

In [113]:
# Break the transaction timestamp into integer year / month / hour features.
train, test = (
    df.assign(
        trans_year=df['trans_date_trans_time'].dt.year.astype(int),
        trans_month=df['trans_date_trans_time'].dt.month.astype(int),
        trans_hour=df['trans_date_trans_time'].dt.hour.astype(int),
    )
    for df in (train, test)
)

In [114]:
# The full timestamp is discarded now that its year / month / hour parts exist.
train = train.drop(columns=['trans_date_trans_time'])
test = test.drop(columns=['trans_date_trans_time'])

In [115]:
# Preview the training frame after the date-derived features were added
# and the raw date columns removed.
train.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,trans_year,trans_month,trans_hour
0,0,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,NC,28654,36.0788,-81.1781,3495,1325376018,36.011293,-82.048315,0,35,2019,1,0
1,1,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,WA,99160,48.8878,-118.2105,149,1325376044,49.159047,-118.186462,0,45,2019,1,0
2,2,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,ID,83252,42.1808,-112.262,4154,1325376051,43.150704,-112.154481,0,61,2019,1,0
3,3,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,MT,59632,46.2306,-112.1138,1939,1325376076,47.034331,-112.561071,0,56,2019,1,0
4,4,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,M,VA,24433,38.4207,-79.4629,99,1325376186,38.674999,-78.632459,0,37,2019,1,0


In [116]:
# Binary-encode gender: 'F' -> 0, 'M' -> 1.
# Series.map leaves NaN for any value that is not a key of the mapping.
gender_mapping = dict(F=0, M=1)
train = train.assign(gender=train['gender'].map(gender_mapping))
test = test.assign(gender=test['gender'].map(gender_mapping))

In [117]:
# Learn the integer codes for 'state' from the training split only.
label_encoder = LabelEncoder().fit(train['state'])

# Apply the same codes to both splits.
# LabelEncoder.transform raises ValueError for a label it did not see during fit.
train = train.assign(state=label_encoder.transform(train['state']))
test = test.assign(state=label_encoder.transform(test['state']))

In [None]:
# Learn the integer codes for the transaction 'category' column from the training split.
label_encoder_cat = LabelEncoder().fit(train['category'])

# Replace the 'category' strings with their integer codes in both splits,
# using the encoder fitted on train.
train['category'] = label_encoder_cat.transform(train['category'])
test['category'] = label_encoder_cat.transform(test['category'])

In [None]:
# Number of distinct values in the training 'merchant' column.
len(train.merchant.unique())

In [None]:
# Learn the integer codes for the 'merchant' column from the training split.
label_encoder_merch = LabelEncoder().fit(train['merchant'])

# Replace merchant names with their integer codes in both splits.
# A merchant that occurs only in test makes transform raise ValueError.
train['merchant'] = label_encoder_merch.transform(train['merchant'])
test['merchant'] = label_encoder_merch.transform(test['merchant'])

In [None]:
# Preview the training frame after gender / state / category / merchant were encoded.
train.head()

In [None]:
# Preview the test frame after the same encodings were applied.
test.head()

In [None]:
# Store the encoded gender / state / category columns of train with pandas' category dtype.
train = train.astype(
    {column: "category" for column in ("gender", "state", "category")}
)

In [None]:
# Store the encoded gender / state / category columns of test with pandas' category dtype.
test = test.astype(
    {column: "category" for column in ("gender", "state", "category")}
)

In [None]:
# Standardise the continuous columns (zero mean, unit variance).
numerical_features = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']
scaler = StandardScaler()

# The mean / std are learned from the training split only ...
train[numerical_features] = scaler.fit_transform(train[numerical_features])
# ... and then re-used for the test split. Re-fitting on test would scale the two
# splits with different statistics, so identical raw values would map to different
# scaled values and test-set information would leak into preprocessing.
test[numerical_features] = scaler.transform(test[numerical_features])

In [None]:
from sklearn.model_selection import train_test_split

# Pool both preprocessed splits and re-split them per class below.
# ignore_index=True gives every row a unique label; without it the index labels
# of `train` and `test` overlap and label-based alignment of X and y is ambiguous.
data = pd.concat([train, test], ignore_index=True)

# Separate fraud and non-fraud samples
fraud_data = data[data['is_fraud'] == 1]
non_fraud_data = data[data['is_fraud'] == 0]

# Split each class separately (12% held out) so both classes are present in
# the training and the testing set.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    fraud_data.drop('is_fraud', axis=1), fraud_data['is_fraud'], test_size=0.12, random_state=42)

X_train_non_fraud, X_test_non_fraud, y_train_non_fraud, y_test_non_fraud = train_test_split(
    non_fraud_data.drop('is_fraud', axis=1), non_fraud_data['is_fraud'], test_size=0.12, random_state=42)

# Balance the training set by undersampling non-fraud rows to twice the number
# of fraud rows (adjust the multiplier to control the balance). The size is
# capped at the available population so `.sample` cannot raise.
non_fraud_sample_size = min(int(len(y_train_fraud) * 2), len(X_train_non_fraud))
X_train_non_fraud_resampled = X_train_non_fraud.sample(n=non_fraud_sample_size, random_state=42)
# Pick the labels by index so they are guaranteed to belong to the sampled rows.
y_train_non_fraud_resampled = y_train_non_fraud.loc[X_train_non_fraud_resampled.index]

# Combine the fraud and resampled non-fraud samples for the training set
X_train = pd.concat([X_train_fraud, X_train_non_fraud_resampled])
y_train = pd.concat([y_train_fraud, y_train_non_fraud_resampled])

# Combine the fraud and non-fraud samples for the testing set.
# The held-out non-fraud rows are used as-is (no resampling), so the test set
# keeps the class ratio produced by the split.
X_test = pd.concat([X_test_fraud, X_test_non_fraud])
y_test = pd.concat([y_test_fraud, y_test_non_fraud])

# Shuffle the feature frames, then reorder the labels to the shuffled index so
# every label stays attached to its own row.
X_train = X_train.sample(frac=1, random_state=42)
y_train = y_train.loc[X_train.index]
X_test = X_test.sample(frac=1, random_state=42)
y_test = y_test.loc[X_test.index]

# Sanity checks: features and labels must describe the same rows in the same order.
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)


In [None]:
# Shapes of the training features / labels, a separator line, then the testing ones.
print(X_train.shape, y_train.shape, "--", X_test.shape, y_test.shape, sep="\n")

In [None]:
# Number of fraud (1) rows, then non-fraud (0) rows, in the training labels.
print(len(y_train[y_train == 1]), len(y_train[y_train == 0]), sep="\n")

In [None]:
# Number of fraud (1) rows, then non-fraud (0) rows, in the testing labels.
print(len(y_test[y_test == 1]), len(y_test[y_test == 0]), sep="\n")

In [None]:
# Persist the four splits as CSV files in the current working directory.
# to_csv writes the index as the first column by default.
# NOTE(review): the reload cell further down reads from "Data/Sample_Data/",
# not from the working directory - confirm the files are moved there.
X_train.to_csv(path_or_buf="X_train.csv")
y_train.to_csv(path_or_buf="y_train.csv")

X_test.to_csv(path_or_buf="X_test.csv")
y_test.to_csv(path_or_buf="y_test.csv")

In [None]:
# Display the training labels.
y_train

In [None]:
# Reload the saved training split from the sample-data folder; this rebinds
# X_train / y_train to whatever is stored in those files.
X_train = pd.read_csv(filepath_or_buffer="Data/Sample_Data/X_train.csv")
y_train = pd.read_csv(filepath_or_buffer="Data/Sample_Data/y_train.csv")

# Features and labels must contain the same number of rows.
len(y_train) == len(X_train)