In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from torcheval.metrics import R2Score
from torchmetrics import MeanSquaredError


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Columns fed to the model: the `*_encoded` categoricals plus numeric and
# calendar features. Kept as a list because it is used for DataFrame column
# selection and its length sets the autoencoder input width.
INPUT_FEATURES = [
    'category_encoded', 'amt', 'gender_encoded', 'city_encoded',
    'state_encoded', 'city_pop', 'job_encoded', 'age',
    'hour', 'daily', 'day', 'month',
]

# Label column, wrapped in a list so that `df[OUTPUT_FEATURE]` is a
# one-column DataFrame rather than a Series.
OUTPUT_FEATURE = ['is_fraud']

def preprocessing(df, reference_year=None):
    """Clean the raw transaction frame and derive age and calendar features.

    The input frame is left untouched; a new frame is returned.

    Steps:
      1. Drop exact duplicate rows (checked while 'Unnamed: 0' is still present).
      2. Drop every row that contains a missing value.
      3. Drop the 'Unnamed: 0' column if it exists.
      4. Add 'age' (reference year minus birth year), 'trans_datetime',
         'hour', 'daily' (day of month), 'day' (day of week, Monday=0)
         and 'month'.
      5. Drop the raw 'trans_date_trans_time' string column.

    Args:
        df: DataFrame with at least the columns 'dob' (strings formatted
            '%Y-%m-%d') and 'trans_date_trans_time' (parseable by
            ``pd.to_datetime``).
        reference_year: Year that ages are measured against. Defaults to the
            current calendar year; pass a fixed value to make the 'age'
            feature reproducible across runs.

    Returns:
        A new DataFrame with the cleaned rows and the derived columns.
    """
    if reference_year is None:
        reference_year = datetime.now().year

    # Every step returns a new frame, so the caller's object is never mutated
    # and the NaN filter actually takes effect (its result is kept).
    cleaned = (
        df.drop_duplicates()
        .dropna(axis=0)
        .drop(columns='Unnamed: 0', errors='ignore')
    )

    trans_datetime = pd.to_datetime(cleaned['trans_date_trans_time'])
    birth_year = pd.to_datetime(cleaned['dob'], format='%Y-%m-%d').dt.year

    return cleaned.assign(
        age=reference_year - birth_year,
        trans_datetime=trans_datetime,
        hour=trans_datetime.dt.hour,
        daily=trans_datetime.dt.day,
        day=trans_datetime.dt.dayofweek,
        month=trans_datetime.dt.month,
    ).drop(columns='trans_date_trans_time')

def encoding_columns(df):
    """Add a label-encoded companion column for each categorical field.

    For each of 'category', 'gender', 'city', 'state' and 'job', the column is
    passed through ``LabelEncoder.fit_transform`` and the result is stored in
    ``df`` under ``<name>_encoded``. The frame is modified in place and also
    returned. The encoder is re-fitted for every column and is not kept.
    """
    encoder = LabelEncoder()
    for source in ('category', 'gender', 'city', 'state', 'job'):
        df[f'{source}_encoded'] = encoder.fit_transform(df[source])
    return df
class MapStyleFraudDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label row.

    Args:
        X: Sized, indexable container of features (e.g. a tensor).
        y: Sized, indexable container of labels, same length as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # Fail at construction time instead of mid-epoch with an IndexError
        # (or silently ignoring trailing labels) when the inputs are misaligned.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.X[index], self.y[index]
    
def _build_loader(features, labels, batch_size):
    """Wrap feature/label arrays in a sequential DataLoader of float32 tensors."""
    dataset = MapStyleFraudDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=batch_size, num_workers=0)


def dataloading(for_training, input_features, output_feature,
                batch_size=64, data_dir='../data', scaler=None, return_scaler=False):
    """Load a fraud CSV, keep the non-fraud rows and return DataLoader(s).

    The file ``fraudTrain.csv`` (training) or ``fraudTest.csv`` (testing) is
    read from ``data_dir``, filtered to rows with ``is_fraud == 0``, passed
    through ``preprocessing`` and ``encoding_columns``, standardised and
    wrapped in unshuffled DataLoaders.

    Args:
        for_training: True to build train/validation loaders from the training
            file, False to build a single loader from the test file.
        input_features: List of column names used as model inputs.
        output_feature: List of label column names.
        batch_size: Batch size of every returned loader.
        data_dir: Directory that contains the two CSV files.
        scaler: Only used when ``for_training`` is False. An already fitted
            scaler (typically the one obtained from the training call with
            ``return_scaler=True``) applied to the test features. When None,
            a new StandardScaler is fitted on the test data itself, which is
            the original behaviour.
        return_scaler: Only used when ``for_training`` is True. When True the
            fitted scaler is returned as a third element.

    Returns:
        ``(trainloader, validloader)`` when ``for_training`` is True, or
        ``(trainloader, validloader, scaler)`` if ``return_scaler`` is set;
        otherwise a single ``testloader``.
    """
    file_name = 'fraudTrain.csv' if for_training else 'fraudTest.csv'
    df = pd.read_csv(f'{data_dir}/{file_name}')

    # Only legitimate transactions are kept, so every label below is 0.
    df = df[df['is_fraud']==0]
    df = preprocessing(df)
    df = encoding_columns(df)

    if for_training:
        # 90/10 train/validation split. After the filter above the stratify
        # column holds a single class; the argument is kept so the split is
        # produced exactly as before.
        df_train, df_val = train_test_split(df, test_size=0.1, random_state=42, stratify=df['is_fraud'])

        # Fit the scaler on the training part only, then transform straight to
        # arrays instead of assigning back into the split frames.
        scaler = StandardScaler()
        scaler.fit(df_train[input_features])

        trainloader = _build_loader(
            scaler.transform(df_train[input_features]),
            df_train[output_feature].values,
            batch_size,
        )
        validloader = _build_loader(
            scaler.transform(df_val[input_features]),
            df_val[output_feature].values,
            batch_size,
        )

        if return_scaler:
            return trainloader, validloader, scaler
        return trainloader, validloader

    if scaler is None:
        # NOTE(review): fitting on the test set gives it different
        # standardisation statistics than the training data; pass the
        # training scaler to avoid that.
        scaler = StandardScaler()
        scaler.fit(df[input_features])

    testloader = _build_loader(
        scaler.transform(df[input_features]),
        df[output_feature].values,
        batch_size,
    )
    return testloader

In [3]:
# Build the training and validation loaders from the training file.
trainloader, validloader = dataloading(
    for_training=True,
    input_features=INPUT_FEATURES,
    output_feature=OUTPUT_FEATURE,
)

In [8]:

from AE_model import *

def _snapshot_weights(model):
    """Return an independent copy of the model's current state_dict tensors."""
    # state_dict() returns tensors that share storage with the live parameters,
    # so each one is cloned to freeze its current values.
    return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}


def train(num_epoch,train_loader, valid_loader, patience, lr, deepAut, intermediate, latent_size):
    """Train an autoencoder on reconstruction MSE with early stopping.

    Args:
        num_epoch: Maximum number of epochs.
        train_loader: Iterable of ``(inputs, labels)`` batches used for
            optimisation; the labels are ignored.
        valid_loader: Iterable of ``(inputs, labels)`` batches used to compute
            the validation loss after each epoch; the labels are ignored.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.
        lr: Learning rate for Adam (weight decay is fixed at 1e-5).
        deepAut: If truthy build ``DeepAutoencoder``, otherwise
            ``SimpleAutoencoder``.
        intermediate: Passed to ``DeepAutoencoder`` and used in its checkpoint
            file name; unused for the simple model.
        latent_size: Latent size passed to the model and used in the checkpoint
            file name.

    Returns:
        The model, loaded with the weights that achieved the lowest validation
        loss. The same weights are saved to
        ``DeepAutoEncoder_int{intermediate}_lat{latent_size}.pt`` or
        ``SimpleAutoEncoder_lat{latent_size}.pt`` in the working directory.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if deepAut:
        model = DeepAutoencoder(len(INPUT_FEATURES), intermediate, latent_size)
        checkpoint_path = f'DeepAutoEncoder_int{intermediate}_lat{latent_size}.pt'
    else:
        model = SimpleAutoencoder(len(INPUT_FEATURES), latent_size)
        checkpoint_path = f'SimpleAutoEncoder_lat{latent_size}.pt'

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.MSELoss()

    best_val_loss = np.inf
    # Initialised up front so they exist even if no epoch ever improves
    # (e.g. NaN validation loss) or num_epoch is 0.
    best_state = _snapshot_weights(model)
    pat = 0

    for epoch in range(num_epoch):
        # Validation below switches to eval mode, so training mode has to be
        # re-enabled at the start of every epoch.
        model.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, _ = data
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            running_loss += batch_loss
            print('\rEpoch: {}\tbatch: {}\tLoss =  {:.3f}'.format(epoch, i+1, batch_loss), end="")

        print('\n')

        # validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data in valid_loader:
                inputs, _ = data
                inputs = inputs.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, inputs).item()

        val_loss *= (1/len(valid_loader))
        # Average over the loader that was actually passed in, not a global.
        print(f"Epoch {epoch+1}: train MSE loss = {running_loss/len(train_loader)}", f"|| Valid: MSE loss = {val_loss}")

        # early-stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Copy the weights: a bare state_dict() would keep tracking the
            # parameters as they continue to be updated.
            best_state = _snapshot_weights(model)
            pat = 0
        else:
            pat += 1
            print("pat ", pat)
            if pat >= patience:
                print("Early Stopping: Validation Loss did not decrease for", patience, "epochs.")
                break

    torch.save(best_state, checkpoint_path)
    # Keep the returned model identical to the checkpoint written above.
    model.load_state_dict(best_state)

    return model

In [9]:
# One training run per position across the three lists below.
deepAuts = [True, True, False, False]
intermediates = [50, 100, 1, 1]  # train() only uses this value when deepAut is True
latents = [10, 10, 6, 4]

for i, (use_deep, intermediate_size, latent_dim) in enumerate(zip(deepAuts, intermediates, latents)):
    model = train(
        num_epoch=20,
        train_loader=trainloader,
        valid_loader=validloader,
        patience=5,
        lr=2e-4,
        deepAut=use_deep,
        intermediate=intermediate_size,
        latent_size=latent_dim,
    )

Epoch: 0	batch: 18129	Loss =  0.0640

Epoch 1: train MSE loss = 0.14532930321752666 || Valid: MSE loss = 0.03739593166016763
Epoch: 1	batch: 18129	Loss =  0.014

Epoch 2: train MSE loss = 0.01866840498401816 || Valid: MSE loss = 0.013486175624556637
Epoch: 2	batch: 18129	Loss =  0.010

Epoch 3: train MSE loss = 0.01062702740968156 || Valid: MSE loss = 0.008321654979167299
Epoch: 3	batch: 18129	Loss =  0.007

Epoch 4: train MSE loss = 0.007199002961722715 || Valid: MSE loss = 0.006302227003952816
Epoch: 4	batch: 18129	Loss =  0.006

Epoch 5: train MSE loss = 0.005745906753178261 || Valid: MSE loss = 0.005340712857473106
Epoch: 5	batch: 18129	Loss =  0.005

Epoch 6: train MSE loss = 0.004906558307551009 || Valid: MSE loss = 0.004638771385401626
Epoch: 6	batch: 18129	Loss =  0.004

Epoch 7: train MSE loss = 0.004401786664249469 || Valid: MSE loss = 0.004203202307776596
Epoch: 7	batch: 18129	Loss =  0.004

Epoch 8: train MSE loss = 0.004115319992410855 || Valid: MSE loss = 0.00396782506575