In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from torcheval.metrics import R2Score

  from .autonotebook import tqdm as notebook_tqdm


<h1>Data Loader + Data preprocessing + Data Augmentation</h1>

In [2]:
INPUT_FEATURES = ['category_encoded',
                  'amt', 
                  'gender_encoded', 
                  'city_encoded', 
                  'state_encoded', 
                  'city_pop', 
                  'job_encoded', 
                  'age', 
                  'hour', 
                  'daily', 
                  'day', 
                  'month']
OUTPUT_FEATURE = ['is_fraud']

def preprocessing(df):
    # remove duplicates
    df.drop_duplicates(inplace=True)
    # drop nA rows containing values
    df.dropna(axis=0)
    # drop the 'Unnamed: 0'
    df = df.drop('Unnamed: 0', axis=1)
    # change the type of date time
    df['age'] = df['dob'].apply(lambda x: datetime.now().year - datetime.strptime(x, '%Y-%m-%d').year)
    df['trans_datetime'] = pd.to_datetime(df['trans_date_trans_time'])
    df['hour'] = df['trans_datetime'].dt.hour
    df['daily'] = df['trans_datetime'].dt.day
    df['day'] = df['trans_datetime'].dt.dayofweek
    df['month'] = df['trans_datetime'].dt.month
    df.drop('trans_date_trans_time', axis=1, inplace=True)

    return df

def encoding_columns(df):

    labelencoder = LabelEncoder()
    df['category_encoded'] = labelencoder.fit_transform(df['category'])
    df['gender_encoded'] = labelencoder.fit_transform(df['gender'])
    df['city_encoded'] = labelencoder.fit_transform(df['city'])
    df['state_encoded'] =labelencoder.fit_transform(df['state'])
    df['job_encoded'] = labelencoder.fit_transform(df['job'])

    return df

def data_augmentation(X,y, sampling_strategy):
    smote_tomek = SMOTETomek(random_state=42, sampling_strategy=sampling_strategy)
    X_Augmented, y_augmented = smote_tomek.fit_resample(X,y)
    return X_Augmented, y_augmented

def smoteTomek_augmentation(df, sampling_strategy):
    # define the model
    smote_tomek = SMOTETomek(random_state=42, sampling_strategy=sampling_strategy)

    X_Augmented, y_augmented = smote_tomek.fit_resample(df[INPUT_FEATURES],df[OUTPUT_FEATURE])

    new_df = pd.concat([X_Augmented, y_augmented], axis=1)
    
    return new_df

class MapStyleFraudDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
def dataloading(for_training, input_features, output_feature, sampling_strategy, augmented=False):

    if for_training:
        df = pd.read_csv('../data/fraudTrain.csv')
    else:
        df = pd.read_csv('../data/fraudTest.csv')
    
    df = preprocessing(df)
    df = encoding_columns(df)
    # Augment the data
    if augmented == True:
        df = smoteTomek_augmentation(df, sampling_strategy)

    if for_training:
        # split btwn training data and validation with ratio 90%
        df_train, df_val = train_test_split(df, test_size=0.1, random_state=42, stratify=df['is_fraud'])

        # scale the data
        scaler = StandardScaler()
        scaler.fit(df_train[input_features])

        df_train[input_features]=scaler.transform(df_train[input_features])
        df_val[input_features]=scaler.transform(df_val[input_features])

        if augmented==True:
            # save augmented data
            df_train.to_csv('../data/fraudTrain_Augmented.csv')
            df_val.to_csv('../data/fraudVal_Augmented.csv')

        # separate Input and Label
        X_train = df_train[input_features]
        y_train = df_train[output_feature]

        X_val = df_val[input_features]
        y_val = df_val[output_feature]  

        X_train= torch.Tensor(X_train.values)
        y_train = torch.Tensor(y_train.values)
        X_val= torch.Tensor(X_val.values)
        y_val = torch.Tensor(y_val.values)

        # change it to Dataloader objects
        train_set = MapStyleFraudDataset(X_train,y_train )
        val_set = MapStyleFraudDataset(X_val,y_val)

        trainloader = DataLoader(train_set, batch_size=64, num_workers=0)
        validloader = DataLoader(val_set, batch_size=64, num_workers=0)

        return trainloader, validloader

    else:

        # scale the data
        scaler = StandardScaler()
        scaler.fit(df[input_features])
        df[input_features]=scaler.transform(df[input_features])

        X_test = df[input_features]
        y_test = df[output_feature]  

        X_test= torch.Tensor(X_test.values)
        y_test = torch.Tensor(y_test.values)

        test_set = MapStyleFraudDataset(X_test,y_test)

        testloader = DataLoader(test_set, batch_size=64, num_workers=0)

        return testloader

In [3]:
trainloader, validloader = dataloading(for_training=True, input_features=INPUT_FEATURES, output_feature=OUTPUT_FEATURE, sampling_strategy=0.3)

<h1>Define MLP Binary Classifier + Train</h1>

In [3]:
class BinaryMLPClassifier(nn.Module):
    def __init__(self, n_inputs):
        super(BinaryMLPClassifier, self).__init__()
        # First hidden layer
        self.hidden1 = nn.Linear(n_inputs, 20)
        self.act1 = nn.ReLU()
        # Second hidden layer
        self.hidden2 = nn.Linear(20, 10)
        self.act2 = nn.ReLU()
        # Third hidden layer
        self.hidden3 = nn.Linear(10,1)
        self.act3 = nn.Sigmoid()
        
    def forward(self, X):
        #Input to the first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
        # Second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # Third hidden layer
        X = self.hidden3(X)
        X = self.act3(X)
        return X

# not used here because of bugs
def weighted_binary_cross_entropy(output, target, weights=None):
        
    if weights is not None:
        assert len(weights) == 2
        
        loss = weights[1] * (target * torch.log(output)) + \
               weights[0] * ((1 - target) * torch.log(1 - output))
    else:
        loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)

    return torch.neg(torch.mean(loss))

def train_MLP(trainloader, validloader, num_epochs, patience):

    model = BinaryMLPClassifier(len(INPUT_FEATURES))
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    best_val_loss = np.inf
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        print(f'Epoch {epoch+1}')
        model.train()
        # Iterate through training data loader
        for i, data in enumerate(trainloader):
            X, y = data
            y.to('cuda')
            X.to('cuda')
            optimizer.zero_grad()
            outputs = model.forward(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            running_loss+=loss.item()
            print('\rEpoch: {}\tbatch: {}\tLoss =  {:.3f}'.format(epoch, i, loss), end="")
        
        print("\n")
        # validation
        model.eval()
        with torch.no_grad():

            val_loss = validation(validloader,model)
            print(f"Epoch {epoch+1}: train CE loss = {running_loss/len(trainloader)}", f"|| Valid: CE loss = {val_loss}")
            
        # early-stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            dict_model = model.state_dict()
            pat = 0
        else:
            pat += 1
            print("pat ", pat)
            if pat == patience:
                print("Early Stopping: Validation Loss did not decrease for", patience, "epochs.")
                break

    torch.save(dict_model, 'mlp_classifier_no_SMOTE.pt')
    
    return model

def validation(validloader, model):

    running_loss = 0.0
    
    for _, data in enumerate(validloader,0):
        X,y = data
        # forward the input
        output= model.forward(X)
        # loss
        loss = nn.BCELoss()(output, y)
        running_loss+=loss.item()

    return running_loss/len(validloader)



In [None]:
model = train_MLP(trainloader=trainloader, validloader=validloader, num_epochs=30, patience=10)

<h2>Testing results of SMTK + MLPC / MLPC <h2>

In [4]:
def compute_metrics(dataloader, model_path):
    """
    
    This function returns the accuracy of the model on the data given in inputs
    """
    model = BinaryMLPClassifier(len(INPUT_FEATURES))
    model.load_state_dict(torch.load(model_path))
    model.eval()
    
    list_labels = []
    list_pred = []

    loss = 0.0
    for (inputs, labels) in dataloader:
        outputs = model.forward(inputs)
        loss += nn.BCELoss()(outputs, labels)
        _,predicted = torch.max(outputs, 1)
        list_labels.append(labels)
        list_pred.append(predicted)

    y_true = torch.cat(list_labels).numpy()
    y_pred = torch.cat(list_pred).numpy()
    
    loss = loss / len(dataloader)
    acc = accuracy_score(y_true, y_pred) * 100
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    
    return loss, acc, macro_f1

In [5]:
testloader = dataloading(for_training=False, input_features=INPUT_FEATURES, output_feature=OUTPUT_FEATURE, sampling_strategy=0.3)

In [12]:
SMTK_MLPC = '../model/mlp_classifier_SMOTE.pt'
MLPC = '../model/mlp_classifier_trained.pt'
loss_smtk_mlpc, acc_smtk_mlpc, macro_f1_smtk_mlpc = compute_metrics(testloader, SMTK_MLPC)
loss_mlpc, acc_mlpc, macro_f1_mlpc = compute_metrics(testloader, MLPC)

In [13]:
loss_smtk_mlpc, acc_smtk_mlpc, macro_f1_smtk_mlpc


(tensor(0.7018, grad_fn=<DivBackward0>), 99.61401355721146, 0.4990331679727538)

In [14]:
loss_mlpc, acc_mlpc, macro_f1_mlpc

(tensor(0.1657, grad_fn=<DivBackward0>), 99.61401355721146, 0.4990331679727538)

<h1>Define AutoEncoder model + Train</h1>

In [12]:
class Autoencoder(nn.Module):
    def __init__(self, dim_input, dim_output):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(dim_input, 10),
            # nn.BatchNorm1d(10),
            # nn.LeakyReLU(),
            nn.ReLU(),
            nn.Linear(10, 8),
            # nn.BatchNorm1d(8),
            # nn.LeakyReLU()
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(8, 10),
            # nn.BatchNorm1d(10),
            # nn.LeakyReLU(),
            nn.ReLU(),
            nn.Linear(10, dim_output),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def feature_extration(self, x):
        return self.encoder(x)


def train(num_epoch,train_loader, valid_loader, patience, lr):
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = Autoencoder(len(INPUT_FEATURES), len(INPUT_FEATURES))
    # Define optimizer
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    best_val_loss = np.inf

    model.train()
    # Train the model for the specified number of epochs
    for epoch in range(num_epoch):
        running_loss = 0.0
        for i,data in enumerate(train_loader,0):
            inputs,_ = data
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = model.forward(inputs)
            loss = nn.MSELoss()(outputs, inputs)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            print('\rEpoch: {}\tbatch: {}\tLoss =  {:.3f}'.format(epoch, i, loss), end="")
        
        # validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():

            for data in valid_loader:
                inputs,_ = data
                inputs = inputs.to(device)
                outputs = model.forward(inputs)
                loss = nn.MSELoss()(outputs, inputs)
                val_loss += loss.item()

            val_loss *= (1/len(valid_loader))   
            print(f"Epoch {epoch+1}: train MSE loss = {running_loss/len(trainloader)}", f"|| Valid: MSE loss = {val_loss}")
            
        # early-stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            dict_model = model.state_dict()
            pat = 0
        else:
            pat += 1
            print("pat ", pat)
            if pat == patience:
                print("Early Stopping: Validation Loss did not decrease for", patience, "epochs.")
                break

    torch.save(dict_model, 'AutoEncoder.pt')
    
    return model


In [13]:
model = train(num_epoch=100, train_loader=trainloader, valid_loader=validloader, patience=10, lr=2e-4)

Epoch: 0	batch: 18234	Loss =  0.6193Epoch 1: train MSE loss = 0.7811439971417834 || Valid: MSE loss = 0.7248542141784985
Epoch: 1	batch: 18234	Loss =  0.5950Epoch 2: train MSE loss = 0.6641287059531592 || Valid: MSE loss = 0.695713406104317
Epoch: 2	batch: 18234	Loss =  0.5647Epoch 3: train MSE loss = 0.6387061586159695 || Valid: MSE loss = 0.6603172763145553
Epoch: 3	batch: 18234	Loss =  0.5492Epoch 4: train MSE loss = 0.6071388844908313 || Valid: MSE loss = 0.6458642280742813
Epoch: 4	batch: 18234	Loss =  0.5471Epoch 5: train MSE loss = 0.6018924112740954 || Valid: MSE loss = 0.6443058320555007
Epoch: 5	batch: 18234	Loss =  0.5460Epoch 6: train MSE loss = 0.6010374129619342 || Valid: MSE loss = 0.643734077749784
Epoch: 6	batch: 18234	Loss =  0.5460Epoch 7: train MSE loss = 0.6005818438637835 || Valid: MSE loss = 0.643412300354751
Epoch: 7	batch: 3036	Loss =  0.560

KeyboardInterrupt: 

In [7]:
testloader = dataloading(for_training=False, input_features=INPUT_FEATURES, output_feature=OUTPUT_FEATURE, sampling_strategy=0.3)

<h1> Feature extraction + Training MLPC on reduced data</h1>

In [15]:
def feature_extraction_AE(dataloader, model_path):

    # load pre trained AE model
    model = Autoencoder(len(INPUT_FEATURES), len(INPUT_FEATURES))
    model.load_state_dict(torch.load(model_path))
    model.eval()

    list_outputs = []
    list_labels = []

    for (inputs, labels) in dataloader:
        
        outputs = model.feature_extration(inputs)
        list_labels.append(labels)
        list_outputs.append(outputs)
    
    y = torch.cat(list_labels)
    features_extracted = torch.cat(list_outputs)

    return features_extracted, y

class BinaryMLPClassifierWithFeaturedExtracted(nn.Module):
    def __init__(self, n_inputs):
        super(BinaryMLPClassifierWithFeaturedExtracted, self).__init__()
        # First hidden layer
        self.hidden1 = nn.Linear(n_inputs, 6)
        self.act1 = nn.ReLU()
        # Second hidden layer
        self.hidden2 = nn.Linear(6, 3)
        self.act2 = nn.ReLU()
        # Third hidden layer
        self.hidden3 = nn.Linear(3,1)
        self.act3 = nn.Sigmoid()
        
    def forward(self, X):
        #Input to the first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)
        # Second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)
        # Third hidden layer
        X = self.hidden3(X)
        X = self.act3(X)
        return X

def get_reduced_dataloader(is_AE, dataloader, path_model):
    if is_AE==True:
        # features extracted thanks to AE
        features_extracted, y = feature_extraction_AE(dataloader, path_model)

        X_train, X_val, y_train, y_val = train_test_split(features_extracted,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=42)
        
        # change it to Dataloader objects
        train_set = MapStyleFraudDataset(X_train,y_train )
        val_set = MapStyleFraudDataset(X_val,y_val)

        trainloader = DataLoader(train_set, batch_size=32, num_workers=0)
        validloader = DataLoader(val_set, batch_size=32, num_workers=0)
        
        return trainloader, validloader
    else:
        # feature extracted thanks to PCA
        print('Not yet implemented ! ')
        return None
        
def train_AE_MLPC(trainloader, is_augmented=False):
    
    # Using the pretrained AE on the right data
    if is_augmented == True:
        model_path = '../model/AutoEncoder_SMOTE.pt'
    else:
        model_path = '../model/AutoEncoder_no_SMOTE.pt'

    trainloader, validloader = get_reduced_dataloader(True, trainloader, model_path)




In [20]:
AE = '../model/AutoEncoder_no_SMOTE.pt'
SMTK_AE = '../model/AutoEncoder_SMOTE.pt'

feature_extracted, y = feature_extraction_AE(trainloader, AE)

In [27]:
testloader

<torch.utils.data.dataloader.DataLoader at 0x255f97aa4d0>

<h1>Evaluation + Results of SMTK + AE / AE / SMTK + AE + MLPC / AE + MLPC</h1>

In [6]:

# AutoEncoder are basically regression so : 
# calculate R2 = 1 - SSE/SST where SST = som(xi - mean(xi))

def compute_metrics_AE(dataloader, model_path):
    """
    
    This function returns the accuracy of the model on the data given in inputs
    """
    model = Autoencoder(len(INPUT_FEATURES), len(INPUT_FEATURES))
    model.load_state_dict(torch.load(model_path))
    model.eval()
    
    list_inputs = []
    list_outputs = []

    loss = 0.0
    for (inputs, labels) in dataloader:
        outputs = model.forward(inputs)
        loss += nn.MSELoss()(outputs, inputs)
        list_inputs.append(inputs)
        list_outputs.append(outputs)

    inputs_fin = torch.cat(list_inputs)
    outputs_fin = torch.cat(list_outputs)
    
    metric = R2Score()
    metric.update(inputs_fin, outputs_fin)
    r2 = metric.compute()

    return r2

In [8]:
AE = 'AutoEncoder.pt'
SMT_AE = '../model/AutoEncoder_SMOTE.pt'

r2 = compute_metrics_AE(testloader, AE)

In [9]:
r2

tensor(-5.2656)