In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline
import tqdm
import torch.nn.functional as F
import tqdm
import impute
import PMLE
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Data Preprocessing

In [None]:
# Create dfs and split into train and val
identity = pd.read_csv('train_identity.csv')
trans = pd.read_csv('train_transaction.csv')
df = identity.merge(trans, on='TransactionID')
df = pd.get_dummies(df,drop_first=True)

#drop columns that are all NA or have only one value
# nunique() ignores NaN, so 0 means "all NA" and 1 means "a single distinct
# value"; this is the same rule as the old min()==max() check, without a bare
# `except` that could hide unrelated errors.
distinct_counts = df.nunique(dropna=True)
uninformative_cols = distinct_counts[distinct_counts <= 1].index
df = df.drop(columns=uninformative_cols)

#Obtain indices for non-fraud rows
# NOTE: these are row labels of `df`, not positions in the shuffled
# train/validation arrays produced below.
normal_inds = df[df.isFraud==0].index

#Train test split
X = df.drop('isFraud',axis=1)
y = df['isFraud']

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=.2,random_state=0)

#Establish floors and ceilings for columns
# Ranges are derived from the training split only and reused for validation.
limited_range = {}
for col in X_train.columns:
    if (X_train[col].nunique()<10)|(X_train[col].min()==0):
        limited_range[col]=(X_train[col].min(),X_train[col].max())

#Impute missing values and standardize
X_train, train_problems = impute.impute_missing_values(X_train,col_floor_ceiling_dict=limited_range)
# Impute the validation split itself (the previous version passed X_train a
# second time, so the "validation" features were a copy of the training data).
X_val, val_problems = impute.impute_missing_values(X_val,col_floor_ceiling_dict=limited_range)
problems = val_problems  # original name; it always held the result of the second call

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Reuse the statistics learned on the training split; re-fitting on the
# validation split would put the two sets on different scales.
X_val = sc.transform(X_val)

# Penalized Maximum Likelihood Estimation
Using Firth's Logistic Regression with Intercept Correction

In [None]:
#Get estimates using Firth's Logistic Regression with Intercept Correction
FLIC = PMLE.Firth_Logit(FLIC=True)
FLIC.fit(X_train,y_train)
FLIC_train_preds = FLIC.predict_proba(X_train)
# Validation predictions must come from the held-out rows; the previous
# version scored X_train twice.
FLIC_val_preds = FLIC.predict_proba(X_val)

# SMOTE Dataset Resampling
Estimation with XGBoost

In [None]:
#Get XGBoost estimate of SMOTE dataset
# Oversample the training split only. The validation split is left untouched:
# resampling it would add synthetic rows, so its predictions could no longer
# be lined up with y_val in the ensemble step.
smote = SMOTE(random_state=0)  # seeded so the synthetic rows are reproducible
X_train_smote, y_train_smote = smote.fit_resample(X_train,y_train)


xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
# NOTE(review): cross-validating on already-resampled data lets synthetic
# neighbours of a row land in a different fold; consider resampling inside
# each fold instead.
xgb_gs = GridSearchCV(xgb,params,cv=5)
xgb_gs.fit(X_train_smote,y_train_smote)

# Score the original (un-resampled) rows so each prediction vector has one
# entry per row of y_train / y_val, and keep only the positive-class column.
# The previous version assigned both results to `smote_train_preds`, leaving
# `smote_val_preds` undefined.
smote_train_preds = xgb_gs.predict_proba(X_train)[:, 1]
smote_val_preds = xgb_gs.predict_proba(X_val)[:, 1]

# Autoencoder Anomaly Detection
Using standard, regularized, and denoising autoencoders in PyTorch

In [None]:
#Convert non-fraud rows to pytorch tensor
# Select rows with a boolean mask taken from y_train. `normal_inds` holds row
# labels of the full `df`, which are not valid positions in the shuffled
# X_train array.
# Assumes X_train is still in the same row order as y_train - TODO confirm
# that impute.impute_missing_values preserves row order.
normal_mask = np.asarray(y_train) == 0
normal_train = np.asarray(X_train)[normal_mask,:]
normal_train_torch = torch.from_numpy(normal_train).type(torch.FloatTensor)
normal_torch = normal_train_torch  # original name kept as an alias

# Full train / validation matrices as tensors, used by the autoencoder cells.
X_train_torch = torch.from_numpy(np.asarray(X_train)).type(torch.FloatTensor)
X_val_torch = torch.from_numpy(np.asarray(X_val)).type(torch.FloatTensor)

In [None]:
# Create autoencoder class and training function
class AutoEncoder(nn.Module):
    """Single-hidden-layer autoencoder with an extra linear output layer.

    Layout: ``encoder (n_features -> hidden_nodes) -> ReLU ->
    decoder (hidden_nodes -> n_features) -> ReLU ->
    output_layer (n_features -> n_features)``.

    Args:
        n_features: width of the input (and of the reconstruction).
        hidden_nodes: width of the hidden representation.
        dropout: ``False``/``0``/``None`` disables dropout; ``True`` uses
            p=0.5; a float in ``[0, 1]`` is used as the drop probability.
            Dropout is applied to the encoder output before the ReLU.

    Raises:
        ValueError: if ``dropout`` is a number outside ``[0, 1]``.

    After each forward pass the hidden activations are available as
    ``self.hidden_layer``.
    """

    def __init__(self,n_features,hidden_nodes,dropout=False):
        super(AutoEncoder, self).__init__()
        self.n_features=n_features
        self.n_hidden = hidden_nodes
        self.encoder = nn.Linear(n_features,hidden_nodes)
        self.decoder = nn.Linear(hidden_nodes,n_features)
        self.output_layer = nn.Linear(n_features,n_features)
        self.dropout = dropout

        # Resolve the user-facing `dropout` argument to a probability once.
        if dropout is True:
            self._dropout_p = 0.5
        elif not dropout:
            self._dropout_p = 0.0
        else:
            self._dropout_p = float(dropout)
        if not 0.0 <= self._dropout_p <= 1.0:
            raise ValueError(
                'dropout must be in [0, 1], got {}'.format(dropout))

    def forward(self,x):
        """Return the reconstruction of ``x``; caches the hidden activations."""
        encoded = self.encoder(x)
        if self._dropout_p > 0:
            # Pass the configured rate and the module's mode explicitly:
            # F.dropout defaults to p=0.5 and training=True, which ignored the
            # requested rate and kept dropping units after model.eval().
            encoded = F.dropout(encoded, p=self._dropout_p,
                                training=self.training)
        x = F.relu(encoded)
        self.hidden_layer=x
        x = F.relu(self.decoder(x))
        x = self.output_layer(x)
        return x


def train_autoencoder(model, dataset,loss_func,optimizer, epochs=5, batch_size=32, 
                      lr=1e-3,noise=False,noise_factor=None):
    """Train ``model`` to reconstruct the rows of ``dataset``.

    Args:
        model: ``nn.Module`` mapping a batch to its reconstruction.
        dataset: tensor of training rows; it is batched with a shuffling
            ``DataLoader`` and also passed to ``model`` whole to compute the
            per-epoch loss.
        loss_func: callable ``loss_func(reconstruction, target)``.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over ``dataset``.
        batch_size: mini-batch size.
        lr: unused; the learning rate is whatever ``optimizer`` was built
            with. Kept so existing calls remain valid.
        noise: if truthy, Gaussian noise is added to the model *input*
            (denoising autoencoder); the loss target stays the clean batch.
        noise_factor: standard deviation multiplier for the noise; required
            when ``noise`` is truthy.

    Returns:
        A list with one ``(epoch_index, last_batch, last_reconstruction)``
        tuple per epoch (the reconstruction is detached).

    Raises:
        ValueError: if ``noise`` is truthy and ``noise_factor`` is ``None``.
    """
    if noise and noise_factor is None:
        raise ValueError('noise_factor must be set when noise=True')

    torch.manual_seed(0)
    if torch.is_tensor(dataset):
        # A dataset produced by another model may still carry its autograd
        # graph; detaching keeps backward() confined to `model`.
        dataset = dataset.detach()
    train_loader = torch.utils.data.DataLoader(dataset, 
                                               batch_size=batch_size, 
                                               shuffle=True)
    was_training = model.training
    outputs = []
    for epoch in range(epochs):
        model.train()
        print('\nEPOCH:{}\n'.format(epoch+1))
        batch, recon = None, None
        for counter, batch in enumerate(train_loader):
            if noise:
                model_input = batch + noise_factor * torch.randn_like(batch)
            else:
                model_input = batch
            optimizer.zero_grad()
            recon = model(model_input)

            # Always compare against the clean batch: with noise enabled the
            # model must learn to remove the corruption, not reproduce it.
            loss = loss_func(recon, batch)
            if (counter%50==0):
                print('Batch Loss: {:.4f}'.format(float(loss)))
            loss.backward()
            optimizer.step()

        # Full-dataset loss is for reporting only: no gradients, eval mode.
        model.eval()
        with torch.no_grad():
            epoch_loss = float(loss_func(model(dataset), dataset))
        print('Epoch {}: Loss: {:.4f}'.format(epoch+1, epoch_loss))

        # Previously this stored the notebook-global `X` instead of the batch.
        outputs.append((epoch, batch,
                        None if recon is None else recon.detach()),)

    model.train(was_training)  # leave the model in the mode the caller set
    return outputs
        

In [None]:
#Chain of autoencoders: each stage is trained, then applied to the previous
#stage's output for both the training rows and the validation rows.
#
# Fixes relative to the previous cells:
#   * every optimizer / forward call used an undefined name `ae`; each stage
#     now uses its own model,
#   * the first stage trained on `output2` before it existed,
#   * the dropout model overwrote `ae1`,
#   * `L1_Loss` did not match the defined `L1_loss`,
#   * no validation outputs were ever computed (`val_output7` was undefined),
#   * stage outputs kept their autograd graph, so `.numpy()` would fail.
#
# NOTE(review): apart from the first stage, the training set of each stage is
# kept exactly as originally written (output1/output2/output3), even where it
# differs from the tensor the stage is applied to - confirm this is intended.

n_features = X_train_torch.shape[1]  # derived from the data, not hard-coded
X_val_torch = torch.from_numpy(np.asarray(X_val)).type(torch.FloatTensor)


def _reconstruct(model, inputs):
    """Run ``model`` on ``inputs`` in eval mode without tracking gradients."""
    was_training = model.training
    model.eval()
    with torch.no_grad():
        result = model(inputs)
    model.train(was_training)
    return result


#standard autoencoder
ae1 = AutoEncoder(n_features,n_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(ae1.parameters(), lr=1e-3)
train_autoencoder(model=ae1,
                   dataset=normal_train_torch,
                   loss_func=loss_func,
                   optimizer=optimizer,
                   batch_size=512,
                   epochs=10)
output1 = _reconstruct(ae1, X_train_torch)
val_output1 = _reconstruct(ae1, X_val_torch)


#dropout autoencoder
# Own name so the first-stage model stays available for the validation chain.
ae_dropout = AutoEncoder(n_features,4000,dropout=0.5)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(ae_dropout.parameters(), lr=1e-3,momentum=0.9,nesterov=True)
train_autoencoder(model=ae_dropout,
                   dataset=normal_train_torch,
                   loss_func=loss_func,
                   optimizer=optimizer,
                   batch_size=1024,
                   epochs=40)
output2 = _reconstruct(ae_dropout, output1)
val_output2 = _reconstruct(ae_dropout, val_output1)


#L2 autoencoder
ae2 = AutoEncoder(n_features,n_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(ae2.parameters(), lr=1e-3,weight_decay=0.2)
train_autoencoder(model=ae2,
                   dataset=output2,
                   loss_func=loss_func,
                   optimizer=optimizer,
                   batch_size=512,
                   epochs=10)
output3 = _reconstruct(ae2, output2)
val_output3 = _reconstruct(ae2, val_output2)


#standard autoencoder
ae3 = AutoEncoder(n_features,1000)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(ae3.parameters(), lr=1e-3)
train_autoencoder(model=ae3,
                   dataset=output2,
                   loss_func=loss_func,
                   optimizer=optimizer,
                   batch_size=512,
                   epochs=10)
output4 = _reconstruct(ae3, output3)
val_output4 = _reconstruct(ae3, val_output3)


#denoising autoencoder
ae4 = AutoEncoder(n_features,n_features)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(ae4.parameters(), lr=1e-3)
train_autoencoder(model=ae4,
                   dataset=output1,
                   loss_func=loss_func,
                   optimizer=optimizer,
                   batch_size=512,
                   epochs=5,
                   noise=True,
                   noise_factor=0.25)
output5 = _reconstruct(ae4, output4)
val_output5 = _reconstruct(ae4, val_output4)


#standard autoencoder
ae5 = AutoEncoder(n_features,500)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(ae5.parameters(), lr=1e-3)
train_autoencoder(model=ae5,
                   dataset=output2,
                   loss_func=loss_func,
                   optimizer=optimizer,
                   batch_size=512,
                   epochs=10)
output6 = _reconstruct(ae5, output5)
val_output6 = _reconstruct(ae5, val_output5)


#L1 Regularization autoencoder
ae6 = AutoEncoder(n_features,n_features)


def L1_loss(recon,inputs,model=None,l1_weight=1.0):
    """MSE reconstruction loss plus an L1 penalty on a model's parameters.

    Args:
        recon: reconstruction produced by the model.
        inputs: target tensor.
        model: module whose parameters are penalised; defaults to ``ae6``.
        l1_weight: multiplier for the summed absolute parameter values.
            The default of 1.0 reproduces the original unscaled penalty.
            NOTE(review): unscaled, the penalty may dwarf the MSE term for a
            model this size - consider a much smaller weight.
    """
    if model is None:
        model = ae6
    loss = nn.MSELoss()(recon,inputs)
    for param in model.parameters():
        loss = loss + l1_weight * torch.sum(torch.abs(param))
    return loss


optimizer = torch.optim.Adam(ae6.parameters(), lr=1e-3)
train_autoencoder(model=ae6,
                   dataset=output3,
                   loss_func=L1_loss,
                   optimizer=optimizer,
                   batch_size=512,
                   epochs=10)
output7 = _reconstruct(ae6, output6)
val_output7 = _reconstruct(ae6, val_output6)


#Anomaly score = mean squared difference between the end of the chain and
#the (scaled) input features, per row.
output = output7.numpy()
val_output = val_output7.numpy()
anomaly_score = np.mean(np.power(output - np.asarray(X_train), 2), axis=1)
val_anomaly_score = np.mean(np.power(val_output - np.asarray(X_val), 2), axis=1)

# Ensemble results
Estimation using FLIC

In [None]:
def _positive_class(probs):
    """Return a 1-D array of positive-class probabilities.

    Accepts either a 1-D vector or a 2-D ``predict_proba``-style array, in
    which case the last column is taken. A 2-D array cannot be assigned to a
    single DataFrame column.
    """
    probs = np.asarray(probs)
    return probs[:, -1] if probs.ndim == 2 else probs.ravel()


# Build each frame in one step and give it the index of its target vector.
# y_train / y_val keep the shuffled index from train_test_split, so a default
# RangeIndex here would not line up with them if the estimator aligns on index.
# A length mismatch between any column and the target raises immediately.
ensemble_X_train = pd.DataFrame({
    'anomaly_score': np.asarray(anomaly_score),
    'FLIC': _positive_class(FLIC_train_preds),
    'smote': _positive_class(smote_train_preds),
}, index=y_train.index)
ensemble_X_val = pd.DataFrame({
    'anomaly_score': np.asarray(val_anomaly_score),
    'FLIC': _positive_class(FLIC_val_preds),
    'smote': _positive_class(smote_val_preds),
}, index=y_val.index)

In [None]:
# Second-stage fit: the same FLIC estimator object is re-fitted on the three
# first-stage scores and then scores the validation rows.
# NOTE(review): re-fitting in place replaces the first-stage FLIC fit, so the
# earlier FLIC prediction cell cannot be re-run after this one without
# re-fitting first.
ensemble_model = FLIC
ensemble_model.fit(ensemble_X_train, y_train)
preds = ensemble_model.predict_proba(ensemble_X_val)