In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, f1_score

In [72]:
# Columns fed to the model. The '*_encoded' columns are produced by
# encoding_columns(); 'age', 'hour', 'daily', 'day' and 'month' are derived
# in preprocessing().
INPUT_FEATURES = [
    'category_encoded',
    'amt',
    'gender_encoded',
    'city_encoded',
    'state_encoded',
    'city_pop',
    'job_encoded',
    'age',
    'hour',
    'daily',
    'day',
    'month',
]

# Target column, kept as a one-element list so that df[OUTPUT_FEATURE]
# selects a single-column DataFrame rather than a Series.
OUTPUT_FEATURE = ['is_fraud']

def preprocessing(df):
    """Clean the raw transaction frame and derive age and calendar features.

    A new frame is returned; the caller's ``df`` is left untouched.

    Steps:
      * drop the 'Unnamed: 0' column when it is present,
      * drop exact duplicate rows and rows containing any missing value,
      * add 'age' (current calendar year minus the birth year parsed from
        'dob', which must be formatted as YYYY-MM-DD),
      * add 'trans_datetime' (parsed 'trans_date_trans_time') plus 'hour',
        'daily' (day of month), 'day' (day of week, Monday=0) and 'month',
      * drop the original 'trans_date_trans_time' string column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'dob' and 'trans_date_trans_time' columns.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the derived columns appended.
    """
    # Removed before de-duplication: 'Unnamed: 0' is presumably a saved row
    # index (TODO confirm), which would otherwise make every row look unique.
    # errors='ignore' keeps the function usable on frames without the column.
    cleaned = df.drop(columns='Unnamed: 0', errors='ignore')

    # dropna() returns a new frame, so its result has to be kept; calling it
    # without assignment leaves the missing rows in place.
    cleaned = cleaned.drop_duplicates().dropna(axis=0)

    birth_year = pd.to_datetime(cleaned['dob'], format='%Y-%m-%d').dt.year
    trans_datetime = pd.to_datetime(cleaned['trans_date_trans_time'])

    # NOTE(review): 'age' is relative to the year in which the code runs, not
    # to the transaction date, so absolute values shift from year to year.
    return cleaned.assign(
        age=datetime.now().year - birth_year,
        trans_datetime=trans_datetime,
        hour=trans_datetime.dt.hour,
        daily=trans_datetime.dt.day,
        day=trans_datetime.dt.dayofweek,
        month=trans_datetime.dt.month,
    ).drop(columns='trans_date_trans_time')

def smoteTomek_augmentation(df, sampling_strategy):
    """Resample ``df`` with SMOTETomek and return features and label together.

    The columns listed in INPUT_FEATURES are used as inputs and the columns
    in OUTPUT_FEATURE as the target; every other column of ``df`` is absent
    from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the INPUT_FEATURES and OUTPUT_FEATURE columns.
    sampling_strategy
        Forwarded unchanged to ``SMOTETomek``.

    Returns
    -------
    pandas.DataFrame
        Resampled features and target concatenated column-wise.
    """
    # Fixed seed so repeated runs give the same resampled frame.
    resampler = SMOTETomek(sampling_strategy=sampling_strategy, random_state=42)

    features = df[INPUT_FEATURES]
    target = df[OUTPUT_FEATURE]
    resampled_features, resampled_target = resampler.fit_resample(features, target)

    return pd.concat([resampled_features, resampled_target], axis=1)

def encoding_columns(df):
    """Add integer-coded versions of the categorical columns to ``df``.

    For each of 'category', 'gender', 'city', 'state' and 'job' a
    LabelEncoder is fitted on that column and the codes are stored in a new
    '<name>_encoded' column. ``df`` is modified in place and also returned.

    NOTE(review): the encoder is fitted on whatever frame is passed in, so
    codes obtained from two different frames are not guaranteed to agree.
    """
    encoded_names = {
        'category': 'category_encoded',
        'gender': 'gender_encoded',
        'city': 'city_encoded',
        'state': 'state_encoded',
        'job': 'job_encoded',
    }

    encoder = LabelEncoder()
    for source_column, target_column in encoded_names.items():
        # fit_transform refits the encoder, so reusing one instance is safe.
        df[target_column] = encoder.fit_transform(df[source_column])

    return df

def dataloading(for_training, input_features, output_feature, augmented=False, sampling_strategy=0.3, scaler=None):
    """Load one of the fraud CSV files and return standardised inputs and labels.

    The frame is passed through ``preprocessing`` and ``encoding_columns``,
    optionally resampled with ``smoteTomek_augmentation``, and the columns in
    ``input_features`` are standardised.

    Parameters
    ----------
    for_training : bool
        True reads '../data/fraudTrain.csv', keeps only rows with
        ``is_fraud == 0`` and returns a 90/10 train/validation split.
        False reads '../data/fraudTest.csv' and returns it whole.
    input_features : list of str
        Columns to standardise and return as model inputs.
    output_feature : list of str
        Column(s) returned as labels.
    augmented : bool, default False
        When truthy, resample the encoded frame with SMOTETomek before
        splitting/scaling.
    sampling_strategy : default 0.3
        Forwarded to ``smoteTomek_augmentation``.
    scaler : object with ``fit``/``transform``, optional
        ``None`` (default) keeps the previous behaviour: a new StandardScaler
        is fitted on the training split (``for_training=True``) or on the
        test frame itself (``for_training=False``). When an object is given,
        it is fitted in place on the training split in training mode, and
        used as-is (it must already be fitted) in test mode, so the test set
        can be scaled with the training statistics.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val)`` when ``for_training`` is true,
        otherwise ``(X_test, y_test)``.

    NOTE(review): ``encoding_columns`` fits its label encoders on the frame
    loaded by this call, so category codes are not guaranteed to match
    between the training and the test call.
    """
    if for_training:
        df = pd.read_csv('../data/fraudTrain.csv')
        # Only non-fraud rows are kept for training. copy() detaches the
        # filtered rows so later column assignments do not target a slice.
        df = df[df['is_fraud'] == 0].copy()
    else:
        df = pd.read_csv('../data/fraudTest.csv')

    df = preprocessing(df)
    df = encoding_columns(df)

    # Augment the data
    if augmented:
        df = smoteTomek_augmentation(df, sampling_strategy)

    # A caller-supplied scaler is only refitted in training mode; in test
    # mode it is assumed to carry the training statistics already.
    must_fit = for_training or scaler is None
    if scaler is None:
        scaler = StandardScaler()

    if for_training:
        # split between training and validation data, 90% / 10%
        df_train, df_val = train_test_split(df, test_size=0.1, random_state=42, stratify=df['is_fraud'])
        # Independent copies: the scaled values are written back below.
        df_train = df_train.copy()
        df_val = df_val.copy()

        # scale the data with statistics from the training split only
        if must_fit:
            scaler.fit(df_train[input_features])

        df_train[input_features] = scaler.transform(df_train[input_features])
        df_val[input_features] = scaler.transform(df_val[input_features])

        # separate input and label
        X_train = df_train[input_features]
        y_train = df_train[output_feature]

        X_val = df_val[input_features]
        y_val = df_val[output_feature]

        return X_train, y_train, X_val, y_val

    df = df.copy()

    # scale the data
    if must_fit:
        scaler.fit(df[input_features])
    df[input_features] = scaler.transform(df[input_features])

    X_test = df[input_features]
    y_test = df[output_feature]

    return X_test, y_test


In [73]:
# Training / validation data: non-fraud rows of the training file only.
X_train, y_train, X_val, y_val = dataloading(
    for_training=True,
    input_features=INPUT_FEATURES,
    output_feature=OUTPUT_FEATURE,
)

# Test data, resampled with SMOTETomek (default sampling_strategy) before scaling.
X_test, y_test = dataloading(
    for_training=False,
    input_features=INPUT_FEATURES,
    output_feature=OUTPUT_FEATURE,
    augmented=True,
)

In [62]:
def get_anomaly_scores(df_original, df_restored):
    """Return the per-row squared reconstruction error.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Original samples, one per row; its index labels the result.
    df_restored : array-like
        Reconstructed samples with the same shape as ``df_original``.

    Returns
    -------
    pandas.Series
        Sum of squared differences across columns for every row, indexed
        like ``df_original``.
    """
    # Differences are taken positionally on the raw arrays, so the two
    # inputs are matched by row order rather than by index label.
    residual = np.array(df_original) - np.array(df_restored)
    squared_error = (residual ** 2).sum(axis=1)
    return pd.Series(squared_error, index=df_original.index)

def is_anomaly(data, pca, threshold):
    """Flag each sample whose reconstruction error exceeds ``threshold``.

    Every row of ``data`` is projected with ``pca.transform`` and mapped back
    with ``pca.inverse_transform``; the squared error between the row and its
    reconstruction is summed across features, exactly as in
    ``get_anomaly_scores``.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        Samples to score, one per row.
    pca : object
        Fitted model exposing ``transform`` and ``inverse_transform``.
    threshold : float
        Errors strictly greater than this value are reported as anomalies.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row of ``data``.
    """
    restored = pca.inverse_transform(pca.transform(data))

    # Work on plain arrays and reduce along axis=1: without an axis the sum
    # collapses a whole ndarray batch into one number, and for a DataFrame
    # it sums down the columns instead of across each sample.
    residual = np.asarray(data) - np.asarray(restored)
    loss = np.sum(residual ** 2, axis=1)

    return loss > threshold


# Fit an 8-component PCA on the training inputs. fit() is enough here: the
# projected training data that fit_transform() would return is never used.
pca = PCA(n_components=8, random_state=0)
pca.fit(X_train)

# Project the test inputs and map them back to the original feature space.
df_pca = pd.DataFrame(pca.transform(X_test), index=X_test.index)
df_restored = pd.DataFrame(pca.inverse_transform(df_pca), index=df_pca.index)

# Per-row squared reconstruction error of the test inputs.
reconstruction_errors = get_anomaly_scores(X_test, df_restored)

In [63]:
def train(nm_PCA, X_train, X_val):
    """Fit a PCA model and derive an anomaly threshold from validation data.

    Parameters
    ----------
    nm_PCA : int
        Number of principal components to keep.
    X_train : pandas.DataFrame
        Inputs the PCA is fitted on.
    X_val : pandas.DataFrame
        Inputs used to compute the threshold.

    Returns
    -------
    tuple
        ``(pca, theta)`` where ``pca`` is the fitted model and ``theta`` is
        the mean squared reconstruction error over the rows of ``X_val``.
    """
    pca = PCA(n_components=nm_PCA, random_state=0)
    # Only the fitted model is needed; fit_transform() would additionally
    # project all of X_train and the result was thrown away.
    pca.fit(X_train)

    df_pca = pd.DataFrame(pca.transform(X_val), index=X_val.index)
    df_restored = pd.DataFrame(pca.inverse_transform(df_pca), index=df_pca.index)

    reconstruction_errors = get_anomaly_scores(X_val, df_restored)

    # Threshold = average validation error; rows above it are flagged by test().
    theta = np.mean(reconstruction_errors)

    return pca, theta

def test(pca, X_test, y_test, threshold):
    """Predict anomalies on ``X_test`` and pair them with the true labels.

    Parameters
    ----------
    pca : fitted PCA model
        Used to project ``X_test`` and reconstruct it.
    X_test : pandas.DataFrame
        Inputs to score.
    y_test : pandas.DataFrame
        Labels; the 'is_fraud' column is read.
    threshold : float
        Rows whose reconstruction error is strictly greater are predicted 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'pred' (0/1 prediction) and 'True_class' (label), in the row
        order of the inputs and with a fresh default index.
    """
    projected = pd.DataFrame(pca.transform(X_test), index=X_test.index)
    reconstructed = pd.DataFrame(pca.inverse_transform(projected), index=projected.index)

    scores = get_anomaly_scores(X_test, reconstructed)
    exceeds_threshold = scores > threshold
    predictions = exceeds_threshold.astype(int)

    # to_numpy() drops the indexes so both columns are aligned by position.
    return pd.DataFrame({
        'pred': predictions.to_numpy(),
        'True_class': y_test['is_fraud'].to_numpy(),
    })

In [64]:
# Single run with 4 principal components; theta is the validation-derived threshold.
pca, theta = train(nm_PCA=4, X_train=X_train, X_val=X_val)
error_df = test(pca=pca, X_test=X_test, y_test=y_test, threshold=theta)

In [74]:
# Sweep the number of principal components from 2 to 8 and collect, per
# setting, a row [n_components, accuracy, recall, precision, f1].
report = []
for i in range(2, 9):
    pca, theta = train(i, X_train, X_val)
    error_df = test(pca, X_test, y_test, theta)

    values = [i] + [
        metric(error_df['True_class'], error_df['pred'])
        for metric in (accuracy_score, recall_score, precision_score, f1_score)
    ]
    report.append(values)

In [None]:
# Show the sweep results: one [n_components, accuracy, recall, precision, f1] row per setting.
report

In [66]:
# Print the classification metrics for the predictions currently held in error_df.
print(
    *(
        f"{label} {metric(error_df['True_class'], error_df['pred'])!s}"
        for label, metric in (
            (" Accuracy: ", accuracy_score),
            (" Recall: ", recall_score),
            (" Precision: ", precision_score),
            (" F1_score: ", f1_score),
        )
    ),
    sep="\n",
)

 Accuracy:  0.7149386650447438
 Recall:  0.6983682983682984
 Precision:  0.009405707468684268
 F1_score:  0.018561427420853725
