In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
import mlprepare as mlp
import deep_tabular_augmentation as dta
from torch.utils.data import Dataset, DataLoader
from sklearn.ensemble import RandomForestClassifier

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_PATH = 'data/creditcard.csv'

df = pd.read_csv(DATA_PATH)

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


We want a balanced dataset, hence we want to create as many fake data as the difference between the two classes.

In [4]:
difference_in_class_occurences = df['Class'].value_counts()[0]-df['Class'].value_counts()[1]
difference_in_class_occurences

283823

In [5]:
cols = df.columns

In [6]:
train_df, test_df = train_test_split(df[df['Class']==1], test_size=0.3, random_state=42)

In [7]:
X_train_fraud, X_test_fraud, y_train, y_test, scaler_fraud_data = mlp.cont_standardize(train_df.iloc[:,:30], test_df.iloc[:,:30], train_df.iloc[:,30], test_df.iloc[:,30], cat_type=None, transform_y=False, path='scaler_objects/fraud_scaler', standardizer='StandardScaler')

In [8]:
class DataBuilder(Dataset):
    def __init__(self, X_train, X_test, train=True):
        self.X_train, self.X_test = X_train, X_test
        if train:
            self.X_train['class']=1
            self.x = torch.from_numpy(self.X_train.values).type(torch.FloatTensor)
            self.len=self.x.shape[0]
        else:
            self.X_test['class']=1
            self.x = torch.from_numpy(self.X_test.values).type(torch.FloatTensor)
            self.len=self.x.shape[0]
        del self.X_train
        del self.X_test
    def __getitem__(self,index):      
        return self.x[index]
    def __len__(self):
        return self.len

In [9]:
traindata_set=DataBuilder(X_train_fraud, X_test_fraud, train=True)
testdata_set=DataBuilder(X_train_fraud, X_test_fraud, train=False)

trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)

In [10]:
trainloader.dataset.x[0]

tensor([ 0.6384,  0.5539, -0.0310,  0.1643,  0.8084,  0.5360,  0.7849,  0.4121,
         0.1132, -0.0959,  0.2249,  0.3387, -0.5321,  0.9803, -0.0817, -0.9667,
         0.2035,  0.3364,  0.6211,  0.6760,  0.6165, -0.0287, -0.6102,  0.2108,
        -1.0567,  0.0951,  0.8003,  0.5116,  0.4682,  0.3214,  1.0000])

In [11]:
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12

autoenc_model = dta.AutoencoderModel(trainloader, testloader, device, D_in, H, H2, latent_dim=5)

In [12]:
autoenc_model_fit = autoenc_model.fit(epochs=1000)

====> Epoch: 200 Average training loss: 24.5862
====> Epoch: 200 Average test loss: 30.0532
====> Epoch: 400 Average training loss: 19.9307
====> Epoch: 400 Average test loss: 25.6350
====> Epoch: 600 Average training loss: 17.9302
====> Epoch: 600 Average test loss: 23.6630
====> Epoch: 800 Average training loss: 16.8525
====> Epoch: 800 Average test loss: 22.6172
====> Epoch: 1000 Average training loss: 15.6852
====> Epoch: 1000 Average test loss: 21.8004


Let's see how the created data looks like:

In [13]:
cols_fake = cols.to_list()
cols_fake.remove('Class')

In [15]:
df_fake = autoenc_model_fit.predict_df(no_samples=difference_in_class_occurences, cols=cols, scaler=scaler_fraud_data, cont_vars=cols_fake)
df_fake['Class']=1
df_fake.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,109272.789062,-2.674968,2.742105,-4.766198,3.369049,-1.304484,-1.170368,-4.006643,1.435492,-1.821178,...,0.911522,0.156942,-0.264695,-0.043318,-0.04109,0.26936,-0.080869,0.239243,76.993843,1
1,94793.429688,-2.037335,1.953935,-2.815907,2.604125,-0.827629,-0.885306,-0.543145,-0.141685,-0.728007,...,0.591901,-0.439533,-0.196395,-0.257738,0.354994,-0.065376,0.232742,0.274049,46.49649,1
2,91415.085938,-3.184056,3.275564,-6.183428,4.601014,-2.728669,-1.287761,-4.666717,1.131586,-2.747678,...,1.36509,0.214571,-0.216123,-0.06478,0.00436,0.211144,0.318165,0.215923,145.913071,1
3,65620.359375,1.819347,1.418351,-1.374298,9.095325,-0.347908,-3.275415,-7.162862,1.061794,-5.659923,...,2.559348,0.823909,-0.383635,0.02803,-0.833282,0.600049,0.402766,-0.342388,44.143433,1
4,95402.164062,-0.664518,0.981763,-2.693872,2.449151,0.843871,-0.846841,-1.351984,-0.287119,-0.594204,...,0.606944,-0.335811,-0.310156,-0.227214,0.233913,-0.070585,0.072488,0.163716,30.300911,1


Now the data with random noise:

In [16]:
df_fake_with_noise = autoenc_model_fit.predict_with_noise_df(no_samples=difference_in_class_occurences, cols=cols, mu=0, sigma=0.05, scaler=scaler_fraud_data, cont_vars=cols_fake)
df_fake_with_noise['Class']=1
df_fake_with_noise.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,50437.527344,-9.556958,4.605006,-12.132256,7.313379,-5.711418,-2.756941,-11.479061,0.793535,-6.283787,...,-1.163612,0.995176,1.936402,-0.22192,-0.612316,0.673865,1.273914,0.103194,-119.967636,1
1,96943.054688,-1.593202,1.897814,-3.886051,3.293553,0.368639,-1.002483,-1.777758,-0.862877,-1.318851,...,0.87076,-0.275088,0.182934,-0.312897,0.197908,0.027455,0.259814,0.142093,73.28186,1
2,119730.992188,-2.192357,2.673203,-4.548784,3.282219,-0.932281,-1.227347,-3.483977,1.202486,-1.830701,...,0.765426,0.013292,-0.271053,-0.155405,0.023373,0.255064,-0.005519,0.217258,66.22625,1
3,56635.769531,-2.057692,2.908793,-4.869972,8.96898,-2.642726,-2.814321,-9.148726,2.110916,-5.442397,...,2.330713,0.437243,-0.77163,-0.182962,-0.346962,0.425942,0.876128,-0.11728,88.198532,1
4,93458.359375,-1.829165,2.178387,-3.044882,2.72024,-0.768339,-0.923028,-0.331809,0.547264,-0.655354,...,0.856194,-0.499558,-0.074313,-0.281293,0.32752,-0.066577,0.219212,0.253152,67.516815,1


In [27]:
df[df['Class']==1].groupby('Class').describe().loc[:,(slice(None),['mean'])]

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [28]:
df_fake.describe().loc[['mean']]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
mean,80303.828125,-4.467331,3.469041,-6.645338,4.469844,-3.003313,-1.322752,-5.325488,0.590965,-2.527091,...,0.536949,0.127002,-0.052788,-0.096175,0.028246,0.057165,0.1588,0.067378,115.180275,1.0


In [29]:
df_fake_with_noise.describe().loc[['mean']]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
mean,80299.735488,-4.467342,3.468569,-6.646258,4.469444,-3.003841,-1.322793,-5.325636,0.591395,-2.527251,...,0.53693,0.127035,-0.052864,-0.096146,0.02826,0.057161,0.158841,0.067445,115.172265,1.0


## Train Random Forest

### Create three dataframes: the original, the original appended with fake_data, the original appended with fake data with noise 

In [30]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
train_df_fake = train_df.append(df_fake)
train_df_fake_with_noise = train_df.append(df_fake_with_noise)

### Train model on original data

In [31]:
clf_original=RandomForestClassifier(n_estimators=100, max_features=0.5, min_samples_leaf=5, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf_original.fit(train_df.iloc[:,:30].values,train_df.iloc[:,30].values)

y_pred_original=clf_original.predict(test_df.iloc[:,:30].values)

confusion_matrix(test_df.iloc[:,30], y_pred_original)

array([[85300,     7],
       [   27,   109]], dtype=int64)

In [32]:
clf_fake=RandomForestClassifier(n_estimators=100, max_features=0.5, min_samples_leaf=5, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf_fake.fit(train_df_fake.iloc[:,:30].values,train_df_fake.iloc[:,30].values)

y_pred_fake=clf_fake.predict(test_df.iloc[:,:30].values)

confusion_matrix(test_df.iloc[:,30], y_pred_fake)

array([[85297,    10],
       [   26,   110]], dtype=int64)

In [33]:
clf_fake_with_noise=RandomForestClassifier(n_estimators=100, max_features=0.5, min_samples_leaf=5, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf_fake_with_noise.fit(train_df_fake_with_noise.iloc[:,:30].values,train_df_fake_with_noise.iloc[:,30].values)

y_pred_fake_with_noise=clf_fake.predict(test_df.iloc[:,:30].values)

confusion_matrix(test_df.iloc[:,30], y_pred_fake_with_noise)

array([[85297,    10],
       [   26,   110]], dtype=int64)