In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
import mlprepare as mlp
import deep_tabular_augmentation as dta
from torch.utils.data import Dataset, DataLoader
from sklearn.ensemble import RandomForestClassifier

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_PATH = 'data/breast-cancer.csv'

df = pd.read_csv(DATA_PATH)

In [3]:
df.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,40-49,premefalse,15-19,0-2,True,3,right,left_up,False,recurrence-events
1,50-59,ge40,15-19,0-2,False,1,right,central,False,false-recurrence-events
2,50-59,ge40,35-39,0-2,False,2,left,left_low,False,recurrence-events
3,40-49,premefalse,35-39,0-2,True,3,right,left_low,True,false-recurrence-events
4,40-49,premefalse,30-34,3-5,True,2,left,right_up,False,recurrence-events


In [4]:
cols = df.columns

In [5]:
df['class'] = np.where(df['class']=='false-recurrence-events', 0, 1)

In [6]:
difference_in_class_occurences = df['class'].value_counts()[0]-df['class'].value_counts()[1]
difference_in_class_occurences

110

I would actually need another Autoencoder for that which can handle categorical values as input via embedding layer.

In [7]:
continuous_type = ['deg-malig']
categorical_type = ['age', 'mefalsepause', 'tumor-size', 'inv-falsedes', 'breast', 'breast-quad']
target_var = 'class'

In [8]:
df = mlp.df_to_type(df, cont_type=continuous_type, cat_type=categorical_type)

In [9]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [10]:
train_df, test_df, dict_list, dict_inv_list = mlp.cat_transform(train_df, test_df, cat_type = categorical_type)

In [11]:
train_df=train_df.fillna(0)
test_df=test_df.fillna(0)
train_df.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
251,3,3,4,5,True,2,2,5,True,1
92,2,3,8,1,False,1,1,3,False,1
201,3,1,6,5,False,3,1,2,False,1
126,1,3,7,1,False,2,2,5,False,0
124,5,1,3,1,False,2,2,2,False,0


Here, I treat categorical variables as if they were continuous

In [12]:
X_train, X_test, y_train, y_test, scaler = mlp.cont_standardize(train_df[train_df['class']==1].iloc[:,:9], test_df[test_df['class']==1].iloc[:,:9], train_df[train_df['class']==1].iloc[:,9], test_df[test_df['class']==1].iloc[:,9], cat_type=None, transform_y=False, path='scaler_objects', standardizer='StandardScaler')


Inbalanced Class -> recurrence-events 

I want to create data for the underrepresented class -> Class=1

In [13]:
class DataBuilder(Dataset):
    def __init__(self, X_train, X_test, y_train, y_test, train=True):
        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test
        if train:
            self.X_train['class']=self.y_train
            self.x = torch.from_numpy(self.X_train.values).type(torch.FloatTensor)
            self.len=self.x.shape[0]
        else:
            self.X_test['class']=self.y_test
            self.x = torch.from_numpy(self.X_test.values).type(torch.FloatTensor)
            self.len=self.x.shape[0]
        del self.X_train
        del self.X_test
        del self.y_train
        del self.y_test 
    def __getitem__(self,index):      
        return self.x[index]
    def __len__(self):
        return self.len

In [14]:
traindata_set=DataBuilder(X_train, X_test, y_train, y_test, train=True)
testdata_set=DataBuilder(X_train, X_test, y_train, y_test, train=False)

trainloader=DataLoader(dataset=traindata_set,batch_size=1024)
testloader=DataLoader(dataset=testdata_set,batch_size=1024)

In [15]:
trainloader.dataset.x[:1]

tensor([[-0.5152,  0.9253, -0.7024,  0.8244,  1.3784, -0.4921,  1.1094,  1.8507,
          1.1902,  1.0000]])

In [16]:
D_in = traindata_set.x.shape[1]
H = 50
H2 = 12

autoenc_model = dta.AutoencoderModel(trainloader, testloader, device, D_in, H, H2, latent_dim=5)

In [17]:
autoenc_model_fit = autoenc_model.fit(epochs=1000)

====> Epoch: 200 Average training loss: 11.6488
====> Epoch: 200 Average test loss: 13.1494
====> Epoch: 400 Average training loss: 9.4948
====> Epoch: 400 Average test loss: 11.8729
====> Epoch: 600 Average training loss: 8.5165
====> Epoch: 600 Average test loss: 9.9916
====> Epoch: 800 Average training loss: 7.6626
====> Epoch: 800 Average test loss: 9.1584
====> Epoch: 1000 Average training loss: 7.5808
====> Epoch: 1000 Average test loss: 8.8292


Let's see how the created data looks like:

In [18]:
cols_fake = cols.to_list()
cols_fake.remove('class')

In [19]:
df_fake = autoenc_model_fit.predict_df(no_samples=difference_in_class_occurences, scaler=scaler, cols=cols, target_class=target_var, cont_vars=cols_fake)

df_fake.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,3.768765,1.789621,5.954432,3.557696,0.335189,2.816264,1.749215,2.806577,0.251433,1
1,3.601908,1.738391,5.308001,4.684163,0.876896,2.745014,1.343948,3.642051,0.706808,1
2,3.67529,1.79595,5.635921,4.462675,0.721442,2.633552,1.641513,3.716826,0.451245,1
3,3.9763,1.491766,4.638551,5.092431,0.850323,2.709341,0.989822,3.11736,0.951091,1
4,3.609183,1.797875,5.511171,4.203802,0.681211,2.723525,1.467588,3.781805,0.583659,1


Now the data with random noise:

In [20]:
df_fake_with_noise = autoenc_model_fit.predict_with_noise_df(no_samples=difference_in_class_occurences, mu=0, sigma=0.05, scaler=scaler, cols=cols, cont_vars=cols_fake, target_class=target_var, group_var='class')
df_fake.head()

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
0,3.768765,1.789621,5.954432,3.557696,0.335189,2.816264,1.749215,2.806577,0.251433,1
1,3.601908,1.738391,5.308001,4.684163,0.876896,2.745014,1.343948,3.642051,0.706808,1
2,3.67529,1.79595,5.635921,4.462675,0.721442,2.633552,1.641513,3.716826,0.451245,1
3,3.9763,1.491766,4.638551,5.092431,0.850323,2.709341,0.989822,3.11736,0.951091,1
4,3.609183,1.797875,5.511171,4.203802,0.681211,2.723525,1.467588,3.781805,0.583659,1


In [21]:
df_fake.describe().loc[['mean']]

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
mean,3.534483,2.086207,5.379311,3.086207,0.344828,2.362069,1.448276,2.87931,0.413793,1.0


In [22]:
df_fake_with_noise.describe().loc[['mean']]

Unnamed: 0,age,mefalsepause,tumor-size,inv-falsedes,falsede-caps,deg-malig,breast,breast-quad,irradiat,class
mean,3.543073,2.091206,5.415247,3.092878,0.345563,2.365449,1.444211,2.89474,0.409515,1.0


## Train Random Forest

### Create three dataframes: the original, the original appended with fake_data, the original appended with fake data with noise 

In [23]:
train_df_fake = train_df.append(df_fake)
train_df_fake_with_noise = train_df.append(df_fake_with_noise)

### Train model on original data

In [24]:
clf_original=RandomForestClassifier(n_estimators=100, max_features=0.5, min_samples_leaf=5, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf_original.fit(train_df.iloc[:,:9].values,train_df.iloc[:,9].values)

y_pred_original=clf_original.predict(test_df.iloc[:,:9].values)

confusion_matrix(test_df.iloc[:,9], y_pred_original)

array([[55,  4],
       [15,  8]], dtype=int64)

In [28]:
clf_fake=RandomForestClassifier(n_estimators=100, max_features=0.5, min_samples_leaf=5, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf_fake.fit(train_df_fake.iloc[:,:9].values,train_df_fake.iloc[:,9].values)

y_pred_fake=clf_fake.predict(test_df.iloc[:,:9].values)

confusion_matrix(test_df.iloc[:,9], y_pred_fake)

array([[55,  4],
       [14,  9]], dtype=int64)

In [29]:
#Create a Gaussian Classifier
clf_fake_with_noise=RandomForestClassifier(n_estimators=100, max_features=0.5, min_samples_leaf=5, n_jobs=-1)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf_fake_with_noise.fit(train_df_fake_with_noise.iloc[:,:9].values,train_df_fake_with_noise.iloc[:,9].values)

y_pred_fake_with_noise=clf_fake.predict(test_df.iloc[:,:9].values)

confusion_matrix(test_df.iloc[:,9], y_pred_fake_with_noise)

array([[55,  4],
       [14,  9]], dtype=int64)