In [1]:
# Imports

# General
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer

# Keras
import keras
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.layers import Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, SGD, Adadelta
from keras.regularizers import L1L2, L2

# SMOTE
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTENC

In [3]:
# Read the training table from disk and report its (rows, columns) shape
df = pd.read_csv('./train_data.csv')
print(df.shape)

# Recode boolean values as integers (False -> 0, True -> 1)
df = df.replace({False: 0, True: 1})

# Keep a numeric-only selection of the frame, then separate the label
# column ('target') from the feature columns
num_train = df.select_dtypes(include=np.number)
train_y = df['target']
train_X = df.drop(columns=['target'])

# Remember the feature names, then mark missing entries with the sentinel -1
train_X_columns = train_X.columns
train_X = train_X.fillna(-1)

# Custom z-score method because scipy caused NaN problems in some columns
def z_score(arr):
    """Standardize `arr` along axis 0: subtract the mean, divide by the std.

    Zero standard deviations (constant columns) are replaced by 1 before the
    division, so those columns come out as all zeros instead of NaN/inf.

    Args:
        arr: Numeric array. For 2-D input each column is standardized
            independently; for 1-D input the whole vector is standardized.

    Returns:
        The standardized values, same shape as `arr`. The population standard
        deviation is used (np.std default, ddof=0).
    """
    mean = np.mean(arr, axis = 0)
    std = np.std(arr, axis = 0)
    # np.where covers both the per-column array (2-D input) and the scalar
    # (1-D input) case; masked item assignment fails on a NumPy scalar.
    std = np.where(std == 0, 1, std)
    return (arr - mean) / std


def add_gauss_noise(arr, var = 0.1, mu = 0, rng = None):
    """Return `arr` plus independent Gaussian noise on every element.

    Args:
        arr: Numeric array-like of any shape (1-D, 2-D, ...).
        var: Variance of the noise; the standard deviation used for sampling
            is ``var ** 0.5``. Must be non-negative.
        mu: Mean of the noise.
        rng: Optional random source exposing ``normal(loc, scale, size)``,
            e.g. ``np.random.default_rng(seed)``, for reproducible noise.
            When None, NumPy's global ``np.random`` state is used, so
            ``np.random.seed`` keeps controlling the output as before.

    Returns:
        A new array ``arr + noise`` with the same shape as `arr`; the input
        is not modified.

    Raises:
        ValueError: If `var` is negative.
    """
    if var < 0:
        raise ValueError("var must be non-negative, got {}".format(var))
    sigma = var ** 0.5
    sampler = np.random if rng is None else rng
    # Sample directly in the input's shape so inputs of any rank are accepted
    gauss = sampler.normal(mu, sigma, np.shape(arr))
    return arr + gauss


# Standardize every feature column with the custom z-score, working on a
# float64 matrix extracted from the feature frame
train_X = z_score(train_X.to_numpy(dtype=np.float64))
# DataFrame wrapper around the standardized matrix (default integer labels)
train_df = pd.DataFrame(data=train_X)

(1593, 2651)


In [5]:
# Noisy copy of the standardized features (default noise: var=0.1, mu=0),
# displayed ahead of the unperturbed matrix for side-by-side comparison
train_var_small = add_gauss_noise(train_X)
display(train_var_small, train_X)


array([[-1.68970364e+00, -2.02631022e+00,  1.57732746e+00, ...,
        -8.72301530e-02,  1.36950778e+00, -4.46717192e-01],
       [-1.55671590e+00, -4.55929064e-01,  1.29160594e+00, ...,
         2.55400966e+00,  1.20845579e+00, -1.43960528e-01],
       [-1.89665517e+00, -5.10209010e-02,  5.47264916e-01, ...,
        -7.59881362e-01,  1.23738075e+00,  6.05179357e-02],
       ...,
       [ 1.98056497e+00, -7.06639014e-01,  1.52785752e+00, ...,
        -6.32700040e-01,  1.22049083e+00, -5.64888352e-01],
       [ 1.70563724e+00, -1.95781635e+00,  1.56265578e-01, ...,
        -4.92090825e+00,  6.20935594e-04, -3.02502800e-01],
       [ 2.01379528e+00, -1.27265698e+00,  1.02410476e+00, ...,
        -7.03241496e-01,  1.98124807e+00, -8.03031747e-01]])

array([[-1.74945491, -1.68096043,  1.19733109, ..., -0.43697435,
         1.24677947, -0.33181567],
       [-1.74945491, -0.86619951,  1.19733109, ...,  1.81491955,
         1.24677947, -0.44238454],
       [-1.73403852, -0.63341067,  0.88485592, ..., -0.43697435,
         1.24677947, -0.60034006],
       ...,
       [ 1.9301415 , -1.21538276,  1.50980626, ..., -0.43697435,
         1.24677947, -0.66352227],
       [ 1.93106856, -1.79735484, -0.05256958, ..., -4.94076215,
         0.29784507, -0.30022457],
       [ 1.93842739, -1.44817159,  1.19733109, ..., -0.43697435,
         1.24677947, -0.33181567]])

In [None]:
def create_model(input_dim):
    """Build and compile a small fully connected softmax classifier.

    Architecture: Dense(128, relu) -> BatchNormalization -> Dense(64, relu)
    -> BatchNormalization -> Dense(3, softmax). No Dropout layer is used.

    Args:
        input_dim: Input dimension passed to the first Dense layer.

    Returns:
        The Sequential model, compiled with categorical cross-entropy loss,
        an Adam optimizer with default settings, and accuracy as the metric.

    Note:
        `initializer` and `regularizer` are not parameters; they are looked
        up in the enclosing scope when this function is called.
        NOTE(review): their definitions are not visible in this part of the
        notebook -- confirm they are defined before the first call.
    """
    model = Sequential()

    # Two hidden relu layers, each followed by batch normalization; only the
    # second hidden layer carries the kernel regularizer. The final layer
    # is a 3-unit softmax.
    stack = (
        Dense(128, input_dim=input_dim, activation='relu',
              kernel_initializer=initializer),
        BatchNormalization(),
        Dense(64, activation='relu', kernel_initializer=initializer,
              kernel_regularizer=regularizer),
        BatchNormalization(),
        Dense(3, activation='softmax'),
    )
    for layer in stack:
        model.add(layer)

    # Adam optimizer with its default hyperparameters
    model.compile(loss='categorical_crossentropy', optimizer=Adam(),
                  metrics=['accuracy'])
    return model

