In [1]:
import sklearn
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,f_classif
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import ( Dense, Dropout,Input)
from tensorflow.keras.models import load_model
import warnings
warnings.filterwarnings("ignore")

In [2]:
#%% convert ',' to '.' in the csv files
# Rewrites both CSV files in place, replacing every ',' with '.'.
# The files are later parsed with ';' as the field separator, so the commas
# are presumably decimal marks (TODO confirm). Note that *every* comma is
# replaced, including any that occur inside text fields.
# Re-running the cell is harmless: once no commas remain, replace() is a no-op.
for csv_path in ("training.csv", "validation.csv"):
    # `with` guarantees the handle is closed even if the read/write raises.
    with open(csv_path, "r") as f:
        string = f.read()

    string = string.replace(',', '.')

    with open(csv_path, "w") as f:
        f.write(string)


In [3]:
#%%reading data
# Load the two ';'-separated CSV files. The separator is passed by keyword:
# pandas >= 2.0 accepts only the path positionally, so `read_csv(path, ";")`
# raises a TypeError there.
train = pd.read_csv('training.csv', sep=";")
val = pd.read_csv('validation.csv', sep=";")

# Split each frame into features (everything except "classLabel") and target.
# `drop` returns a new frame, so `train` / `val` themselves stay untouched.
X_train = train.drop(columns=["classLabel"])
y_train = train["classLabel"]
X_val = val.drop(columns=["classLabel"])
y_val = val["classLabel"]

# Order the feature columns alphabetically so both frames use the same
# column order whenever they contain the same column names.
X_train = X_train.reindex(sorted(X_train.columns), axis=1)
X_val = X_val.reindex(sorted(X_val.columns), axis=1)


In [4]:
#%%clean the data
# Impute missing values. Both imputers are fitted on the training features
# only and then applied to training and validation, so no validation
# statistics leak into the preprocessing.

num = X_train._get_numeric_data().columns  # numeric column names
# Remaining (categorical) column names, kept as an ordered list: a `set` has
# no stable order and is rejected as a column indexer by pandas >= 2.0.
cat = [col for col in X_train.columns if col not in num]

# Numeric columns: replace NaN with the training-column mean.
SIn = SimpleImputer(missing_values=np.nan, strategy='mean')
SIn.fit(X_train[num])
# Categorical columns: replace NaN with the most frequent training value.
SIc = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
SIc.fit(X_train[cat])

# Write the imputed blocks back with plain column assignment. `.at` is the
# scalar accessor (one row label, one column label) and must not be used
# with a slice / list of columns.
X_train[num] = SIn.transform(X_train[num])
X_val[num] = SIn.transform(X_val[num])
X_train[cat] = SIc.transform(X_train[cat])
X_val[cat] = SIc.transform(X_val[cat])


In [5]:
#%%normalization
# Standardise the numeric columns (zero mean, unit variance). The scaler is
# fitted on the training data only and the same transform is then applied to
# the validation data.
SS = StandardScaler(copy=True, with_mean=True, with_std=True)
SS.fit(X_train[num])

# Plain column assignment replaces the whole block of numeric columns.
# `.at` is the scalar accessor and must not be used with a slice / list of
# columns.
X_train[num] = SS.transform(X_train[num])
X_val[num] = SS.transform(X_val[num])

In [6]:
#%%categorical encoding
# One-hot encode the categorical columns of both feature frames.
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)

# Dummy columns that exist for training but not for validation (categories
# never seen in the validation file).
missed_features = set(X_train.columns) - set(X_val.columns)

# Sort the training columns alphabetically, then force the validation frame
# onto exactly the same columns in the same order:
#   * columns missing from validation are created and filled with 0;
#   * columns that exist only in validation are dropped - the selector and
#     the model are fitted on the training columns, so an extra validation
#     column would make the two matrices incompatible.
X_train = X_train.reindex(sorted(X_train.columns), axis=1)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# Binary targets: 1 when the label contains 'yes', otherwise 0.
y_train = [1 if 'yes' in label else 0 for label in y_train]
y_val = [1 if 'yes' in label else 0 for label in y_val]

# Refresh the numeric / non-numeric column bookkeeping after encoding.
num = X_train._get_numeric_data().columns
cat = set(X_train.columns) - set(num)


In [7]:
#%%feature selection
# Keep the k features with the highest ANOVA F-score (f_classif). The
# selector is fitted on the training data and applied to both frames.
k = 45
SP = SelectKBest(k=k, score_func=f_classif)
SP.fit(X_train, y_train)

# Boolean mask of the retained columns, and their names.
f = SP.get_support()
columns = X_train.columns[f]

# Reduce both frames to the selected features, restoring the column names
# (transform() is wrapped in a fresh DataFrame labelled with `columns`).
X_train, X_val = [
    pd.DataFrame(SP.transform(frame), columns=columns)
    for frame in (X_train, X_val)
]

# Sort the remaining columns alphabetically in both frames.
X_train, X_val = [
    frame.reindex(sorted(frame.columns), axis=1)
    for frame in (X_train, X_val)
]
# X_train.describe()

In [8]:
#%%model


class myCallback(tf.keras.callbacks.Callback):
    """Save the model on each new best validation accuracy; stop when stalled.

    After every epoch the reported ``val_accuracy`` is compared with the best
    value seen so far. On a strict improvement the whole model is saved to
    ``filepath`` and the stall counter is reset. Otherwise the counter is
    incremented until it reaches ``patience``; on the next epoch without
    improvement training is stopped.

    Args:
        patience: Upper bound of the stall counter (default 50, the value
            that was previously hard-coded).
        filepath: Target passed to ``model.save`` (default ``'cp.ckpt'``,
            the value that was previously hard-coded).
    """

    best_val_acc = 0  # best val_accuracy reached while training
    i = 0  # number of consecutive epochs without improvement

    def __init__(self, patience=50, filepath='cp.ckpt'):
        super().__init__()
        self.patience = patience
        self.filepath = filepath

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint on improvement, otherwise advance the stall counter.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metric dictionary supplied by Keras; must contain
                ``'val_accuracy'``.

        Raises:
            TypeError: If ``'val_accuracy'`` is absent from ``logs``. The
                unguarded comparison ``None > number`` raised the same
                exception type; the message now names the missing metric.
        """
        # `logs=None` instead of a shared mutable `{}` default.
        val_acc = (logs or {}).get('val_accuracy')
        if val_acc is None:
            raise TypeError(
                "myCallback needs 'val_accuracy' in logs: call fit() with "
                "validation_data and compile with metrics=['accuracy']"
            )

        if val_acc > self.best_val_acc:  # save the model if val_acc increased
            self.best_val_acc = val_acc
            self.model.save(self.filepath)
            print(f"model_saved at epoch:{epoch} with accuracy:{self.best_val_acc}")
            self.i = 0  # restart the counter
        elif self.i < self.patience:
            self.i = self.i + 1
        else:
            # Counter already reached `patience`: stop training.
            self.model.stop_training = True
        
callbacks = myCallback()

# Seed NumPy and TensorFlow so that weight initialisation, dropout masks and
# shuffling are repeatable between runs (as far as the TF backend allows).
np.random.seed(42)
tf.random.set_seed(42)

# Convert the frames / label lists to arrays once and reuse them below.
X_train_arr, y_train_arr = np.array(X_train), np.array(y_train)
X_val_arr, y_val_arr = np.array(X_val), np.array(y_val)

# `shape` must be a tuple: `(n)` is just the int n, `(n,)` is a 1-tuple.
input_layer = Input(shape=(X_train.shape[1],))

# Seven tanh Dense layers (256 -> 4 units), each followed by Dropout whose
# rate goes from 0.9 down to 0.3, then a single sigmoid output unit.
x = input_layer
for units, drop_rate in [(256, 0.9), (128, 0.8), (64, 0.7), (32, 0.6),
                         (16, 0.5), (8, 0.4), (4, 0.3)]:
    x = Dense(units, activation='tanh')(x)
    x = Dropout(drop_rate)(x)
x = Dense(1, activation='sigmoid')(x)


model = Model(inputs=input_layer, outputs=x)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# model.summary()


print('start')
# Up to 1000 epochs; the callback saves the best model to 'cp.ckpt' and
# stops training early once validation accuracy stops improving.
model.fit(X_train_arr, y_train_arr, epochs=1000,
          validation_data=(X_val_arr, y_val_arr),
          callbacks=[callbacks], verbose=0)


# Evaluate the best saved model rather than the weights of the last epoch.
model = load_model('cp.ckpt')
print(model.evaluate(X_train_arr, y_train_arr))  # original author's unseeded run: acc=91.7%
print(model.evaluate(X_val_arr, y_val_arr))  # original author's unseeded run: acc=86%

start
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:0 with accuracy:0.4650000035762787
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:20 with accuracy:0.4699999988079071
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:21 with accuracy:0.5049999952316284
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:22 with accuracy:0.6850000023841858
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:23 with accuracy:0.699999988079071
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:24 with accuracy:0.7900000214576721
INFO:tensorflow:Assets written to: cp.ckpt/assets
model_saved at epoch:25 with accuracy:0.8100000023841858
INFO:tensorfl