In [1]:
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import SGD, Adam
import numpy as np
from keras import backend as K
import pandas as pd
from sklearn.utils import class_weight

Using TensorFlow backend.


# Preparing data

In [2]:
data = pd.read_csv("/weights/train_set2.csv", delimiter = "\t")

In [3]:
print (data.shape)

(1077826, 1191)


In [4]:
number_of_features = len(data.values[0])-1

In [5]:
input_features = data.values[:,0:number_of_features]
labels = data.values[:,number_of_features] #last column consists of labels

In [6]:
size_of_dataset = len(labels) #number of examples in training set

In [7]:
print (size_of_dataset)

1077826


Now let's split data into trainig and validation set (99%/1%)

In [None]:
np.random.seed(999) #seed fixed for reproducibility
mask = np.random.rand(size_of_dataset) < 0.99  #array of boolean variables

training_set = input_features[mask]
training_labels = labels[mask]

validation_set = input_features[~mask]
validation_labels = labels[~mask]

# Neural network model

In [8]:
inputs = Input(shape=(number_of_features,))

output_from_1st_layer = Dense(500)(inputs)
output_from_1st_layer = BatchNormalization()(output_from_1st_layer)
output_from_1st_layer = Activation('relu')(output_from_1st_layer)
output_from_1st_layer = Dropout(0.2)(output_from_1st_layer)

output_from_2nd_layer = Dense(200)(output_from_1st_layer)
output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)
output_from_2nd_layer = Activation('relu')(output_from_2nd_layer)
output_from_2nd_layer = Dropout(0.2)(output_from_2nd_layer)


output_from_3rd_layer = Dense(100)(output_from_2nd_layer)
output_from_3rd_layer = BatchNormalization()(output_from_3rd_layer)
output_from_3rd_layer = Activation('relu')(output_from_3rd_layer)
output_from_3rd_layer = Dropout(0.2)(output_from_3rd_layer)

output = Dense(1, activation='sigmoid')(output_from_3rd_layer)

model = Model(inputs, output)

Model compilation with Adam optimizer

In [9]:
model.compile(optimizer='Adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Now, let's train the model

In [None]:
model.summary()

In [10]:
class_weight = class_weight.compute_class_weight('balanced', np.unique(labels), labels)

In [11]:
print (class_weight)

[ 0.62377511  2.51979221]


In [40]:
model.fit(input_features, labels, batch_size=128, epochs=1, class_weight=class_weight)

Epoch 1/1


<keras.callbacks.History at 0x7f72e3ffc8d0>

Saving weights

In [104]:
model.save_weights("model_with_singletons.h5")

# Evaluation

In [118]:
test_data = pd.read_csv("/weights/test_set.csv", delimiter = "\t")

In [119]:
test_set = test_data.values[:,0:number_of_features]
test_labels = test_data.values[:,number_of_features] #last column consists of labels

In [125]:
mask = test_labels < 1
negative_examples = test_set[mask]
negative_labels = test_labels[mask]

mask_2 = np.random.rand(len(negative_labels)) < 0.2  #array of boolean variables
negative_examples = negative_examples[mask_2]
negative_labels = negative_labels[mask_2]

positive_examples = test_set[~mask]
positive_labels = test_labels[~mask]

print (len(negative_examples))
print (len(positive_examples))

20253
20424


In [126]:
test_set = np.concatenate((positive_examples, negative_examples), axis=0)
test_labels = np.concatenate((positive_labels, negative_labels), axis=0)

In [127]:
pred = model.predict(test_set, verbose=1)

threshold = 0.20
true_positives = 0.0
false_negatives = 0.0
false_positives = 0.0
true_negatives = 0.0
total = len(pred) 

for i in range(0,len(pred)):
    if (test_labels[i]==1 and pred[i]>threshold):
        true_positives+=1
    if (test_labels[i]==1 and pred[i]<=threshold):
        false_negatives+=1
    if (test_labels[i]==0 and pred[i]>threshold):
        false_positives+=1
    if (test_labels[i]==0 and pred[i]<=threshold):
        true_negatives+=1     
        
print('Precision: %0.3f%%' % (true_positives/(true_positives+false_positives))) 
print('Recall: %0.3f%%' % (true_positives/(true_positives+false_negatives)))
print('Accuracy: %0.3f%%' % ((true_positives+true_negatives)/(total)))

Recall: 0.851%
Accuracy: 0.867%


Let's load weights

In [None]:
model.load_weights("model_with_singletons.h5")

and generate prediction for a single example

In [131]:
single_example = test_set[4:5,:] #example number 5 from the test set
prediction = model.predict(single_example)
print ('%.8f' % prediction[0])

0.98887503
