# Simple IMDB sentiment binary classification

## Imports

In [1]:
import numpy as np

from tensorflow.keras.datasets import imdb

from neuralnetlib.models import Sequential
from neuralnetlib.layers import Input, Dense, Embedding, LSTM, Bidirectional, Attention, GlobalAveragePooling1D
from neuralnetlib.preprocessing import pad_sequences
from neuralnetlib.metrics import accuracy_score
from neuralnetlib.utils import train_test_split

## 1. Loading the dataset

In [2]:
# Load the IMDB reviews already encoded as sequences of word indices.
# num_words=10000 keeps only the 10,000 most frequent words (per the Keras
# docs); keep this value in sync with the vocabulary size given to the
# Embedding layer.
train_split, test_split = imdb.load_data(num_words=10000)
x_train, y_train = train_split
x_test, y_test = test_split

## 2. Preprocessing

In [3]:
# Vocabulary size (consumed by the Embedding layer) and fixed sequence length.
max_words, max_len = 10000, 200

# Bring every review to max_len tokens so each split becomes a 2-D array
# (see the printed shapes below).
# NOTE: this cell overwrites x_train / y_train in place, so re-running it
# without reloading the data would re-split the already-split training set.
x_train, x_test = (
    pad_sequences(sequences, max_length=max_len)
    for sequences in (x_train, x_test)
)

# Hold out 20% of the training data as a validation split, so the test set
# does not have to be used for monitoring during training.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the resulting shapes.
print(f'x_train shape: {x_train.shape}')
print(f'x_test shape: {x_test.shape}')

print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

# Peek at one encoded review and its label.
print(f'x_train[0]: {x_train[0]}')
print(f'y_train[0]: {y_train[0]}')

x_train shape: (20000, 200)
x_test shape: (25000, 200)
y_train shape: (20000,)
y_test shape: (25000,)
x_train[0]: [4.500e+01 1.080e+02 1.000e+01 1.000e+01 1.100e+01 4.000e+00 6.500e+01
 3.960e+03 9.000e+00 1.100e+01 4.100e+01 4.020e+02 2.000e+00 7.800e+02
 3.300e+01 2.000e+00 6.130e+03 1.100e+01 2.000e+00 4.000e+00 2.763e+03
 8.440e+02 2.600e+01 2.000e+00 2.240e+02 5.000e+00 1.930e+02 3.960e+03
 3.900e+01 4.400e+01 7.900e+02 1.530e+02 1.540e+02 1.430e+02 4.100e+01
 2.521e+03 5.600e+01 8.000e+00 4.100e+01 2.028e+03 5.590e+02 1.100e+01
 4.000e+00 2.000e+01 4.400e+01 6.383e+03 5.284e+03 4.740e+02 4.820e+02
 1.300e+01 6.600e+01 9.200e+01 1.040e+02 2.250e+02 6.000e+00 4.040e+02
 5.240e+02 1.800e+01 3.960e+03 1.800e+01 1.110e+02 7.000e+00 1.780e+02
 3.960e+03 4.510e+02 4.420e+02 7.600e+01 9.900e+01 9.760e+02 6.000e+00
 1.369e+03 1.100e+01 2.630e+02 2.000e+00 4.600e+02 8.519e+03 2.000e+00
 9.000e+00 3.084e+03 5.900e+01 9.000e+00 5.500e+01 7.207e+03 2.000e+00
 5.000e+00 2.000e+00 5.900e+01 4.7

## 3. Model definition

In [4]:
# Layer stack, listed in the order the data flows through it.
model = Sequential()
for layer in (
    # One padded review of max_len token ids per sample.
    Input(max_len),
    # Look up a 100-dimensional vector for each of the max_words token ids.
    Embedding(max_words, 100, weights_init='xavier'),
    # Read the sequence in both directions, keeping the per-timestep outputs
    # so the attention layer has a full sequence to attend over.
    Bidirectional(LSTM(32, return_sequences=True)),
    # Attention over the LSTM outputs (the summary reports
    # return_sequences=False, i.e. presumably one vector per review).
    Attention(),
    # Single sigmoid unit: probability of the positive class.
    Dense(1, activation='sigmoid'),
):
    model.add(layer)

## 4. Model compilation

In [5]:
# Binary cross-entropy pairs with the single sigmoid output unit; Adam is
# used with the library's default hyper-parameters (listed in the summary).
model.compile(
    loss_function='binary_crossentropy',
    optimizer='adam',
)

# Print the layer stack, loss and optimizer for a quick visual check.
model.summary()

Sequential(temperature=1.0, gradient_clip_threshold=5.0, enable_padding=False, padding_size=32, random_state=1731611806261338000)
-------------------------------------------------
Layer 1: Input(input_shape=(200,))
Layer 2: Embedding(input_dim=10000, output_dim=100)
Layer 3: Bidirectional(layer=LSTM(units=32, return_sequences=True, return_state=False, random_state=None, clip_value=5.0))
Layer 4: Attention(use_scale=True, score_mode=dot, return_sequences=False)
Layer 5: Dense(units=1)
Layer 6: Activation(Sigmoid)
-------------------------------------------------
Loss function: BinaryCrossentropy
Optimizer: Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clip_norm=None, clip_value=None)
-------------------------------------------------


## 5. Model training

In [6]:
# Monitor training on the validation split (x_val, y_val) that the
# preprocessing cell carved out of the training data. Validating on
# (x_test, y_test) here would expose the test set during training, which is
# exactly what that hold-out split exists to avoid.
model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
    metrics=['accuracy'],
    random_state=42,
)





## 6. Model evaluation and prediction

In [7]:
# Score the trained model on the validation split (x_val, y_val).
# NOTE(review): the recorded run reports a loss of about 11.78 next to an
# accuracy of 0.831, which is unusually high for binary cross-entropy --
# possibly a shape mismatch between the 1-D labels and the model output
# inside evaluate(); TODO confirm.
loss, _ = model.evaluate(x_val, y_val)
print(f'Loss: {loss}')

# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold.
predictions = model.predict(x_val)
y_pred = (predictions > 0.5).astype(int)

# accuracy_score is called as (predicted, true), as in the original cell.
accuracy = accuracy_score(y_pred, y_val)
print(f'Accuracy: {accuracy}')

Loss: 11.780504039134605
Accuracy: 0.831
