# Simple cancer binary classification

## Imports

In [1]:
import numpy as np

from tensorflow.keras.datasets import imdb

from neuralnetlib.models import Sequential
from neuralnetlib.layers import Input, Dense, Embedding, LSTM, Bidirectional, Attention, GlobalAveragePooling1D
from neuralnetlib.preprocessing import pad_sequences
from neuralnetlib.metrics import accuracy_score
from neuralnetlib.utils import train_test_split

## 1. Loading the dataset

In [2]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

## 2. Preprocessing

In [3]:
max_words = 10000
max_len = 200

x_train = pad_sequences(x_train, max_length=max_len)
x_test = pad_sequences(x_test, max_length=max_len)

# cuz we don't want to overfit on test data
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

print(f'x_train shape: {x_train.shape}')
print(f'x_test shape: {x_test.shape}')

print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

print(f'x_train[0]: {x_train[0]}')
print(f'y_train[0]: {y_train[0]}')

x_train shape: (20000, 200)
x_test shape: (25000, 200)
y_train shape: (20000,)
y_test shape: (25000,)
x_train[0]: [  45  108   10   10   11    4   65 3960    9   11   41  402    2  780
   33    2 6130   11    2    4 2763  844   26    2  224    5  193 3960
   39   44  790  153  154  143   41 2521   56    8   41 2028  559   11
    4   20   44 6383 5284  474  482   13   66   92  104  225    6  404
  524   18 3960   18  111    7  178 3960  451  442   76   99  976    6
 1369   11  263    2  460 8519    2    9 3084   59    9   55 7207    2
    5    2   59   47  775    7 9963   59   47    6   87  393   31   15
 3775   11  129  330   73  103    4   20    9  120 1793    8    2    2
    2 5071 3960   47  247    6 5879  822   74    2   21  146 1688    8
 4909   15   48    2 1999   11    4  217   13  104   59   80 2700   83
   12   43   17 3960 3418   53  976    5 6861   17   59  214  922    2
  460 5603    2  486    5 1557    2   55   73  140 1404    5  851   14
   20   45   24   40  233  334  87

## 3. Model definition

In [4]:
model = Sequential()
model.add(Input(max_len))
model.add(Embedding(max_words, 100, weights_init='xavier'))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(Attention())
model.add(Dense(1, activation='sigmoid'))

## 4. Model compilation

In [5]:
model.compile(optimizer='adam', loss_function='binary_crossentropy')

model.summary()

Sequential(gradient_clip_threshold=5.0, enable_padding=False, padding_size=32, random_state=1733520050429276600)
-------------------------------------------------
Layer 1: Input(input_shape=(200,))
Layer 2: Embedding(input_dim=10000, output_dim=100)
Layer 3: Bidirectional(layer=LSTM(units=32, return_sequences=True, return_state=False, clip_value=5.0, random_state=None))
Layer 4: Attention(use_scale=True, score_mode=dot, return_sequences=False)
Layer 5: Dense(units=1)
Layer 6: Activation(Sigmoid)
-------------------------------------------------
Loss function: BinaryCrossentropy
Optimizer: Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clip_norm=None, clip_value=None)
-------------------------------------------------



## 5. Model training

In [None]:
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_data=(x_test, y_test), metrics=['accuracy'], random_state=42)




## 6. Model evaluation and prediction

In [9]:
loss, _ = model.evaluate(x_val, y_val)
print(f'Loss: {loss}')

predictions = model.predict(x_val)
y_pred = np.where(predictions > 0.5, 1, 0)
accuracy = accuracy_score(y_pred, y_val)
print(f'Accuracy: {accuracy}')

Loss: 2.6114417790014
Accuracy: 0.8712
