In [76]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import normalize

### 1
Preprocess and normalize (or standardize) the dataset. Split it into training and testing subsets with 80% of the data in the training dataset.

In [77]:
# Dataset info:

data_info_path = '/Users/maxperozek/ML-CP341/4/breast-cancer-wisconsin.names'

with open(data_info_path, 'r') as f:
    print(f.read())

Citation Request:
   This breast cancer databases was obtained from the University of Wisconsin
   Hospitals, Madison from Dr. William H. Wolberg.  If you publish results
   when using this database, then please include this information in your
   acknowledgements.  Also, please cite one or more of:

   1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

   2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of 
      pattern separation for medical diagnosis applied to breast cytology", 
      Proceedings of the National Academy of Sciences, U.S.A., Volume 87, 
      December 1990, pp 9193-9196.

   3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition 
      via linear programming: Theory and application to medical diagnosis", 
      in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying
      Li, editors, SIAM Publications, Philadelphia 199

In [78]:
data_path = '/Users/maxperozek/ML-CP341/4/breast-cancer-wisconsin.data'

data = np.genfromtxt(data_path, delimiter=',')

In [79]:
data.shape

(699, 11)

In [80]:
def fill_empty_mean(X):
    feat_means = np.nanmean(X, axis=0)
    nan_idx = np.argwhere(np.isnan(X))
    
    for i in range(nan_idx.shape[0]):
        X[nan_idx[i,:]] = feat_means[nan_idx[i,1]]
        
    return X 

In [81]:
def normalize_dataset(X):
    normalized_array = normalize(X, norm="l2")
    return normalized_array

In [82]:
data_X = data[:,1:10]
data_y = data[:,10]

In [83]:
data_X = fill_empty_mean(data_X)

In [84]:
print(data_X.shape)
print(data_y.shape)

(699, 9)
(699,)


In [85]:
data_X = normalize_dataset(data_X)

In [86]:
data_y[np.where(data_y == 2.0)] = 0
data_y[np.where(data_y == 4.0)] = 1

In [87]:
np.unique(data_y)

array([0., 1.])

In [88]:
pct_train = 0.8

train_idx = np.random.choice(np.arange(data_X.shape[0]), int(data_X.shape[0] * pct_train),  replace=False)
test_idx = np.array([i for i in range(data_X.shape[0]) if i not in list(train_idx)])

In [89]:
test_idx.shape[0] + train_idx.shape[0] == data_X.shape[0]

True

In [90]:
train_X = data_X[train_idx, :]
train_y = data_y[train_idx]

test_X = data_X[test_idx, :]
test_y = data_y[test_idx]

In [91]:
print(train_X.shape)
print(train_y.shape)
print(test_X.shape)
print(test_y.shape)

(559, 9)
(559,)
(140, 9)
(140,)


### 2 
Create your ANN model using Keras. You can choose the various parameters of the network, such as number of layers, number of hidden nodes per layer, activation functions, etc. Be prepared to adjust your parameters if you encounter issues when training.

In [92]:
import keras_tuner as kt

In [93]:
def buildModel(X, y):
    model = Sequential()
    model.add(Dense(10, input_dim=9, activation='sigmoid'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [94]:
model = buildModel(train_X, train_y)

### 3
Train your model.

In [95]:
model.fit(train_X, train_y, epochs=500, batch_size=20, verbose=0)

<keras.callbacks.History at 0x7f7bea4271c0>

### 4
Report the test dataset accuracy.

In [96]:
_, accuracy = model.evaluate(test_X, test_y)
print(accuracy)

0.8999999761581421


### 5 
Try at least one other ANN design and see how the final accuracy compares to your first run.

In [118]:
def build_ANN(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int("units", min_value=32, max_value=512, step=8),
                    input_dim=9,
                    activation=hp.Choice("activation", ["relu", "tanh"]),))
    
    for i in range(hp.Int("num_layers", 1, 3)):
        model.add(
            layers.Dense(
                # Tune number of units separately.
                units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                activation=hp.Choice("activation", ["relu", "tanh"]),
            )
        )
    
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=0.25))
        
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [119]:
model = build_ANN(kt.HyperParameters())

In [120]:
model.fit(train_X, train_y, epochs=500, batch_size=20, verbose=0)

<keras.callbacks.History at 0x7f7bebc68fd0>

In [121]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 32)                320       
                                                                 
 dense_5 (Dense)             (None, 32)                1056      
                                                                 
 dense_6 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,409
Trainable params: 1,409
Non-trainable params: 0
_________________________________________________________________


In [122]:
_, accuracy = model.evaluate(test_X, test_y)
print(accuracy)

0.9071428775787354


In [134]:
tuner = kt.RandomSearch(
    hypermodel=build_ANN,
    objective="val_accuracy",
    max_trials=50,
    executions_per_trial=2,
)

INFO:tensorflow:Reloading Oracle from existing project ./untitled_project/oracle.json
INFO:tensorflow:Reloading Tuner from ./untitled_project/tuner0.json


In [135]:
# tuner = kt.BayesianOptimization(
#     hypermodel=build_ANN,
#     objective="val_accuracy",
#     max_trials=50,
#     executions_per_trial=2
# )

In [136]:
tuner.search(train_X, train_y, epochs=2, validation_data=(test_X, test_y))

Trial 50 Complete [00h 00m 01s]
val_accuracy: 0.875

Best val_accuracy So Far: 0.9035714268684387
Total elapsed time: 00h 01m 34s
INFO:tensorflow:Oracle triggered exit


In [137]:
best_hyps = tuner.get_best_hyperparameters(5)

In [142]:
for i in range(len(best_hyps)):
    
    print(best_hyps[i].values)

{'units': 288, 'activation': 'tanh', 'num_layers': 3, 'units_0': 384, 'dropout': False, 'units_1': 352, 'units_2': 352}
{'units': 240, 'activation': 'tanh', 'num_layers': 3, 'units_0': 96, 'dropout': False, 'units_1': 384, 'units_2': 224}
{'units': 240, 'activation': 'tanh', 'num_layers': 2, 'units_0': 256, 'dropout': True, 'units_1': 416, 'units_2': 224}
{'units': 208, 'activation': 'relu', 'num_layers': 3, 'units_0': 288, 'dropout': True, 'units_1': 384, 'units_2': 416}
{'units': 456, 'activation': 'tanh', 'num_layers': 2, 'units_0': 224, 'dropout': True, 'units_1': 480}
