# Import libraries needed initially

In [49]:
import numpy as np
import pandas as pd
from random import randint
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf

# Importing Data

Mounting the Google Drive folder

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so the dataset path read below resolves.
drive.mount('/content/drive')

Reading the csv file

In [106]:
# Location of the dataset inside the mounted Drive folder.
csv_path = '/content/drive/MyDrive/Colab Notebooks/breast-cancer-wisconsin1.csv'
# Load the whole CSV into a DataFrame; every later cell works on `df`.
df = pd.read_csv(csv_path)

Displaying the shape of the data

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

Plotting the data

In [None]:
# Draw one histogram per numeric column to get a first look at the distributions.
df.hist()

# Data pre-process

Displaying the first 20 values

In [None]:
# Show the first 20 rows of the raw data.
df.head(20)

Looking for unique values

In [None]:
# Print the distinct values of every column, one array per line, to spot
# placeholder entries before cleaning.
for _column_name, column_values in df.items():
    print(column_values.unique())

Removing the rows with missing data

In [108]:
# Row labels whose 'bare_nuclei' entry is the '?' placeholder.
is_placeholder = df['bare_nuclei'] == '?'
indexNames = df.loc[is_placeholder].index

In [109]:
# Remove the rows collected in `indexNames`, modifying `df` itself.
df.drop(index=indexNames, inplace=True)

In [110]:
# With the '?' rows gone, convert the column to integers.
bare_nuclei_as_int = df['bare_nuclei'].astype(int)
df['bare_nuclei'] = bare_nuclei_as_int

In [None]:
# Display the cleaned frame (rows containing '?' removed, 'bare_nuclei' cast to int).
df

In [None]:
# Re-check the distinct values per column after cleaning; the '?' placeholder
# should no longer appear.
for column_label in df.columns:
    distinct_values = df[column_label].unique()
    print(distinct_values)

Splitting data into samples and labels

In [113]:
# Positional split: the first nine columns are the features, the tenth column
# (position 9) is the target.
samples, labels = df.iloc[:, :9], df.iloc[:, 9]

Encoding the labels

In [114]:
from sklearn.preprocessing import LabelEncoder
# Learn the distinct class values, then replace each label by its integer code.
lb = LabelEncoder()
lb.fit(labels)
labels = lb.transform(labels)

Splitting the data into training and test data

In [165]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train_samples, test_samples, train_labels, test_labels = train_test_split(
    samples,
    labels,
    test_size=0.30,
    random_state=42,
)

Standardizing the features (the scaler is fitted on the training split only)

In [166]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only ...
sc = StandardScaler()
sc.fit(train_samples)
# ... and apply that same transformation to both splits.
train_samples = sc.transform(train_samples)
test_samples = sc.transform(test_samples)

Printing the final version of the data

In [None]:
# Display the standardized training features.
train_samples

In [None]:
# Display the test features, scaled with the statistics learned on the training split.
test_samples

In [None]:
# Display the integer-encoded training labels.
train_labels

In [None]:
# Display the integer-encoded test labels.
test_labels

# Use GPU for computing (optional)

In [None]:
# List the GPUs TensorFlow can see; the list is empty on a CPU-only runtime.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPU", len(physical_devices))

# Turn on on-demand memory allocation for every visible GPU. Iterating (rather
# than indexing element 0) keeps this optional cell from failing with an
# IndexError when no GPU is attached, and applies the same setting to all GPUs.
for gpu_device in physical_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_device, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPU has been initialized
        # (for example when this cell is re-run after a model was trained).
        # Report it instead of aborting, since GPU setup is optional here.
        print(err)

# Plotting functions

In [21]:
def plotacc(history, epochs):
  """Plot the training accuracy per epoch.

  Args:
    history: object whose ``history`` attribute maps ``'accuracy'`` to one
      value per epoch (the notebook passes the result of ``model.fit``).
    epochs: number of epochs; the x axis runs from 1 to ``epochs`` inclusive.
  """
  accuracy_curve = history.history['accuracy']
  epoch_axis = range(1, epochs + 1)

  plt.plot(epoch_axis, accuracy_curve, 'g', label='Training accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.title('Training accuracy')
  plt.legend()
  plt.show()

In [12]:
def plotloss(history, epochs):
  """Plot the training loss per epoch.

  Args:
    history: object whose ``history`` attribute maps ``'loss'`` to one value
      per epoch (the notebook passes the result of ``model.fit``).
    epochs: number of epochs; the x axis runs from 1 to ``epochs`` inclusive.
  """
  loss_curve = history.history['loss']
  epoch_axis = range(1, epochs + 1)

  plt.plot(epoch_axis, loss_curve, 'r', label='Training loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.title('Training loss')
  plt.legend()
  plt.show()

In [13]:
def plotlosswval(history, epochs):
  """Plot training loss and validation loss on the same axes.

  Args:
    history: object whose ``history`` attribute maps ``'loss'`` and
      ``'val_loss'`` to one value per epoch.
    epochs: number of epochs; the x axis runs from 1 to ``epochs`` inclusive.
  """
  recorded = history.history
  epoch_axis = range(1, epochs + 1)

  plt.plot(epoch_axis, recorded['loss'], 'g', label='Training loss')
  plt.plot(epoch_axis, recorded['val_loss'], 'b', label='validation loss')
  plt.xlabel('Epochs')
  plt.ylabel('Loss')
  plt.title('Training and Validation loss')
  plt.legend()
  plt.show()

In [14]:
def plotaccwval(history, epochs):
  """Plot training accuracy and validation accuracy on the same axes.

  Args:
    history: object whose ``history`` attribute maps ``'accuracy'`` and
      ``'val_accuracy'`` to one value per epoch.
    epochs: number of epochs; the x axis runs from 1 to ``epochs`` inclusive.
  """
  recorded = history.history
  epoch_axis = range(1, epochs + 1)

  plt.plot(epoch_axis, recorded['accuracy'], 'g', label='Training accuracy')
  plt.plot(epoch_axis, recorded['val_accuracy'], 'b', label='validation accuracy')
  plt.xlabel('Epochs')
  plt.ylabel('Accuracy')
  plt.title('Training and Validation accuracy')
  plt.legend()
  plt.show()

# Keras Sequential Model

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

Defining the model

In [285]:
# Feed-forward binary classifier built layer by layer: two ReLU hidden layers
# (32 and 16 units), each followed by 10% dropout, and a single sigmoid output.
model = keras.Sequential()
model.add(layers.Dense(32, kernel_initializer='HeUniform', activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(16, kernel_initializer='HeUniform', activation='relu'))
model.add(layers.Dropout(0.1))
model.add(layers.Dense(1, kernel_initializer='HeUniform', activation='sigmoid'))

Compile the model

In [286]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

Choosing the number of epochs

In [287]:
# Number of training epochs; passed to model.fit and to the plotting helpers below.
epochs = 200

Evaluating the model with basic validation

In [None]:
# Train on the training split and record loss/accuracy after every epoch.
# NOTE(review): the held-out test split is also passed as validation data, so
# the validation curves are computed on the test set itself.
history = model.fit(
    x=train_samples,
    y=train_labels,
    validation_data=(test_samples, test_labels),
    epochs=epochs,
    batch_size=256,
    shuffle=True,
    verbose=2,
)

Plotting results

In [None]:
# Training vs. validation accuracy over the epochs of the run above.
plotaccwval(history, epochs)

In [None]:
# Training vs. validation loss over the epochs of the run above.
plotlosswval(history, epochs)

# Defining K-Fold cross validator, metrics and data

In [118]:
from sklearn.model_selection import KFold
# 4-fold splitter: rows are shuffled before being divided into folds, and the
# fixed seed keeps the fold assignment repeatable.
kfold = KFold(
    n_splits=4,
    shuffle=True,
    random_state=42,
)

# Containers for the overall accuracy and loss. The original bound both names
# to the `np.array` function itself; they are now real (empty) arrays.
over_acc = np.array([])   # overall accuracy values
over_loss = np.array([])  # overall loss values

Because the k-fold validator splits the data into training and test sets by itself, the samples are scaled on their own before the cross-validation loop starts

In [183]:
# Take the first nine columns of the cleaned frame as the feature matrix.
feature_frame = df.iloc[:, 0:9]
# Start from a fresh scaler, then fit it and scale the features in one step.
# NOTE(review): the statistics are computed over every row, including rows that
# later end up in the held-out folds.
sc = StandardScaler()
samples = sc.fit_transform(feature_frame)

# K-Fold cross validation

In [None]:
# Manual fold counter. It is kept as a plain variable (not enumerate) because
# later cells build their x axis from its final value, range(1, fold_no).
fold_no = 1
acc_per_fold = []   # test accuracy (in percent) of every fold
loss_per_fold = []  # test loss of every fold

# The number of epochs is identical for every fold, so it is set once up front.
epochs = 200

for train, test in kfold.split(samples, labels):

  # Standardize inside the fold: the scaler is fitted on this fold's training
  # rows only and then applied to the held-out rows, so no statistics of the
  # held-out fold leak into training. Standardizing is unaffected by an earlier
  # per-feature scaling of `samples`, so this also works on pre-scaled data.
  fold_scaler = StandardScaler()
  fold_train_samples = fold_scaler.fit_transform(samples[train])
  fold_test_samples = fold_scaler.transform(samples[test])

  # A fresh model per fold, so no weights carry over from the previous fold.
  model = keras.Sequential(
    [
        layers.Dense(32, kernel_initializer='HeUniform', activation='relu', input_dim=9),
        layers.Dropout(0.1),
        layers.Dense(16, kernel_initializer='HeUniform', activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1, kernel_initializer='HeUniform', activation='sigmoid')
    ]
  )

  # Compiling the model
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # Printing the fold number
  print('\n-------------------------------------------------------------------------------------------------')
  print(f'Training for fold {fold_no}')

  # Fitting the model on this fold's training rows
  history = model.fit(x=fold_train_samples, y=labels[train], batch_size=256, epochs=epochs, shuffle=True, verbose=0)

  # Evaluating on the held-out rows; scores[0] is the loss, scores[1] the accuracy
  scores = model.evaluate(x=fold_test_samples, y=labels[test], verbose=2)
  print('\n')
  print(f'Prediction score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1]*100)
  loss_per_fold.append(scores[0])

  # Plotting the training curves of this fold
  plotacc(history, epochs)
  plotloss(history, epochs)

  # Incrementing the fold number
  fold_no = fold_no + 1

# Results of the K-fold validation model

Printing the accuracy for each fold

In [None]:
# One line per fold: 1-based fold number and its accuracy to four decimals.
print("Accuracy for each fold : ")
for fold_index, fold_accuracy in enumerate(acc_per_fold, start=1):
    print('Fold ', fold_index, 'is ', "%.4f" % fold_accuracy)

Accuracy for each fold : 
Fold  1 is  94.7368
Fold  2 is  98.2456
Fold  3 is  98.2456
Fold  4 is  98.2353


Printing the loss for each fold

In [None]:
# One line per fold: 1-based fold number and its loss to four decimals.
print("Loss for each fold : ")
for fold_index, fold_loss in enumerate(loss_per_fold, start=1):
    print('Fold ', fold_index, 'is ', "%.4f" % fold_loss)

Loss for each fold : 
Fold  1 is  0.1299
Fold  2 is  0.0703
Fold  3 is  0.0601
Fold  4 is  0.0512


Plotting the accuracy per fold

In [None]:
# One x position per completed fold; the cross-validation loop leaves `fold_no`
# one past the last fold, so range(1, fold_no) gives 1..number of folds.
folds = range(1, fold_no)

plt.plot(folds, acc_per_fold, 'g', label='Accuracy')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
plt.title('Accuracy per fold')
plt.legend()
plt.show()

Plotting the loss per fold

In [None]:
# One x position per completed fold; the cross-validation loop leaves `fold_no`
# one past the last fold, so range(1, fold_no) gives 1..number of folds.
folds = range(1, fold_no)

plt.plot(folds, loss_per_fold, 'r', label='Loss')
plt.xlabel('Fold')
plt.ylabel('Loss')
plt.title('Loss per fold')
plt.legend()
plt.show()