## Building a Feed-Forward Neural Network

Also, one thing we may consider is divide the attributes into small sets to test which ones are best fit attributes

In [60]:
import pandas as pd
import numpy as np

dataset = pd.read_csv("Cancer_Data.csv")

print("Dataset :")
print(dataset.head())
print("Diagnosis : ")
print(dataset['diagnosis'].unique())

print("Dimensions of the dataset : ", dataset.shape)
print("Features of the dataset :")
print(dataset.describe(include = 'all'))
 

Dataset :
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimeter_worst

In [61]:
dataset['diagnosis']

0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: diagnosis, Length: 569, dtype: object

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

X = dataset.drop('diagnosis', axis = 1).drop('id', axis = 1)
y = dataset['diagnosis']

print(X)
print(y)
# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

set_of_classes = y.value_counts().index.tolist()
set_of_classes= pd.DataFrame({'diagnosis': set_of_classes})
y = pd.get_dummies(y)

print("Pre-processed data :")
print(X)

print("Pre-processed class :")
print(y)

#splitting data into ratio 70:30
data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.3)

# Number of nodes in each hidden layer should be (10, 2)
# Learning rate should be 0.4
# Number of epochs should be 600
mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (10, 2), max_iter = 600)
mlp

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  sym

In [63]:
y = dataset['diagnosis']
y = pd.get_dummies(y)

In [64]:
mlp.fit(data_train, class_train)

pred = mlp.predict(data_test)
pred
#prediction on the test data. species are represented using the hot-keys

array([[1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [1, 0],
       [0,

In [65]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

print(pred[:5])
print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

Accuracy :  0.9941520467836257
Mean Square Error :  0.005847953216374269
[[1 0]
 [1 0]
 [1 0]
 [1 0]
 [1 0]]
Confusion Matrix for each label : 
[[[ 62   0]
  [  1 108]]

 [[108   1]
  [  0  62]]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       109
           1       0.98      1.00      0.99        62

   micro avg       0.99      0.99      0.99       171
   macro avg       0.99      1.00      0.99       171
weighted avg       0.99      0.99      0.99       171
 samples avg       0.99      0.99      0.99       171



## We use k-fold Cross Validation to validate the FFNN model

https://machinelearningmastery.com/how-to-configure-k-fold-cross-validation/#:~:text=The%20key%20configuration%20parameter%20for,evaluate%20models%20is%20k%3D10.

They say k = 10 is the most popular


In [66]:
# Using sklearn function cross_validate()

from sklearn.model_selection import cross_validate

CV = cross_validate(mlp, X, y, cv=8, scoring=['accuracy', 'neg_mean_squared_error'])
print('Accuracy')
print(CV['test_accuracy'])
print('MSE')
print(-1*CV['test_neg_mean_squared_error'])

Accuracy
[0.95833333 0.97183099 0.97183099 0.98591549 0.97183099 0.98591549
 1.         0.97183099]
MSE
[0.04166667 0.02816901 0.02816901 0.01408451 0.02816901 0.01408451
 0.         0.02816901]


In [67]:
print('Average Accuracy = ', sum(CV['test_accuracy']) / len(CV['test_accuracy']))
print('Average MSE = ', sum(-1 * CV['test_neg_mean_squared_error']) / len(CV['test_neg_mean_squared_error']))

Average Accuracy =  0.9771860328638499
Average MSE =  0.022813967136150234


In [68]:
# To find list of accuracy and MSE values
# Without using the sklearn function cross_validate()

from sklearn.model_selection import KFold

n_splits=10
# step 1: randomize the dataset and create k equal size partitions
kf = KFold(n_splits=n_splits)

acc = 0
mse = 0

i = 0 #keep track of batch number
# step 5: iterate k times with a different testing subset
for train_indices, test_indices in kf.split(X):

    # step 2-3: use k-1/k^th partition for the training/testing model
    start_train, stop_train = train_indices[0], train_indices[-1]+1
    start_test, stop_test = test_indices[0], test_indices[-1]+1
    
    # perform the training similar to Q1
    #this was based on the requirements in Q1
    mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (10, 2), max_iter = 600)
    mlp.fit(X[start_train:stop_train], y[start_train:stop_train])
    pred = mlp.predict(X[start_test:stop_test])
    
    # step 4: record the evaluating scores
    i+=1
    acc += accuracy_score(y[start_test:stop_test], pred)
    mse += mean_squared_error(y[start_test:stop_test], pred)
    
    print("\nAccuracy for batch ", i, " : ", accuracy_score(y[start_test:stop_test], pred))
    print("Mean Square Error for batch ", i, " : ", mean_squared_error(y[start_test:stop_test], pred))

# step 6: find the average and select the batch with highest evaluation scores
print('\nAverage Accuracy = ', acc / n_splits)
print('Average MSE = ', mse / n_splits)


Accuracy for batch  1  :  0.7719298245614035
Mean Square Error for batch  1  :  0.22807017543859648

Accuracy for batch  2  :  0.9649122807017544
Mean Square Error for batch  2  :  0.03508771929824561

Accuracy for batch  3  :  0.9824561403508771
Mean Square Error for batch  3  :  0.017543859649122806

Accuracy for batch  4  :  0.9824561403508771
Mean Square Error for batch  4  :  0.017543859649122806

Accuracy for batch  5  :  1.0
Mean Square Error for batch  5  :  0.0

Accuracy for batch  6  :  0.9649122807017544
Mean Square Error for batch  6  :  0.03508771929824561

Accuracy for batch  7  :  0.9824561403508771
Mean Square Error for batch  7  :  0.017543859649122806

Accuracy for batch  8  :  0.9824561403508771
Mean Square Error for batch  8  :  0.017543859649122806

Accuracy for batch  9  :  1.0
Mean Square Error for batch  9  :  0.0

Accuracy for batch  10  :  0.9642857142857143
Mean Square Error for batch  10  :  0.03571428571428571

Average Accuracy =  0.9595864661654134
Averag