In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Read the mammogram CSV (path is relative to the notebook's working directory)
# into a DataFrame that the rest of the notebook works on.
mammogram = pd.read_csv(filepath_or_buffer='mammogram_dataset.csv')

In [None]:
# Preview the first 10 rows of the raw data.
mammogram.head(n=10)

In [None]:
# Handling missing values
# The dataset encodes a missing value as the sentinel -100000. pandas would treat
# that as an ordinary number, so swap it for NaN (via DataFrame.replace) in every
# column listed below; isnull()/fillna() can then see the gaps.
sentinel_columns = ['BI_RADS_assessment', 'age', 'shape', 'margin', 'density', 'severity']
mammogram[sentinel_columns] = mammogram[sentinel_columns].replace(-100000, np.nan)
# Number of missing values per attribute
missing_per_column = mammogram.isnull().sum()
print(missing_per_column)

In [None]:
# Preview the first 10 rows again, now with NaN where the sentinel used to be.
mammogram.head(n=10)

In [None]:
# Impute the remaining missing values with the mean of each column.
# Rows whose label is missing are dropped first: a mean-imputed 'severity' would be
# a fractional number rather than a valid class label.
mammogram = mammogram.dropna(subset=['severity'])
# numeric_only=True keeps .mean() from failing if a non-numeric column is present;
# reassigning (instead of inplace=True) keeps this cell safe to re-run.
mammogram = mammogram.fillna(mammogram.mean(numeric_only=True))
# NOTE(review): these means are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputed values.
# Check missing values again just to be sure
print(mammogram.isnull().sum())
# Observe dataset
mammogram.head(10)

In [None]:
# Across the whole dataset, count how many instances belong to each class.
# Original note: 0 = benign, 2 = malignant.
# NOTE(review): confirm this label encoding against the counts displayed below.
severity_counts = mammogram['severity'].value_counts()
severity_counts

In [None]:
# Bar chart of the class distribution over the whole dataset.
# The column is passed as `x=`: recent seaborn releases only accept `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(x=mammogram['severity'])

In [None]:
# Separate the target from the features.
# y -> target column ('severity')
# X -> every remaining column, used as model input
y = mammogram['severity']
X = mammogram.drop(columns='severity')

In [None]:
# Split the dataset into a training set (66%) and a testing set (the remaining 34%).
# random_state pins the shuffle so the split, and every metric computed from it,
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.66, random_state=1)
# Observe how many instances (and columns) each part holds
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Class distribution inside the testing set.
test_class_counts = y_test.value_counts()
test_class_counts

In [None]:
# Bar chart of the class distribution in the testing set.
# The labels are passed as `x=`: recent seaborn releases only accept `data`
# positionally, so a bare positional Series is no longer read as the x variable.
sns.countplot(x=y_test)

In [None]:
# Show the first 10 samples of the training features.
train_preview = X_train[:10]
train_preview

In [None]:
# Min-max normalization of the features.
# The scaler is fitted AFTER the split and on the training set only, to avoid data
# leakage: if the test rows contributed to the scaling parameters, the model would
# indirectly learn from data it is later evaluated on, and the reported accuracy
# would be biased.
# Put differently, the test set stands in for fresh, unseen data; it is only
# transformed with the parameters learned from the training set, never used to fit.
normalization = MinMaxScaler().fit(X_train)
X_train = normalization.transform(X_train)
X_test = normalization.transform(X_test)

# First 10 rows of the normalized training data
X_train[:10]

In [None]:
# Neural network (multi-layer perceptron) classifier.
# Configurations tried earlier are kept below for reference:
# 1st case
# mlp = MLPClassifier(hidden_layer_sizes=[4], activation='logistic', learning_rate='constant', learning_rate_init=0.8, max_iter=30, solver='sgd', verbose=False, random_state=1)
# 2nd case
# mlp = MLPClassifier(hidden_layer_sizes=[4], activation='logistic', learning_rate='constant', learning_rate_init=0.8, max_iter=50, solver='sgd', verbose=False, random_state=1)
# 2nd case w/ momentum
# mlp = MLPClassifier(hidden_layer_sizes=4, activation='logistic', learning_rate='constant', learning_rate_init=0.02, max_iter=1000, verbose=False, solver='sgd', momentum=0.25)
# 3rd case w/ adaptive learning rate
# mlp = MLPClassifier(hidden_layer_sizes=5, activation='logistic', learning_rate='adaptive', learning_rate_init=0.02, max_iter=1000)

# 3rd case (active): one hidden layer of 4 logistic units, trained with SGD at a
# constant learning rate with momentum; verbose=True logs training progress.
mlp = MLPClassifier(
    hidden_layer_sizes=[4],
    activation='logistic',
    solver='sgd',
    learning_rate='constant',
    learning_rate_init=0.50,
    momentum=0.92,
    max_iter=100,
    random_state=1,
    verbose=True,
)

# Train on the normalized training set, then predict the test-set labels.
mlp.fit(X_train, y_train)
predict_mlp = mlp.predict(X_test)

In [None]:
# Look at the MSE and the confusion matrix of the model on the test set,
# using sklearn's mean_squared_error and confusion_matrix.
from sklearn.metrics import mean_squared_error
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Bind the name to its
    # replacement, which takes the same leading (estimator, X, y) arguments, so the
    # name stays usable on current releases.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
# The `squared` keyword is omitted: True was its default, and the keyword has been
# removed from recent scikit-learn releases.
mse = mean_squared_error(y_test, predict_mlp)

print("\nMean Squared Error (MSE): ", mse, "\n")
# Optional per-class precision / recall / F1 report:
#print("\t===================================================")
#print(classification_report(y_test, predict_mlp))
#print("\t===================================================")
print(confusion_matrix(y_test, predict_mlp))

In [None]:
# Draw the confusion matrix of the fitted model on the test set.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which was
# removed in scikit-learn 1.2 (requires scikit-learn >= 1.0). The figure is shown
# directly rather than print()-ing the display object's repr.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(mlp, X_test, y_test)
plt.show()

In [None]:
# Fraction of test samples classified correctly, also reported as a percentage.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_true=y_test, y_pred=predict_mlp)
accuracy_percent = round(accuracy * 100, 2)
print("Model accuracy: ", accuracy, "-> ", accuracy_percent, " %")