In [None]:
# Below is the code that implements logistic regression for the WDBC dataset.
# Each equation is labelled with a number that corresponds to its explanation
# in the report. The hyperparameters are stated explicitly, and the
# experimental results (accuracy, precision, recall and F1 score) are
# explained in the report as well.

# The feature-scaling technique that was the best fit for this data set is left
# uncommented. The other methods, which were used as part of the analysis
# process, have been commented out. The hyperparameters were tuned on a
# trial-and-error basis, plugging in values as deemed necessary.

import pandas
import numpy
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix


# Function to load data from CSV and map 0s and 1s
def load_data(path='wdbc.csv'):
    """Load the WDBC data set and encode the diagnosis labels numerically.

    The CSV is read without a header row, its first column (the patient ID)
    is dropped, and every 'M' value is replaced with 1 and every 'B' value
    with 0.

    Args:
        path: Location of the CSV file. Defaults to 'wdbc.csv' in the
            current working directory.

    Returns:
        A new pandas.DataFrame without the ID column. The remaining columns
        keep the integer labels assigned by read_csv (1, 2, ...).
    """
    # Load data from CSV
    raw_frame = pandas.read_csv(path, header=None)
    # Drop Patient ID column (returns a new frame instead of mutating in place)
    without_id = raw_frame.drop(columns=raw_frame.columns[0])
    # Map M to 1 and B to 0 in a single pass
    encoded = without_id.replace({'M': 1, 'B': 0})
    # Make sure the encoded label column is numeric rather than left as object
    return encoded.infer_objects()

# Function to calculate sigmoid (Equation 2)
def sigmoid(z):
    """Evaluate the logistic function 1 / (1 + e^(-z)) element-wise (Equation 2).

    Args:
        z: Scalar or array of pre-activation values.

    Returns:
        The logistic function applied to every element of z.
    """
    decay = numpy.exp(-1 * z)
    return 1 / (1 + decay)


# Function to calculate loss (Equation 3)
def loss(h, y, m):
    """Compute the averaged binary cross-entropy loss (Equation 3).

    Args:
        h: Predicted probabilities (sigmoid outputs), array-like.
        y: Target labels aligned with h, array-like.
        m: Number of samples the summed loss is divided by.

    Returns:
        The scalar -(1/m) * sum(y^T . log(h) + (1 - y)^T . log(1 - h)).
    """
    # Work on plain float arrays so DataFrame and ndarray inputs behave alike
    targets = numpy.asarray(y, dtype=float)
    # Keep probabilities strictly inside (0, 1): a saturated sigmoid returns
    # exactly 0 or 1, and 0 * log(0) would turn the whole loss into NaN
    epsilon = 1e-15
    probabilities = numpy.clip(numpy.asarray(h, dtype=float), epsilon, 1 - epsilon)
    positive_term = numpy.dot(targets.transpose(), numpy.log(probabilities))
    negative_term = numpy.dot((1 - targets).transpose(), numpy.log(1 - probabilities))
    return (-1 / m) * numpy.sum(positive_term + negative_term)


# Function to calculate delta theta (Equation 6)
def update_weight(y, a, x, m):
    """Compute the averaged gradient of the loss w.r.t. the weights (Equation 6).

    Args:
        y: Target labels.
        a: Sigmoid activations predicted for the same samples.
        x: Feature values, one row (or entry) per sample.
        m: Number of samples the gradient is averaged over.

    Returns:
        -(1/m) * (y - a)^T . x
    """
    residual = y - a
    weighted_sum = numpy.dot(residual.transpose(), x)
    return (-1 / m) * weighted_sum


In [None]:
    # Fetch data frame from the csv file
    data_frame = load_data()

    # Shuffle with a fixed seed so that Restart & Run All reproduces the same split
    RANDOM_SEED = 42
    shuffled = data_frame.sample(frac=1, random_state=RANDOM_SEED)

    # Split data frame to 80-10-10 ratio for train, validation and test respectively.
    # Positional slicing is used instead of numpy.split, which relies on the
    # deprecated DataFrame.swapaxes when it is handed a DataFrame.
    train_end = int(.8 * len(shuffled))
    validate_end = int(.9 * len(shuffled))
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:validate_end]
    test = shuffled.iloc[validate_end:]

    # Normalize the training data - using min max scaler.
    # Column 0 holds the 0/1 label and is deliberately left unscaled; only the
    # feature columns (1 onwards) go through the scaler.
    min_max_scalar = preprocessing.MinMaxScaler()
    # min_max_scalar = preprocessing.StandardScaler()
    x_scaled_train = train.values.astype(float)  # astype returns a copy
    x_scaled_train[:, 1:] = min_max_scalar.fit_transform(x_scaled_train[:, 1:])
    df_train = pandas.DataFrame(x_scaled_train)

    # Normalize the validation data with the statistics learned from the
    # training split (transform, NOT fit_transform, so nothing is re-fitted
    # on held-out data and all splits share one scale)
    x_scaled_validate = validate.values.astype(float)
    x_scaled_validate[:, 1:] = min_max_scalar.transform(x_scaled_validate[:, 1:])
    df_validate = pandas.DataFrame(x_scaled_validate)

    # Normalize the testing data the same way
    x_scaled_test = test.values.astype(float)
    x_scaled_test[:, 1:] = min_max_scalar.transform(x_scaled_test[:, 1:])
    df_test = pandas.DataFrame(x_scaled_test)

    # Split to feature vectors and target vectors for each of the datasets.
    # Targets keep column label 0; features keep labels 1, 2, ...
    X_train = df_train[df_train.columns[1:]]
    Y_train = df_train[df_train.columns[:1]]
    X_test = df_test[df_test.columns[1:]]
    Y_test = df_test[df_test.columns[:1]]
    X_validate = df_validate[df_validate.columns[1:]]
    Y_validate = df_validate[df_validate.columns[:1]]

In [None]:
# Hyperparameter 1 : Epoch
epoch_val = 500
# Hyperparameter 2 : Learning Rate (set once; it does not change between epochs)
learning_rate = 0.5

# Plain float matrices: sample counts and matrix products no longer depend on
# the DataFrames' column labels, and numpy.sum on them always yields a scalar
train_features = X_train.to_numpy(dtype=float)
train_targets = Y_train.to_numpy(dtype=float)
validate_features = X_validate.to_numpy(dtype=float)
validate_targets = Y_validate.to_numpy(dtype=float)
# Number of samples in each split
m1 = train_features.shape[0]
m2 = validate_features.shape[0]

# Bias
bias = 0.0
# Weights - one per feature column instead of a hard-coded count
theta = numpy.zeros((train_features.shape[1], 1))
# Lists to track loss values for each epoch for both training and validation data
loss_train_tracker = []
loss_validation_tracker = []

for epoch in range(epoch_val):
    # Training data forward pass
    # hypothesis function (Equation 1)
    predicted_train_vector = numpy.dot(train_features, theta) + bias
    a_train_vector = sigmoid(predicted_train_vector)
    loss_value_train = loss(a_train_vector, train_targets, m1)
    loss_train_tracker.append(loss_value_train)

    # Validation data forward pass (uses the weights from before this epoch's update)
    predicted_validate_vector = numpy.dot(validate_features, theta) + bias
    a_validate_vector = sigmoid(predicted_validate_vector)
    loss_value_validate = loss(a_validate_vector, validate_targets, m2)
    loss_validation_tracker.append(loss_value_validate)

    # Training data backward pass
    # Update all weights at once (Equation 4): update_weight returns a
    # (1, n_features) row, which is transposed to match theta's column shape
    theta -= learning_rate * update_weight(train_targets, a_train_vector, train_features, m1).transpose()
    # Update bias (Equation 5 and 7)
    bias -= learning_rate * (-1 / m1 * float(numpy.sum(train_targets - a_train_vector)))

# Plot the graphs
epochs = range(epoch_val)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title(f'Loss versus Epoch, LR = {learning_rate}')
plt.plot(epochs, loss_train_tracker, 'r', label='Training')
plt.plot(epochs, loss_validation_tracker, 'b', label='Validation')
plt.legend()
plt.show()

In [None]:
# Test data
predicted_test_vector = numpy.dot(X_test, theta) + bias
a_test_vector = numpy.array(sigmoid(predicted_test_vector))
y_test_data = numpy.array(Y_test)

# Classify output of the sigmoid as 1 or 0
predicted_labels = (a_test_vector >= 0.5).astype(int).ravel()
predicted_test_result = predicted_labels.tolist()
actual_labels = y_test_data.ravel()

# Confusion-matrix counts, computed without a per-sample Python loop
correct = actual_labels == predicted_labels
TP = int(numpy.sum(correct & (predicted_labels == 1)))  # True Positive
TN = int(numpy.sum(correct & (predicted_labels == 0)))  # True Negative
FN = int(numpy.sum((actual_labels == 1) & (predicted_labels == 0)))  # False Negative
# False Positive: every remaining misclassified sample
FP = len(predicted_labels) - TP - TN - FN

# Each ratio falls back to 0.0 when its denominator is zero (e.g. the model
# predicts no positives at all) instead of raising ZeroDivisionError
total = TP + TN + FP + FN
# Accuracy calculation
accuracy = (TP + TN) / total if total else 0.0
# Precision calculation
precision = TP / (TP + FP) if (TP + FP) else 0.0
# Recall calculation
recall = TP / (TP + FN) if (TP + FN) else 0.0
# F1 Score calculation - equivalent to 2*TP / (2*TP + FP + FN)
f1_score = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) else 0.0
print(f"Accuracy : {accuracy}\nPrecision : {precision}\nRecall : {recall}\nF1 Score : {f1_score}")