In [1]:
# IMPORT PANDAS TO READ THE DATASET

import pandas as pd

data = pd.read_csv("wdbc.csv") # READ THE DATASET

In [4]:
# DATA NORMALIZATION

x = data.copy() # COPY THE DATA INTO A VARIABLE x

# Delete non-numerical values
del x["ID Number"]
del x["Diagnosis"]

# Normalize to unit variance
data_normalized = (x - x.mean())/x.std()
#data_normalized.to_csv("Normalized Data.csv")
data_normalized.insert(0, column = "Diagnosis", value = data["Diagnosis"])

total_data_normalized = data_normalized.groupby(data["Diagnosis"])

data_benign = total_data_normalized.get_group("B")

data_malignant = total_data_normalized.get_group("M")

del data_benign['Diagnosis']
del data_malignant['Diagnosis']


In [5]:
# IMPORT NUMPY FOR MATHEMATICAL CALCULATIONS

import numpy as np

# Convert Benign and Malignant datasets into array
b = np.array(data_benign)
m = np.array(data_malignant)


In [7]:
# FISHER'S LINEAR DISCRIMINANT ANALYSIS EQUATION:
    #   J(W) = ((MEAN_b - MEAN_m)^2)/((COV_b)^2 + (COV_m)^2)


# COMPLETE THE NUMERATOR
numerator = (np.mean(m,axis=0).reshape(-1,1) - (np.mean(b,axis=0).reshape(-1,1)))
numerator = numerator * numerator.T

# Find covariance for Denominator
covariance_b = np.cov(b.T)
covariance_m = np.cov(m.T)

denominator = (covariance_b * covariance_b) + (covariance_m * covariance_m)
inv_denominator = np.linalg.inv(denominator)
eig_vals, eig_vecs = np.linalg.eig(inv_denominator.dot(numerator))

# CONVERT INTO REAL VALUES
eig_vals = eig_vals.real
eig_vecs = eig_vecs.real

In [8]:
# FOR THE SPLIT FOR TRAIN/TEST

x_data = data_normalized.drop('Diagnosis', axis = 1)

y_data = data_normalized['Diagnosis'] # Target Data

# Convert into arrays
x = x_data.to_numpy()
y = y_data.to_numpy()


#take only eigenvec corresponding to largest(and the only one)eigenvalue
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

# Sort the (eigenvalue, eigenvector) in descending order
eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)

# Initialize the w vector
W = eig_vecs[:, :1]

#   The final transformed values are in x_transform
#   The Fisher's Linear Discriminant has been successfully applied
x_transform = np.dot(x, W)

In [9]:
#Train and test the data
from sklearn.model_selection import train_test_split

# Give the values to be trained in the dataset
x_train,x_test,y_train,y_test = train_test_split(x_transform, y, test_size = 0.4, random_state = 0)

# To check how many are being tested and how many are being trained
print("These many are being trained: ", x_train.shape[0])
print("These many are being tested: ", x_test.shape[0])

These many are being trained:  341
These many are being tested:  228


In [14]:
#   USE A  MODEL ON THE TRANFORMED (FISHER LINEAR DISCRIMINAT) DATA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

flda = LinearDiscriminantAnalysis()

# Fit and Predict the data
flda.fit(x_train, y_train)
flda_predicted = flda.predict(x_test)

# Attain accuracy of the model buy importing accuracy_score
from sklearn.metrics import accuracy_score

print("\nAccuracy of Linear Discriminant Analysis on the transformed data by Fisher's Linear Discriminant Analysis: {}".format(accuracy_score(y_test,flda_predicted)))


Accuracy of Linear Discriminant Analysis on the transformed data by Fisher's Linear Discriminant Analysis: 0.9692982456140351


In [12]:
from sklearn.metrics import confusion_matrix

confusion_matrix_flda = pd.DataFrame(confusion_matrix(y_test, flda_predicted), index = ['Actual Negative','Actual Positive'], columns = ['Predicted Negative','Predicted Positive'] )

print("\nFisher's Linear discriminant Model Confusion Matrix\n")
confusion_matrix_flda


Fisher's Linear discriminant Model Confusion Matrix



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,144,1
Actual Positive,6,77
