In [1]:
import numpy as np
import pandas as pd
from FischersLinearDiscriminant import FischersLinearDiscriminant
import sys
sys.path.append("..")
from preprocessor import Preprocessor

In [2]:
# Read the CSV and discard the "id" column in one expression; every other
# column is kept exactly as loaded.
dataset = pd.read_csv("../dataset.csv").drop(columns=["id"])

In [3]:
# Wrap the loaded frame in the project's Preprocessor; "diagnosis" is passed as
# the second argument (presumably the label column — confirm in preprocessor.py).
preprocessor = Preprocessor(dataset, "diagnosis")
# Last expression of the cell, so whatever preprocess() returns is rendered as
# the cell output.
preprocessor.preprocess()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,1.094879,-2.074763,1.267062,0.981944,1.566682,3.267937,2.640012,2.524862,2.210508,...,1.879756,-1.359868,2.294663,1.992752,1.299133,2.601250,2.095524,2.284458,2.736119,1.924898
1,1,1.826349,-0.353230,1.682331,1.904450,-0.826446,-0.489576,-0.027425,0.544898,-0.000317,...,1.799228,-0.370226,1.528487,1.882378,-0.376439,-0.434738,-0.151893,1.080081,-0.246999,0.274810
2,1,1.576855,0.457451,1.563095,1.555317,0.941027,1.045082,1.355088,2.030708,0.935732,...,1.506022,-0.025153,1.341397,1.449643,0.522436,1.073115,0.845897,1.944684,1.143833,0.195287
3,1,-0.767816,0.254781,-0.592194,-0.763436,3.280121,3.386918,1.905591,1.446473,2.858822,...,-0.282118,0.132734,-0.251239,-0.549875,3.376144,3.873322,1.976057,2.164627,6.019000,4.912506
4,1,1.746964,-1.152264,1.772786,1.822134,0.279824,0.533277,1.362595,1.423309,-0.011244,...,1.293345,-1.467297,1.332488,1.214880,0.216993,-0.318115,0.605051,0.723621,-0.869087,-0.401130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,1,2.107029,0.723019,2.056485,2.338738,1.040563,0.214108,1.936869,2.313817,-0.313548,...,1.894210,0.116457,1.745273,2.006768,0.374078,-0.278185,0.656183,1.620079,-1.359021,-0.712040
565,1,1.701602,2.088132,1.612434,1.719949,0.102082,-0.021965,0.686978,1.258848,-0.218850,...,1.530800,2.045284,1.415639,1.488186,-0.690608,-0.399244,0.229924,0.728172,-0.533869,-0.976010
566,1,0.700793,2.048530,0.670883,0.576326,-0.839955,-0.042739,0.042764,0.103504,-0.808889,...,0.558267,1.373043,0.575221,0.424743,-0.808422,0.343591,0.319764,0.409634,-1.104384,-0.322712
567,1,1.834854,2.339723,1.978365,1.731303,1.524023,3.256606,3.281848,2.650975,2.130379,...,1.954090,2.235725,2.294663,1.645863,1.421310,3.884731,3.179331,2.278391,1.907744,2.206539


In [4]:
# Per-trial metric accumulators; the evaluation loop appends one entry to each
# list on every iteration.
accuracies: list[float] = []
# Each tuple holds two values per trial: index 0 is reported as "Class 1" and
# index 1 as "Class -1" by the results cell.
precisions: list[tuple[float, float]] = []
recalls: list[tuple[float, float]] = []

In [5]:
# Repeated hold-out evaluation: each of the 1000 trials requests 100 folds from
# the preprocessor, trains on the first 67 and tests on whatever remains.
# NOTE(review): no seed is set here; how get_folds randomises is not visible
# from this notebook, so results will vary between runs.
for _ in range(1000):
    splits = preprocessor.get_folds(100)
    train, test = pd.concat(splits[:67]), pd.concat(splits[67:])

    # Separate the "diagnosis" column from the remaining (feature) columns.
    X_train = train.drop(columns=["diagnosis"]).to_numpy()
    y_train = train["diagnosis"].to_numpy()
    X_test = test.drop(columns=["diagnosis"]).to_numpy()
    y_test = test["diagnosis"].to_numpy()

    # A fresh model per trial, so nothing carries over between splits.
    flda = FischersLinearDiscriminant()
    flda.fit(X_train, y_train)

    # score() is unpacked in (tp, tn, fp, fn) order — presumably confusion-matrix
    # counts; confirm against FischersLinearDiscriminant.score.
    tp, tn, fp, fn = flda.score(X_test, y_test)

    accuracies.append((tp + tn) / (tp + tn + fp + fn))
    # Tuple index 0 is built from the tp-based ratio, index 1 from the tn-based one.
    precisions.append((tp / (tp + fp), tn / (tn + fn)))
    recalls.append((tp / (tp + fn), tn / (tn + fp)))


In [6]:
# Summarise the per-trial metrics as mean / standard deviation.
#
# The ".2%" format spec scales by 100 and rounds *while formatting*, so the
# output is always two decimals. The previous `.round(4)*100` re-introduced
# binary floating-point noise after rounding (e.g. "1.7399999999999998%").
print('--------Results--------')
print(f"Accuracy mean: {np.mean(accuracies):.2%}, stdev: {np.std(accuracies):.2%}")

# Index 0 of every (precision, recall) tuple belongs to class 1, index 1 to
# class -1; iterate over both instead of duplicating the print statements.
for class_index, class_label in enumerate(("1", "-1")):
    class_precisions = [pair[class_index] for pair in precisions]
    class_recalls = [pair[class_index] for pair in recalls]

    print(f"----Class {class_label}----")
    print(f"Precision mean: {np.mean(class_precisions):.2%}, stdev: {np.std(class_precisions):.2%}")
    print(f"Recall mean: {np.mean(class_recalls):.2%}, stdev: {np.std(class_recalls):.2%}")

--------Results--------
Accuracy mean: 96.37%, stdev: 1.35%
----Class 1----
Precision mean: 97.98%, stdev: 1.7399999999999998%
Recall mean: 92.17999999999999%, stdev: 3.2300000000000004%
----Class -1----
Precision mean: 95.5%, stdev: 1.8900000000000001%
Recall mean: 98.87%, stdev: 0.98%


In [7]:
# Permute the *column* order, not the rows: frac=1 keeps every column, so only
# the position of each column (including "diagnosis") changes.
# NOTE(review): no random_state is given, so the order differs on every run.
shuffled_dataset = dataset.sample(frac=1, axis="columns")


In [8]:
# Display the original column order for comparison with the shuffled frame.
dataset.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [9]:
# Display the column order after shuffling; same names, different positions.
shuffled_dataset.columns

Index(['compactness_se', 'concavity_mean', 'compactness_mean', 'texture_se',
       'fractal_dimension_se', 'smoothness_mean', 'radius_worst',
       'concave points_mean', 'fractal_dimension_worst',
       'concave points_worst', 'symmetry_se', 'symmetry_mean',
       'symmetry_worst', 'area_worst', 'perimeter_mean', 'concavity_se',
       'texture_mean', 'smoothness_se', 'radius_se', 'area_mean',
       'compactness_worst', 'concavity_worst', 'smoothness_worst', 'diagnosis',
       'radius_mean', 'texture_worst', 'perimeter_worst', 'concave points_se',
       'area_se', 'fractal_dimension_mean', 'perimeter_se'],
      dtype='object')

In [10]:
# Rebind `preprocessor` to an instance built from the column-shuffled frame;
# subsequent get_folds() calls therefore use this instance, not the earlier one.
# "diagnosis" is again passed as the second argument (presumably the label
# column — confirm in preprocessor.py).
preprocessor = Preprocessor(shuffled_dataset, "diagnosis")
# Last expression of the cell, so whatever preprocess() returns is rendered as
# the cell output.
preprocessor.preprocess()

Unnamed: 0,compactness_se,concavity_mean,compactness_mean,texture_se,fractal_dimension_se,smoothness_mean,radius_worst,concave points_mean,fractal_dimension_worst,concave points_worst,...,concavity_worst,smoothness_worst,diagnosis,radius_mean,texture_worst,perimeter_worst,concave points_se,area_se,fractal_dimension_mean,perimeter_se
0,1.308033,2.640012,3.267937,-0.561566,0.900823,1.566682,1.879756,2.524862,1.924898,2.284458,...,2.095524,1.299133,1,1.094879,-1.359868,2.294663,0.665918,2.477199,2.245538,2.822172
1,-0.693745,-0.027425,-0.489576,-0.873226,-0.101131,-0.826446,1.799228,0.544898,0.274810,1.080081,...,-0.151893,-0.376439,1,1.826349,-0.370226,1.528487,0.264316,0.738863,-0.867962,0.261722
2,0.808145,1.355088,1.045082,-0.776855,0.290087,0.941027,1.506022,2.030708,0.195287,1.944684,...,0.845897,0.522436,1,1.576855,-0.025153,1.341397,1.431726,1.176077,-0.398959,0.847207
3,2.729763,1.905591,3.386918,-0.105713,2.036070,3.280121,-0.282118,1.446473,4.912506,2.164627,...,1.976057,3.376144,1,-0.767816,0.132734,-0.251239,1.121175,-0.287878,4.891447,0.284905
4,-0.051907,1.362595,0.533277,-0.787038,0.494921,0.279824,1.293345,1.423309,-0.401130,0.723621,...,0.605051,0.216993,1,1.746964,-1.467297,1.332488,1.150442,1.185062,-0.562828,1.267948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.187460,1.936869,0.214108,0.076119,0.165078,1.040563,1.894210,2.313817,-0.712040,1.620079,...,0.656183,0.374078,1,2.107029,0.116457,1.745273,2.075590,2.593351,-0.930119,2.370357
565,-0.073060,0.686978,-0.021965,2.270833,-0.490466,0.102082,1.530800,1.258848,-0.976010,0.728172,...,0.229924,-0.690608,1,1.701602,2.045284,1.415639,0.813876,1.285873,-1.057258,1.152035
566,0.655061,0.042764,-0.042739,-0.252997,0.034421,-0.839955,0.558267,0.103504,-0.322712,0.409634,...,0.319764,-0.808422,1,0.700793,1.373043,0.575221,0.617140,0.179360,-0.894803,0.275040
567,2.006095,3.281848,3.256606,0.692530,0.897811,1.524023,1.954090,2.650975,2.206539,2.278391,...,3.179331,1.421310,1,1.834854,2.235725,2.294663,0.791114,1.004917,1.037714,1.432693


In [11]:
# Rebind the accumulators to fresh empty lists so the metrics of the
# shuffled-column run are not appended to those of the earlier run.
accuracies: list[float] = []
# Each tuple holds two values per trial: index 0 is reported as "Class 1" and
# index 1 as "Class -1" by the results cell.
precisions: list[tuple[float, float]] = []
recalls: list[tuple[float, float]] = []

In [12]:
# Repeated hold-out evaluation on the current `preprocessor`: each of the 1000
# trials requests 100 folds, trains on the first 67 and tests on whatever remains.
# NOTE(review): no seed is set here; how get_folds randomises is not visible
# from this notebook, so results will vary between runs.
for _ in range(1000):
    splits = preprocessor.get_folds(100)
    train, test = pd.concat(splits[:67]), pd.concat(splits[67:])

    # Separate the "diagnosis" column from the remaining (feature) columns.
    X_train = train.drop(columns=["diagnosis"]).to_numpy()
    y_train = train["diagnosis"].to_numpy()
    X_test = test.drop(columns=["diagnosis"]).to_numpy()
    y_test = test["diagnosis"].to_numpy()

    # A fresh model per trial, so nothing carries over between splits.
    flda = FischersLinearDiscriminant()
    flda.fit(X_train, y_train)

    # score() is unpacked in (tp, tn, fp, fn) order — presumably confusion-matrix
    # counts; confirm against FischersLinearDiscriminant.score.
    tp, tn, fp, fn = flda.score(X_test, y_test)

    accuracies.append((tp + tn) / (tp + tn + fp + fn))
    # Tuple index 0 is built from the tp-based ratio, index 1 from the tn-based one.
    precisions.append((tp / (tp + fp), tn / (tn + fn)))
    recalls.append((tp / (tp + fn), tn / (tn + fp)))

In [13]:
# Summarise the per-trial metrics as mean / standard deviation.
#
# The ".2%" format spec scales by 100 and rounds *while formatting*, so the
# output is always two decimals. The previous `.round(4)*100` re-introduced
# binary floating-point noise after rounding (e.g. "1.4000000000000001%").
print('--------Results--------')
print(f"Accuracy mean: {np.mean(accuracies):.2%}, stdev: {np.std(accuracies):.2%}")

# Index 0 of every (precision, recall) tuple belongs to class 1, index 1 to
# class -1; iterate over both instead of duplicating the print statements.
for class_index, class_label in enumerate(("1", "-1")):
    class_precisions = [pair[class_index] for pair in precisions]
    class_recalls = [pair[class_index] for pair in recalls]

    print(f"----Class {class_label}----")
    print(f"Precision mean: {np.mean(class_precisions):.2%}, stdev: {np.std(class_precisions):.2%}")
    print(f"Recall mean: {np.mean(class_recalls):.2%}, stdev: {np.std(class_recalls):.2%}")

--------Results--------
Accuracy mean: 96.3%, stdev: 1.4000000000000001%
----Class 1----
Precision mean: 97.96000000000001%, stdev: 1.71%
Recall mean: 92.03%, stdev: 3.4099999999999997%
----Class -1----
Precision mean: 95.42%, stdev: 1.96%
Recall mean: 98.86%, stdev: 0.96%
