In [1]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

import pandas as pd
import numpy as np

In [2]:
# reading our dataframe

dataset = pd.read_csv('data.csv').dropna(axis=1, how='all')

In [3]:
# getting the data structure

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [4]:
dataset = dataset.drop('id', axis=1)

In [5]:
dataset['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [6]:
dataset['diagnosis'].value_counts() / len(dataset) * 100

B    62.741652
M    37.258348
Name: diagnosis, dtype: float64

In [7]:
dataset.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [8]:
iv_names = dataset.columns.tolist()[1:]
dataset[iv_names] = dataset[iv_names].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [9]:
train_data = dataset.iloc[:470]
test_data = dataset.iloc[470:]

In [10]:
X_train = train_data[iv_names].to_numpy()
X_test = test_data[iv_names].to_numpy()

y_train = train_data['diagnosis'].to_numpy()
y_test = test_data['diagnosis'].to_numpy()

In [11]:
#LDA

lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_predictions = lda.predict(X_test)

f1 = f1_score(y_test, lda_predictions, average='weighted')
print('LDA f1 score:', f1)

LDA F1 score: 0.9694592988710635


In [13]:
#QDA

qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_predictions = qda.predict(X_test)

f1 = f1_score(y_test, qda_predictions, average='weighted')
print('QDA f1 score:', f1)

QDA F1 score: 0.9606661206661207


In [14]:
#Logistic Regression Classifier

lrc = LogisticRegression()
lrc.fit(X_train, y_train)
lrc_predictions = lrc.predict(X_test)

f1 = f1_score(y_test, lrc_predictions, average='weighted')
print('LRC f1 score:', f1)

LRC F1 score: 0.9694592988710635


In [16]:
#k-NN

for i in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    knn_predictions = knn.predict(X_test)

    f1 = f1_score(y_test, knn_predictions, average='weighted')
    print('KNN with k=', i,' f1 score:', f1)

KNN with k= 1  f1 score: 0.9307456153093736
KNN with k= 2  f1 score: 0.9689331423765032
KNN with k= 3  f1 score: 0.9601683501683501
KNN with k= 4  f1 score: 0.969919001199824
KNN with k= 5  f1 score: 0.9800841750841751
KNN with k= 6  f1 score: 0.9899730003999414
KNN with k= 7  f1 score: 0.9800841750841751
KNN with k= 8  f1 score: 0.9899730003999414
KNN with k= 9  f1 score: 0.9899730003999414


In [None]:
#Random Forest

lrc = LogisticRegression()
lrc.fit(X_train, y_train)
lrc_predictions = lrc.predict(X_test)

f1 = f1_score(y_test, lrc_predictions, average='weighted')
print('LRC f1 score:', f1)