<a href="https://colab.research.google.com/github/parisazeynaly/Breast-cancer-diagnosis/blob/main/breast_cancer_diagnosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [None]:
# Fetch the breast-cancer dataset and assemble one frame: the feature
# columns first, then the numeric label and its human-readable name.
bc = load_breast_cancer()
df = (
    pd.DataFrame(bc.data, columns=bc.feature_names)
    .assign(target=bc.target)
    .assign(target_name=lambda frame: frame['target'].map({0: 'malignant', 1: 'benign'}))
)

In [None]:
# Show the description text bundled with the dataset object.
print(bc.DESCR)


.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

Dataset Info

In [None]:
# Label of the sample at index 450.
bc.target[450]

1

In [None]:
# Shape of the label array.
bc.target.shape

(569,)

In [None]:
# Raw feature vector of the first sample.
bc.data[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [None]:
# Shape of the feature matrix.
bc.data.shape

(569, 30)

**Preprocessing**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    bc.data,
    bc.target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Sanity-check the sizes of the train/test splits for features and labels.
print(f"feature=> train:{X_train.shape} - test:{X_test.shape}")
print(f"target=> train:{y_train.shape} - test:{y_test.shape}")

feature=> train:(455, 30) - test:(114, 30)
target=> train:(455,) - test:(114,)


In [None]:
# First training sample, shown here before any scaling is applied.
X_train[0]

array([1.005e+01, 1.753e+01, 6.441e+01, 3.108e+02, 1.007e-01, 7.326e-02,
       2.511e-02, 1.775e-02, 1.890e-01, 6.331e-02, 2.619e-01, 2.015e+00,
       1.778e+00, 1.685e+01, 7.803e-03, 1.449e-02, 1.690e-02, 8.043e-03,
       2.100e-02, 2.778e-03, 1.116e+01, 2.684e+01, 7.198e+01, 3.840e+02,
       1.402e-01, 1.402e-01, 1.055e-01, 6.499e-02, 2.894e-01, 7.664e-02])

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn each feature's min/max from the training split only, then rescale
# both splits with those training statistics (training values land in [0, 1]).
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# The same training sample after min-max scaling.
X_train[0]

array([0.1452506 , 0.32448133, 0.14249188, 0.07096501, 0.52210275,
       0.18450791, 0.05883318, 0.08822068, 0.41919192, 0.28117102,
       0.05446315, 0.36571782, 0.04810818, 0.01798599, 0.28172272,
       0.09191276, 0.04267677, 0.1523584 , 0.2448393 , 0.06506087,
       0.11490573, 0.39498934, 0.10742567, 0.04885961, 0.45585419,
       0.10954585, 0.08426518, 0.22387186, 0.26197516, 0.14167651])

Classification

In [None]:
from re import A
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score
def calculate_metrics(y_test,y_train,y_pred_train, y_pred_test):
    """Print and return accuracy, precision and recall for a classifier.

    Note the parameter order: the *test* labels come first, then the train
    labels. Calling with keyword arguments avoids swapping them by accident.

    Parameters
    ----------
    y_test : true labels of the test split.
    y_train : true labels of the training split.
    y_pred_train : predicted labels for the training split.
    y_pred_test : predicted labels for the test split.

    Returns
    -------
    tuple
        ``(acc_train, acc_test, precision, recall)``. Precision and recall
        are computed on the test split only.
    """
    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    p = precision_score(y_true=y_test, y_pred=y_pred_test)
    r = recall_score(y_true=y_test, y_pred=y_pred_test)

    print(f'acc_train:{acc_train} - acc_test:{acc_test}')
    print(f'precision:{p} - recall:{r}')

    # Hand the values back so callers can store or compare them instead of
    # only seeing the printed report.
    return acc_train, acc_test, p, r




In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the training split.
Gnb = GaussianNB()
Gnb.fit(X=X_train, y=y_train)


In [None]:
# Evaluate Gaussian Naive Bayes on both splits.
y_pred_train = Gnb.predict(X_train)
y_pred_test = Gnb.predict(X_test)

acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

# Report the scores. Keyword arguments are used because calculate_metrics
# takes y_test *before* y_train, which is easy to get backwards positionally.
# The result is bound to a name so the cell shows only the printed report.
gnb_metrics = calculate_metrics(
    y_test=y_test,
    y_train=y_train,
    y_pred_train=y_pred_train,
    y_pred_test=y_pred_test,
)

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k = 5, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=y_train)


In [None]:
# Evaluate the k-NN classifier on both splits.
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

# Report the scores instead of silently discarding them. Keyword arguments
# are used because calculate_metrics takes y_test *before* y_train.
# The result is bound to a name so the cell shows only the printed report.
knn_metrics = calculate_metrics(
    y_test=y_test,
    y_train=y_train,
    y_pred_train=y_pred_train,
    y_pred_test=y_pred_test,
)

Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-based decision tree. random_state is pinned because scikit-learn
# randomly permutes the candidate features at each split, so an unseeded fit
# can differ between runs of the notebook.
dtc = DecisionTreeClassifier(
    max_depth=100,
    min_samples_split=5,
    criterion='entropy',
    random_state=0,
)
dtc.fit(X_train, y_train)

In [None]:
# Evaluate the decision tree on both splits.
y_pred_train = dtc.predict(X_train)
y_pred_test = dtc.predict(X_test)

acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

# Report the scores instead of silently discarding them. Keyword arguments
# are used because calculate_metrics takes y_test *before* y_train.
# The result is bound to a name so the cell shows only the printed report.
dtc_metrics = calculate_metrics(
    y_test=y_test,
    y_train=y_train,
    y_pred_train=y_pred_train,
    y_pred_test=y_pred_test,
)