# Loading and Binarising Data

In [9]:
import sklearn.datasets
import numpy as np
import pandas as pd

In [4]:
breast_cancer =  sklearn.datasets.load_breast_cancer()

In [8]:
x = breast_cancer.data
y = breast_cancer.target

In [10]:
data = pd.DataFrame(x, columns = breast_cancer.feature_names)
data['class'] = y

In [11]:
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,class
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [13]:
from sklearn.model_selection import train_test_split

In [18]:
x = data.drop('class', axis=1)
y = data['class']

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state=1, stratify=y)

In [28]:
data.groupby('class').mean()

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


In [20]:
print(x_train.shape, x_test.shape)

(512, 30) (57, 30)


In [22]:
x_binarised_train = x_train.apply(pd.cut, bins=2, labels=[0,1])
x_binarised_test = x_test.apply(pd.cut, bins=2, labels=[0,1])

In [29]:
x_binarised_train = x_binarised_train.values
x_binarised_test = x_binarised_test.values

---
# Applying Model

In [33]:
# b is the only parameter that we need to find
max_acc = 0
max_acc_param=0
for b in range(x_binarised_train.shape[1] + 1):
    accurate_rows = 0
    for x, y in zip(x_binarised_train, y_train):
        y_pred = (np.sum(x) >= b)
        accurate_rows += (y_pred == y)
    accuracy = accurate_rows/x_binarised_train.shape[0]*100
    print("For b =", b, ", Acc = ", (accuracy), "%")
    if accuracy >  max_acc:
        max_acc = accuracy
        max_acc_param = b
print("The Maximum accurcy for this model is", max_acc, "% for parameter =", max_acc_param)

For b = 0 , Acc =  62.6953125 %
For b = 1 , Acc =  24.0234375 %
For b = 2 , Acc =  18.5546875 %
For b = 3 , Acc =  15.0390625 %
For b = 4 , Acc =  15.4296875 %
For b = 5 , Acc =  18.1640625 %
For b = 6 , Acc =  21.484375 %
For b = 7 , Acc =  24.4140625 %
For b = 8 , Acc =  27.5390625 %
For b = 9 , Acc =  29.8828125 %
For b = 10 , Acc =  31.25 %
For b = 11 , Acc =  32.8125 %
For b = 12 , Acc =  33.59375 %
For b = 13 , Acc =  34.375 %
For b = 14 , Acc =  35.15625 %
For b = 15 , Acc =  35.7421875 %
For b = 16 , Acc =  36.71875 %
For b = 17 , Acc =  36.9140625 %
For b = 18 , Acc =  37.3046875 %
For b = 19 , Acc =  37.3046875 %
For b = 20 , Acc =  37.3046875 %
For b = 21 , Acc =  37.3046875 %
For b = 22 , Acc =  37.3046875 %
For b = 23 , Acc =  37.3046875 %
For b = 24 , Acc =  37.3046875 %
For b = 25 , Acc =  37.3046875 %
For b = 26 , Acc =  37.3046875 %
For b = 27 , Acc =  37.3046875 %
For b = 28 , Acc =  37.3046875 %
For b = 29 , Acc =  37.3046875 %
For b = 30 , Acc =  37.3046875 %
The Ma

In [36]:
data.groupby('class').mean()

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


> If we look at the mean data above, its clear that when the breast cancer is **melignant** (i.e. 1), the mean of the values is lower as compared to cases when the breast cancer is **benign** (i.e. 0). Now on applying **pd.cut** all the values that are above the mean were labeled 0 and vice-versa. Now, according this model the prediction will tend to 1 as more and more elements of the dataset tends to 1, which will result in a wrong prediction in this case, since the y=1 has lower mean values. Thus we'll invert the lables and re-run the model.

In [34]:
x_binarised_train = x_train.apply(pd.cut, bins=2, labels=[1,0])
x_binarised_test = x_test.apply(pd.cut, bins=2, labels=[1,0])
x_binarised_train = x_binarised_train.values
x_binarised_test = x_binarised_test.values

In [37]:
#Reapplying the same model again
# b is the only parameter that we need to find
max_acc = 0
max_acc_param=0
for b in range(x_binarised_train.shape[1] + 1):
    accurate_rows = 0
    for x, y in zip(x_binarised_train, y_train):
        y_pred = (np.sum(x) >= b)
        accurate_rows += (y_pred == y)
    accuracy = accurate_rows/x_binarised_train.shape[0]*100
    print("For b =", b, ", Acc = ", (accuracy), "%")
    if accuracy >  max_acc:
        max_acc = accuracy
        max_acc_param = b
print("The Maximum accurcy for this model is", max_acc, "% for parameter =", max_acc_param)

For b = 0 , Acc =  62.6953125 %
For b = 1 , Acc =  62.6953125 %
For b = 2 , Acc =  62.6953125 %
For b = 3 , Acc =  62.6953125 %
For b = 4 , Acc =  62.6953125 %
For b = 5 , Acc =  62.6953125 %
For b = 6 , Acc =  62.6953125 %
For b = 7 , Acc =  62.6953125 %
For b = 8 , Acc =  62.6953125 %
For b = 9 , Acc =  62.6953125 %
For b = 10 , Acc =  62.6953125 %
For b = 11 , Acc =  62.6953125 %
For b = 12 , Acc =  62.6953125 %
For b = 13 , Acc =  62.6953125 %
For b = 14 , Acc =  63.0859375 %
For b = 15 , Acc =  63.28125 %
For b = 16 , Acc =  64.2578125 %
For b = 17 , Acc =  64.84375 %
For b = 18 , Acc =  65.625 %
For b = 19 , Acc =  66.40625 %
For b = 20 , Acc =  67.1875 %
For b = 21 , Acc =  68.75 %
For b = 22 , Acc =  70.1171875 %
For b = 23 , Acc =  72.4609375 %
For b = 24 , Acc =  75.5859375 %
For b = 25 , Acc =  78.515625 %
For b = 26 , Acc =  81.8359375 %
For b = 27 , Acc =  84.5703125 %
For b = 28 , Acc =  84.9609375 %
For b = 29 , Acc =  81.4453125 %
For b = 30 , Acc =  75.9765625 %
The Ma

In [38]:
# Now, let's run this model for test-dataset for parameter=28 and see how accurate the model is.

In [40]:
b = 28
accurate_rows = 0
for x, y in zip(x_binarised_test, y_test):
    y_pred = (np.sum(x) >= b)
    accurate_rows += (y_pred == y)
accuracy = accurate_rows/x_binarised_test.shape[0]*100
print("Acc = ", (accuracy), "%")

Acc =  78.94736842105263 %


In [43]:
# Another way of calculating accuracy using sklearn.metrics
from sklearn.metrics import accuracy_score

In [45]:
b = 28
y_pred_test = []
for x in x_binarised_test:
    y_pred = (np.sum(x) >= b)
    y_pred_test.append(y_pred)
accuracy = accuracy_score(y_pred_test, y_test)
print("Acc =", accuracy * 100,"%")

Acc = 78.94736842105263 %
