In [1]:
import numpy as np
import pandas as pd

In [2]:
dataset = pd.read_csv("breastcancer.csv")
dataset.head()

Unnamed: 0,Sample code number,Clump thickness,Uniformity of cell size,Uniformity of cell shape,Marginal Adhesion,Single Epithelial cell,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


In [3]:
dataset.describe()

Unnamed: 0,Sample code number,Clump thickness,Uniformity of cell size,Uniformity of cell shape,Marginal Adhesion,Single Epithelial cell,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [4]:
dataset['Bare Nuclei'].isna().sum()

16

In [5]:
dataset.dtypes

Sample code number            int64
Clump thickness               int64
Uniformity of cell size       int64
Uniformity of cell shape      int64
Marginal Adhesion             int64
Single Epithelial cell        int64
Bare Nuclei                 float64
Bland Chromatin               int64
Normal Nucleoli               int64
Mitosis                       int64
Class                         int64
dtype: object

In [6]:
dataset['Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

In [7]:
X = dataset.iloc[:, 1:-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()
X[23:25, :]

array([[ 8.,  4.,  5.,  1.,  2., nan,  7.,  3.,  1.],
       [ 1.,  1.,  1.,  1.,  2.,  1.,  3.,  1.,  1.]])

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [9]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
X_train = imputer.fit_transform(X_train)
X_train[23:25,:]

array([[2., 3., 1., 1., 5., 1., 1., 1., 1.],
       [1., 1., 1., 1., 5., 1., 3., 1., 1.]])

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
#Feature scaling is not required unlike for svr, but it will improve performance in this case

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train) #lower the C parameter more regularized the model (less overfitting)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
#Prepping test data
X_test = imputer.transform(X_test)
X_test = sc.transform(X_test)

In [13]:
y_pred = classifier.predict(X_test)

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[109   3]
 [  3  60]]


In [15]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)

0.962797619047619

In [16]:
#Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy_score(y_test, y_pred)

0.9657142857142857

In [17]:
#Sensitivity = TP / (TP + FN) ability to determine the patient cases (malignant tumors) correctly #Recall
60/(60+3)

0.9523809523809523

In [18]:
#Specificity = TN / (TN + FP)  ability to determine the healthy cases (benign tumors) correctly
109/(109+3)

0.9732142857142857

In [19]:
#Precision = TP / (TP + FP) the number of correct positive predictions made
60/(60+3)

0.9523809523809523