In [64]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [9]:
# Load the breast cancer table from a CSV expected in the notebook's working directory.
# NOTE(review): the "(1)" suffix looks like a duplicate-download file name — consider renaming the file.
cancer = pd.read_csv(filepath_or_buffer='breast_cancer_dataset (1).csv')

In [11]:
# Preview the first five rows to check that the file was parsed as expected.
cancer.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,severity
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [23]:
cancer.shape # (rows, columns) of the full table, label column included

(569, 31)

In [25]:
# Class balance of the label column: how many benign vs. malignant samples there are.
cancer['severity'].value_counts()

severity
benign       357
malignant    212
Name: count, dtype: int64

In [82]:
# Split the table into the target vector and the feature matrix.
# Naming convention: matrices get an uppercase name (X), vectors a lowercase one (y).
y = cancer.loc[:, 'severity']
X = cancer.drop(columns=['severity'])

In [84]:
# Inspect the raw target: still string labels ('malignant' / 'benign') at this point.
y

0      malignant
1      malignant
2      malignant
3      malignant
4      malignant
         ...    
564    malignant
565    malignant
566    malignant
567    malignant
568       benign
Name: severity, Length: 569, dtype: object

In [86]:
# Encode the string labels as integers for the classifier:
# malignant -> 1 (positive class), benign -> 0.
# Map from the original column rather than from `y` itself so that re-running
# this cell is idempotent (mapping an already-encoded `y` would produce all NaN).
y = cancer['severity'].map({'malignant': 1, 'benign': 0})
# Series.map turns any label missing from the dict into NaN; fail loudly instead
# of silently passing NaN targets to the model.
assert y.notna().all(), "unexpected value in 'severity' column"

In [88]:
# Verify the encoding: the target should now hold integers (1 = malignant, 0 = benign).
y

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: severity, Length: 569, dtype: int64

In [106]:
# Train/test split.
# test_size=0.2 holds out 20% of the rows for testing and keeps 80% for training.
# Rows ARE shuffled before splitting; random_state=0 only fixes the shuffle seed
# so that the same split is reproduced on every run.
# stratify=y keeps the positive/negative class ratio (approximately) equal in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

In [94]:
# Shape of the full table again, for comparison with the train/test shapes.
cancer.shape

(569, 31)

In [108]:
# Training features: expect roughly 80% of the rows and every column except the label.
X_train.shape

(455, 30)

In [110]:
# Test features: the remaining ~20% of the rows, same columns as X_train.
X_test.shape

(114, 30)

In [114]:
# Overall class counts of the encoded target (baseline for checking stratification).
y.value_counts()

severity
0    357
1    212
Name: count, dtype: int64

In [120]:
# Class counts in the training labels; with stratify=y the ratio should mirror the full data.
y_train.value_counts()

severity
0    285
1    170
Name: count, dtype: int64

In [118]:
# Class counts in the test labels; with stratify=y the ratio should mirror the full data.
y_test.value_counts()

severity
0    72
1    42
Name: count, dtype: int64

In [154]:
# Logistic-regression classifier with the iteration cap raised well above the
# library default so the solver has room to converge on this data.
# NOTE(review): per the scikit-learn docs random_state is only used by the
# 'sag', 'saga' and 'liblinear' solvers, so it presumably has no effect with the
# default solver — confirm and drop if unneeded.
log_reg = LogisticRegression(
    random_state=5,
    max_iter=10_000,
)

In [156]:
# Fit the model on the training split.
# A ConvergenceWarning here means the solver hit max_iter before converging
# (scikit-learn's default is 100 iterations); the warning is about convergence,
# not about randomness. Raise max_iter until it disappears — scaling the
# features usually helps as well.
log_reg.fit(X_train, y_train)

In [158]:
log_reg.coef_ # learned weights: one coefficient per feature column (30 here), shape (1, n_features)

array([[-0.82566872, -0.16724423,  0.17095314, -0.02081624,  0.18195813,
         0.20784782,  0.43067346,  0.25624882,  0.23156932,  0.03372222,
         0.05245347, -0.94163764,  0.02473273,  0.10373602,  0.02137183,
        -0.05670744,  0.01988995,  0.02730176,  0.02208924, -0.01289732,
        -0.20838222,  0.36805515,  0.22236064,  0.00954132,  0.32696306,
         0.64780443,  1.25646369,  0.50281753,  0.59432663,  0.09828713]])

In [160]:
log_reg.intercept_ # learned intercept (bias) term b of the linear decision function

array([-28.92537631])

In [166]:
from sklearn.metrics import accuracy_score

In [174]:
# Predicted class labels (0/1) for the same rows the model was trained on.
y_train_pred = log_reg.predict(X_train)
y_train_pred

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,

In [176]:
# Training accuracy: fraction of training rows whose predicted label matches the true label.
accuracy_score(y_train, y_train_pred)

0.9560439560439561

In [180]:
# Predicted class labels (0/1) for the held-out test rows.
y_test_pred = log_reg.predict(X_test)
y_test_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0], dtype=int64)

In [182]:
# Test accuracy on the held-out rows; compare with the training accuracy to gauge overfitting.
accuracy_score(y_test, y_test_pred)

0.956140350877193

In [184]:
# NOTE(review): this re-displays the predictions already shown right after predict(); the cell is redundant.
y_test_pred

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0], dtype=int64)

In [188]:
# Class-membership probabilities for each test row, one column per class in
# log_reg.classes_ order — with the 0/1 encoding used here, column 0 is
# P(benign) and column 1 is P(malignant).
y_test_pred_proba = log_reg.predict_proba(X_test)

In [190]:
# Show the probability matrix; each row should sum to 1.
y_test_pred_proba

array([[9.90530712e-01, 9.46928836e-03],
       [9.26087443e-01, 7.39125568e-02],
       [9.80554883e-01, 1.94451168e-02],
       [9.95564090e-01, 4.43590954e-03],
       [9.99530610e-01, 4.69390076e-04],
       [8.32253991e-06, 9.99991677e-01],
       [9.99067234e-01, 9.32765757e-04],
       [8.59973585e-01, 1.40026415e-01],
       [9.83499458e-01, 1.65005421e-02],
       [9.60070760e-01, 3.99292399e-02],
       [9.99993691e-01, 6.30852317e-06],
       [2.41041986e-07, 9.99999759e-01],
       [4.29591828e-06, 9.99995704e-01],
       [9.95578349e-01, 4.42165080e-03],
       [1.02583996e-08, 9.99999990e-01],
       [9.99278872e-01, 7.21127793e-04],
       [9.99953008e-01, 4.69924940e-05],
       [8.14439226e-01, 1.85560774e-01],
       [9.99632383e-01, 3.67617427e-04],
       [6.64571715e-01, 3.35428285e-01],
       [1.41100060e-02, 9.85889994e-01],
       [4.80592946e-03, 9.95194071e-01],
       [0.00000000e+00, 1.00000000e+00],
       [1.02033966e-06, 9.99998980e-01],
       [8.275496