Importing the Dependencies

In [14]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Processing

In [15]:
breast_cancer_dataset = pd.read_csv('breast-cancer_csv.csv')# loading the data from sklearn


In [16]:
print(breast_cancer_dataset)

       age menopause tumor-size inv-nodes node-caps  deg-malig breast  \
0    40-49   premeno      15-19       0-2       yes          3  right   
1    50-59      ge40      15-19       0-2        no          1  right   
2    50-59      ge40      35-39       0-2        no          2   left   
3    40-49   premeno      35-39       0-2       yes          3  right   
4    40-49   premeno      30-34       3-5       yes          2   left   
..     ...       ...        ...       ...       ...        ...    ...   
281  50-59      ge40      30-34       6-8       yes          2   left   
282  50-59   premeno      25-29       3-5       yes          2   left   
283  30-39   premeno      30-34       6-8       yes          2  right   
284  50-59   premeno      15-19       0-2        no          2  right   
285  50-59      ge40      40-44       0-2        no          3   left   

    breast-quad irradiat                 Class  
0       left_up       no     recurrence-events  
1       central       no 

In [17]:
# loading the data to a data frame
data_frame = pd.DataFrame(breast_cancer_dataset)

In [18]:
# print the first 5 rows of the dataframe
breast_cancer_dataset.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,40-49,premeno,15-19,0-2,yes,3,right,left_up,no,recurrence-events
1,50-59,ge40,15-19,0-2,no,1,right,central,no,no-recurrence-events
2,50-59,ge40,35-39,0-2,no,2,left,left_low,no,recurrence-events
3,40-49,premeno,35-39,0-2,yes,3,right,left_low,yes,no-recurrence-events
4,40-49,premeno,30-34,3-5,yes,2,left,right_up,no,recurrence-events


In [19]:
# adding the 'target' column to the data frame
breast_cancer_dataset['label'] = breast_cancer_dataset['deg-malig']

In [20]:
# print last 5 rows of the dataframe
breast_cancer_dataset.tail()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class,label
281,50-59,ge40,30-34,6-8,yes,2,left,left_low,no,no-recurrence-events,2
282,50-59,premeno,25-29,3-5,yes,2,left,left_low,yes,no-recurrence-events,2
283,30-39,premeno,30-34,6-8,yes,2,right,right_up,no,no-recurrence-events,2
284,50-59,premeno,15-19,0-2,no,2,right,left_low,no,no-recurrence-events,2
285,50-59,ge40,40-44,0-2,no,3,left,right_up,no,no-recurrence-events,3


In [21]:
# number of rows and columns in the dataset
breast_cancer_dataset.shape

(286, 11)

In [22]:
# getting some information about the data
breast_cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumor-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    int64 
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiat     286 non-null    object
 9   Class        286 non-null    object
 10  label        286 non-null    int64 
dtypes: int64(2), object(9)
memory usage: 24.7+ KB


In [23]:
# checking for missing values
breast_cancer_dataset.isnull().sum()

age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
Class          0
label          0
dtype: int64

In [24]:
# statistical measures about the data
breast_cancer_dataset.describe()

Unnamed: 0,deg-malig,label
count,286.0,286.0
mean,2.048951,2.048951
std,0.738217,0.738217
min,1.0,1.0
25%,2.0,2.0
50%,2.0,2.0
75%,3.0,3.0
max,3.0,3.0


In [25]:
# checking the distribution of Target Varibale
breast_cancer_dataset['label'].value_counts()

2    130
3     85
1     71
Name: label, dtype: int64

1 --> Benign

0 --> Malignant

In [26]:
breast_cancer_dataset.groupby('label').mean()

  breast_cancer_dataset.groupby('label').mean()


Unnamed: 0_level_0,deg-malig
label,Unnamed: 1_level_1
1,1.0
2,2.0
3,3.0


Separating the features and target

In [27]:
X = breast_cancer_dataset.drop(columns='label', axis=1)
Y = breast_cancer_dataset['label']

In [28]:
print(X)

       age menopause tumor-size inv-nodes node-caps  deg-malig breast  \
0    40-49   premeno      15-19       0-2       yes          3  right   
1    50-59      ge40      15-19       0-2        no          1  right   
2    50-59      ge40      35-39       0-2        no          2   left   
3    40-49   premeno      35-39       0-2       yes          3  right   
4    40-49   premeno      30-34       3-5       yes          2   left   
..     ...       ...        ...       ...       ...        ...    ...   
281  50-59      ge40      30-34       6-8       yes          2   left   
282  50-59   premeno      25-29       3-5       yes          2   left   
283  30-39   premeno      30-34       6-8       yes          2  right   
284  50-59   premeno      15-19       0-2        no          2  right   
285  50-59      ge40      40-44       0-2        no          3   left   

    breast-quad irradiat                 Class  
0       left_up       no     recurrence-events  
1       central       no 

In [29]:
print(Y)

0      3
1      1
2      2
3      3
4      2
      ..
281    2
282    2
283    2
284    2
285    3
Name: label, Length: 286, dtype: int64


Splitting the data into training data & Testing data

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [31]:
print(X.shape, X_train.shape, X_test.shape)

(286, 10) (228, 10) (58, 10)


Model Training

Logistic Regression

In [32]:
model = LogisticRegression()

In [33]:
# training the Logistic Regression model using Training data
model.fit(X_train, Y_train)

ValueError: could not convert string to float: '60-69'

Model Evaluation

Accuracy Score

In [None]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
print('Accuracy on training data = ', training_data_accuracy)

Accuracy on training data =  0.9494505494505494


In [None]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
print('Accuracy on test data = ', test_data_accuracy)

Accuracy on test data =  0.9210526315789473


Building a Predictive System

In [None]:
input_data = (13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for one datapoint
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The Breast cancer is Malignant')

else:
  print('The Breast Cancer is Benign')



[1]
The Breast Cancer is Benign




In [None]:
import pickle

In [None]:
filename = 'Breast_cancer_detection.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
loaded_model = pickle.load(open('Breast_cancer_detection.sav', 'rb'))

In [None]:
for column in X.columns:
  print(column)

mean radius
mean texture
mean perimeter
mean area
mean smoothness
mean compactness
mean concavity
mean concave points
mean symmetry
mean fractal dimension
radius error
texture error
perimeter error
area error
smoothness error
compactness error
concavity error
concave points error
symmetry error
fractal dimension error
worst radius
worst texture
worst perimeter
worst area
worst smoothness
worst compactness
worst concavity
worst concave points
worst symmetry
worst fractal dimension
