### Logistic Regression to predict Breast Cancer

### Steps

### 1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
%matplotlib inline

### 2. Read the csv data file

In [2]:
# Load the dataset from the CSV file in the working directory.
train = pd.read_csv(filepath_or_buffer='bc.csv')

# Preview the first five rows.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,1,5,1,1,1,2,1,3,1,1,0
1,2,5,4,4,5,7,10,3,2,1,0
2,3,3,1,1,1,2,2,3,1,1,0
3,4,6,8,8,1,3,4,3,7,1,0
4,5,4,1,1,3,2,1,3,1,1,0


In [87]:
# (rows, columns) of the loaded frame.
train.shape

(683, 11)

### 3. Get the information about all the columns of the data file

In [88]:
# Column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 11 columns):
Unnamed: 0         683 non-null int64
Cl.thickness       683 non-null int64
Cell.size          683 non-null int64
Cell.shape         683 non-null int64
Marg.adhesion      683 non-null int64
Epith.c.size       683 non-null int64
Bare.nuclei        683 non-null int64
Bl.cromatin        683 non-null int64
Normal.nucleoli    683 non-null int64
Mitoses            683 non-null int64
Class              683 non-null int64
dtypes: int64(11)
memory usage: 58.8 KB


### 4. Drop the `Unnamed: 0` column, which is not needed

In [89]:
# Drop the leftover 'Unnamed: 0' column by name rather than by position.
# A positional drop reassigned to `train` is not idempotent: re-running the
# cell would remove the next real feature column. With errors='ignore' a
# second run is a no-op.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [90]:
# Preview the frame after the column drop.
train.head()

Unnamed: 0,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


### 5. Count the number of occurrences of each class (0 = Benign, 1 = Malignant)

In [91]:
# Class balance: 0 = benign, 1 = malignant.
train['Class'].value_counts()

0    444
1    239
Name: Class, dtype: int64

### 6. Downsample the majority class to the size of the minority class

In [5]:
# Split the rows by label: Class 0 (benign) is the majority class,
# Class 1 (malignant) the minority class.
class_majority = train[train.Class == 0]
class_minority = train[train.Class == 1]

# Draw as many majority rows as there are minority rows, without replacement.
# `replace` must be the boolean False: the string 'False' is a non-empty
# (truthy) string, not the boolean, so it never requested sampling without
# replacement. The fixed random_state (same value as the train/test split)
# makes the subsample reproducible on Restart & Run All.
downsample = resample(
    class_majority,
    replace=False,
    n_samples=len(class_minority),
    random_state=77,
)

# Balanced frame: subsampled majority rows plus all minority rows.
downsampled_data = pd.concat([downsample, class_minority])

# Sanity check: both classes should now have the same count.
downsampled_data.Class.value_counts()


1    239
0    239
Name: Class, dtype: int64

### 7. Upsample the minority class to the size of the majority class

In [6]:
# Draw minority rows with replacement until they match the majority count.
# `replace` is passed as the boolean True; the string 'True' only behaved
# like it because any non-empty string is truthy. The fixed random_state
# makes the resample reproducible on Restart & Run All.
upsample = resample(
    class_minority,
    replace=True,
    n_samples=len(class_majority),
    random_state=77,
)

# Balanced frame: resampled minority rows plus all majority rows.
# NOTE(review): upsampled_data is not used by the later cells visible here.
upsampled_data = pd.concat([upsample, class_majority])

# Sanity check: both classes should now have the same count.
upsampled_data.Class.value_counts()

1    444
0    444
Name: Class, dtype: int64

### 8. Split the downsampled data into training and test sets

In [7]:
from sklearn.model_selection import train_test_split

# Features are every column except 'Class'; the target is 'Class'.
# No test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    downsampled_data.drop(columns='Class'),
    downsampled_data['Class'],
    random_state=77,
)

### 9. Logistic Regression model

In [170]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training split.
# fit() returns the estimator itself, so construction and training chain.
logmodel = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
predictions = logmodel.predict(X_test)



### 10. Calculate the accuracy of the model

In [171]:
from sklearn.metrics import accuracy_score

In [172]:
# accuracy_score's signature is (y_true, y_pred): ground-truth labels first.
# The accuracy value is the same either way, but the documented order keeps
# this call correct if it is later swapped for an asymmetric metric.
score = accuracy_score(y_test, predictions)

In [173]:
# Report the accuracy on the test split as a percentage.
print("Accuracy: ",score*100)

Accuracy:  98.33333333333333
