In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [40]:
# load dataset
dataset = pd.read_csv('data/breast_cancer_data.csv')
dataset.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [41]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB


In [42]:
dataset.describe()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
count,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.0
50%,13.37,18.84,86.24,551.1,0.09587,1.0
75%,15.78,21.8,104.1,782.7,0.1053,1.0
max,28.11,39.28,188.5,2501.0,0.1634,1.0


In [59]:
dataset.diagnosis.value_counts()

1    357
0    212
Name: diagnosis, dtype: int64

In [60]:
X = dataset.drop(['diagnosis'], axis=1)
y = dataset['diagnosis']

In [62]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

In [63]:
X, y = oversample.fit_resample(X, y)

In [67]:
y.value_counts()

0    357
1    357
Name: diagnosis, dtype: int64

In [47]:
# scaling features 
sc = MinMaxScaler(feature_range=(0,1))
X_scaled = sc.fit_transform(X)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [68]:
# model fit
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

In [69]:
print('Accuracy Score on train data: ', accuracy_score(y_true=y_train, y_pred=log_model.predict(X_train)))
print('Accuracy Score on the test data: ', accuracy_score(y_true=y_test, y_pred=log_model.predict(X_test)))

Accuracy Score on train data:  0.9176882661996497
Accuracy Score on the test data:  0.8951048951048951


In [70]:
# saving model as a pickle
import pickle
pickle.dump(log_model,open("ml_model.sav", "wb"))
pickle.dump(sc, open("scaler.sav", "wb"))