In [1]:
import pandas as pd
from sklearn import linear_model, model_selection

## Reading a CSV file into a DataFrame using Pandas

In [2]:
# The path is relative, so the CSV is looked up in the kernel's working directory.
DATA_FILE = 'breast_cancer.csv'
breast_cancer_df = pd.read_csv(DATA_FILE)

#### Preprocessing steps

In [3]:
def fill_missing_values(column):
  """Fill the missing entries of one column of ``breast_cancer_df`` with that column's mean.

  Operates on the module-level ``breast_cancer_df`` and updates it in place.

  Parameters
  ----------
  column : hashable
      Label of the column to fill.

  Returns
  -------
  None
  """
  column_mean = breast_cancer_df[column].mean()
  # Assign the filled Series back to the frame. Calling
  # ``breast_cancer_df[column].fillna(..., inplace=True)`` acts on a temporary
  # Series (chained assignment): pandas 2.x warns about it and, with
  # Copy-on-Write enabled, the frame is silently left unchanged.
  breast_cancer_df[column] = breast_cancer_df[column].fillna(column_mean)

In [4]:
def call_fill_missing_values(dictionary):
  """Run ``fill_missing_values`` on every key of ``dictionary`` whose value is truthy.

  Parameters
  ----------
  dictionary : dict
      Maps a column label to a flag; only columns with a truthy flag are filled.
  """
  for column_name, needs_filling in dictionary.items():
    if not needs_filling:
      continue
    fill_missing_values(column_name)

In [5]:
def drop_missing_values():
  """Remove, in place, every row of ``breast_cancer_df`` that has at least one missing value.

  Operates on the module-level ``breast_cancer_df``; returns None.
  """
  # axis=0 / how='any' are the pandas defaults, spelled out for the reader.
  breast_cancer_df.dropna(axis=0, how='any', inplace=True)

In [6]:
def create_dictionary(column):
  """Map each distinct value of ``column`` to an integer code.

  Codes start at 0 and follow the order in which ``column.unique()`` returns
  the values.

  Parameters
  ----------
  column : object exposing ``.unique()`` (e.g. a pandas Series)

  Returns
  -------
  dict
      ``{distinct_value: integer_code}``
  """
  return {value: code for code, value in enumerate(column.unique())}

###### Replacing non-numeric values with numeric values

In [7]:
# Build the {label: integer code} mapping from the diagnosis column and
# show it as the cell output.
diagnosis_column = breast_cancer_df['diagnosis']
diagnosis = create_dictionary(diagnosis_column)
diagnosis

{'B': 1, 'M': 0}

In [8]:
# Swap the diagnosis labels for their integer codes.
# The result is assigned back to the frame: calling
# ``breast_cancer_df.diagnosis.replace(..., inplace=True)`` mutates a temporary
# Series (chained assignment), which pandas 2.x warns about and which leaves
# the frame unchanged once Copy-on-Write is enabled.
breast_cancer_df['diagnosis'] = breast_cancer_df['diagnosis'].replace(diagnosis)
breast_cancer_df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


###### Option 1: drop the rows that contain missing values (run either option 1 or option 2, not both)

In [9]:
# Drops, in place, every row of breast_cancer_df that has at least one missing value.
drop_missing_values()

###### Option 2: fill the missing values with the column mean (run either option 1 or option 2, not both)

In [9]:
# Map every column name to True when that column holds at least one missing value.
empty_value_column_dictionary = dict(breast_cancer_df.isna().any())
empty_value_column_dictionary

{'area_mean': True,
 'area_se': True,
 'area_worst': True,
 'compactness_mean': True,
 'compactness_se': False,
 'compactness_worst': False,
 'concave points_mean': True,
 'concave points_se': True,
 'concave points_worst': False,
 'concavity_mean': False,
 'concavity_se': True,
 'concavity_worst': False,
 'diagnosis': False,
 'fractal_dimension_mean': True,
 'fractal_dimension_se': False,
 'fractal_dimension_worst': False,
 'id': False,
 'perimeter_mean': True,
 'perimeter_se': False,
 'perimeter_worst': False,
 'radius_mean': True,
 'radius_se': True,
 'radius_worst': False,
 'smoothness_mean': False,
 'smoothness_se': True,
 'smoothness_worst': False,
 'symmetry_mean': False,
 'symmetry_se': False,
 'symmetry_worst': False,
 'texture_mean': True,
 'texture_se': True,
 'texture_worst': False}

In [10]:
# Mean-fill every column that is flagged True in empty_value_column_dictionary.
call_fill_missing_values(empty_value_column_dictionary )

In [11]:

# Per-column count of the missing values still present in the frame.
remaining_missing_counts = breast_cancer_df.isna().sum()
remaining_missing_counts

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

## Splitting the data into inputs/features (x) and output/target (y)

In [12]:
# Every column except 'id' and 'diagnosis' is kept as a model input.
# The variable name is left as spelled because the train/test split cell refers to it.
non_feature_columns = ['id', 'diagnosis']
is_feature_column = ~breast_cancer_df.columns.isin(non_feature_columns)
feautres = breast_cancer_df.loc[:, is_feature_column]
feautres.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [13]:
# Select the target with a boolean column mask, so `labels` is a
# single-column DataFrame rather than a Series.
is_target_column = breast_cancer_df.columns == 'diagnosis'
labels = breast_cancer_df.loc[:, is_target_column]
labels.head()

Unnamed: 0,diagnosis
0,0
1,0
2,0
3,0
4,0


In [14]:
# NOTE(review): test_size=0.85 holds out 85% of the rows for testing and trains on
# only 15% of them — confirm this is intended rather than train_size=0.85.
(features_train,
 features_test,
 labels_train,
 labels_test) = model_selection.train_test_split(
    feautres,
    labels,
    test_size=0.85,
    stratify=labels,    # split in a stratified fashion, using the labels as class labels
    random_state=42,    # fixed seed so the split is reproducible
)

## Picking the appropriate linear model type for this kind of problem

In [19]:
# Logistic regression with an L2 penalty.
model = linear_model.LogisticRegression(
    C=12,             # inverse of the regularisation strength
    penalty='l2',
    max_iter=2000,    # upper bound on solver iterations
    random_state=42,
)

## Training the model

In [20]:
# Fit on the training split.
# labels_train is a single-column DataFrame; passing it directly makes
# scikit-learn emit a DataConversionWarning (the `column_or_1d` message).
# Flattening it to shape (n_samples,) gives the estimator the 1-D target it expects.
model.fit(features_train, labels_train.values.ravel())

  y = column_or_1d(y, warn=True)


LogisticRegression(C=12, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
# Predicted class code for every row of the held-out test features.
test_predictions = model.predict(features_test)
test_predictions

array([1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,

## Evaluating model accuracy (should be above 90% or 0.90)

In [26]:
# when dropping missing values (using option 1)
# Mean accuracy of the fitted model on each split.
train_accuracy = model.score(features_train, labels_train)
test_accuracy = model.score(features_test, labels_test)
print("model's train accuracy: ", train_accuracy)
print("model's test accuracy: ", test_accuracy)

model's train accuracy:  1.0
model's test accuracy:  0.921443736730361


In [22]:
# when filling missing values with the column mean value (using option 2)
# Mean accuracy of the fitted model on each split, train first and then test.
for caption, split_features, split_labels in (
    ("model's train accuracy: ", features_train, labels_train),
    ("model's test accuracy: ", features_test, labels_test),
):
    print(caption, model.score(split_features, split_labels))

model's train accuracy:  0.9882352941176471
model's test accuracy:  0.9483471074380165
