### Inspect and preprocess dataset

In [1]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV into a DataFrame, taking the column names from its first row.
csv_path = 'breast-cancer-wisconsin-data.csv'
data = pd.read_csv(csv_path, header=0)

In [3]:
# Overview of the loaded frame: its dimensions, the first and last rows,
# and the column index.
n_rows, n_cols = data.shape
print('The dataset has', n_rows, 'rows and', n_cols, 'columns')
print(data.head(), data.tail(), sep='\n')
print('The column names are:', data.columns)

The dataset has 569 rows and 32 columns
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  

In [4]:
# Inspect structure of the dataframe
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would append a stray "None"
# line after the report.
print('Structure of the dataframe:')
data.info()

Structure of the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14

In [5]:
# Check for missing values
# Count the NaN entries in every column; a column showing 0 has nothing missing.
missing_per_column = data.isna().sum()
print('Confirm missing values information:', missing_per_column, sep='\n')

Confirm missing values information:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [6]:
# Drop columns that consist entirely of NaN values and confirm the operation.
# how='all' restricts the drop to fully empty columns; the default (how='any')
# would also discard a real feature column that has just one missing entry.
# NOTE(review): in the run recorded in this notebook no column contains NaN,
# so this step leaves the frame unchanged — confirm whether other copies of
# the CSV carry an empty trailing column before relying on it.
print('Confirm drop operation of missing values:')
data.dropna(axis=1, how='all', inplace=True)
print(data.isna().sum())

Confirm drop operation of missing values:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [7]:
# Report the dtype pandas assigned to each column.
print('Data types of the features', data.dtypes, sep='\n')

Data types of the features
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          f

In [8]:
# The 'id' column can be dropped; it is removed from `data` in place.
# Of the remaining columns one is categorical -- the target, 'diagnosis' --
# and all the others are numerical.
data.drop(columns=['id'], inplace=True)

In [9]:
# Inspect the target
# Show the raw class counts of 'diagnosis', then each class's share of all rows
# expressed as a percentage rounded to two decimals.
print('Unique values and count of the target:')
target_count = data['diagnosis'].value_counts()
print(target_count)

n_samples = data.shape[0]
is_benign = data['diagnosis'] == 'B'
is_malignant = data['diagnosis'] == 'M'

# Summing a boolean mask counts its True entries.
countB = is_benign.sum()
countM = is_malignant.sum()

percentage_benign = (countB / n_samples) * 100
percentage_malignant = (countM / n_samples) * 100
print(round(percentage_benign, 2), 'of the samples correspond to a benign diagnosis')
print(round(percentage_malignant, 2), 'of the samples correspond to a malignant diagnosis')

Unique values and count of the target:
B    357
M    212
Name: diagnosis, dtype: int64
62.74 of the samples correspond to a benign diagnosis
37.26 of the samples correspond to a malignant diagnosis


In [10]:
# Data quality: check for values less than zero among the features
# X holds every column except the target; negatives are counted per column
# and then totalled over the whole frame.
X = data.drop(columns='diagnosis')
negative_total = X.lt(0).sum().sum()
print('Number of values less than zero:', negative_total)

# Slice `data` by position into three consecutive groups of ten columns,
# starting at column 1, 11 and 21 respectively.
subset_mean, subset_se, subset_worst = (
    data.iloc[:, first:first + 10] for first in (1, 11, 21)
)

Number of values less than zero: 0


In [11]:
# Descriptive statistics for the "mean" subset, followed by its (rows, columns) shape.
print(
    'Summary statistics of the numerical variables in subset "mean":',
    subset_mean.describe(),
    subset_mean.shape,
    sep='\n',
)

Summary statistics of the numerical variables in subset "mean":
       radius_mean  texture_mean  perimeter_mean    area_mean  \
count   569.000000    569.000000      569.000000   569.000000   
mean     14.127292     19.289649       91.969033   654.889104   
std       3.524049      4.301036       24.298981   351.914129   
min       6.981000      9.710000       43.790000   143.500000   
25%      11.700000     16.170000       75.170000   420.300000   
50%      13.370000     18.840000       86.240000   551.100000   
75%      15.780000     21.800000      104.100000   782.700000   
max      28.110000     39.280000      188.500000  2501.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813        0.079720             0.038803   
min           0.052630    

In [12]:
# Descriptive statistics for the "standard error" subset, followed by its (rows, columns) shape.
print(
    'Summary statistics of the numerical variables in subset "standard error":',
    subset_se.describe(),
    subset_se.shape,
    sep='\n',
)

Summary statistics of the numerical variables in subset "standard error":
        radius_se  texture_se  perimeter_se     area_se  smoothness_se  \
count  569.000000  569.000000    569.000000  569.000000     569.000000   
mean     0.405172    1.216853      2.866059   40.337079       0.007041   
std      0.277313    0.551648      2.021855   45.491006       0.003003   
min      0.111500    0.360200      0.757000    6.802000       0.001713   
25%      0.232400    0.833900      1.606000   17.850000       0.005169   
50%      0.324200    1.108000      2.287000   24.530000       0.006380   
75%      0.478900    1.474000      3.357000   45.190000       0.008146   
max      2.873000    4.885000     21.980000  542.200000       0.031130   

       compactness_se  concavity_se  concave points_se  symmetry_se  \
count      569.000000    569.000000         569.000000   569.000000   
mean         0.025478      0.031894           0.011796     0.020542   
std          0.017908      0.030186           

In [13]:
# Descriptive statistics for the "worst" subset, followed by its (rows, columns) shape.
print(
    'Summary statistics of the numerical variables in subset "worst":',
    subset_worst.describe(),
    subset_worst.shape,
    sep='\n',
)

Summary statistics of the numerical variables in subset "worst":
       radius_worst  texture_worst  perimeter_worst   area_worst  \
count    569.000000     569.000000       569.000000   569.000000   
mean      16.269190      25.677223       107.261213   880.583128   
std        4.833242       6.146258        33.602542   569.356993   
min        7.930000      12.020000        50.410000   185.200000   
25%       13.010000      21.080000        84.110000   515.300000   
50%       14.970000      25.410000        97.660000   686.500000   
75%       18.790000      29.720000       125.400000  1084.000000   
max       36.040000      49.540000       251.200000  4254.000000   

       smoothness_worst  compactness_worst  concavity_worst  \
count        569.000000         569.000000       569.000000   
mean           0.132369           0.254265         0.272188   
std            0.022832           0.157336         0.208624   
min            0.071170           0.027290         0.000000   
25%    

In [14]:
# Standardize data
# Separate the feature columns (everything after the first column) from the
# target column 'diagnosis'.
data_features = data.iloc[:,1:]
print(data_features)
target = data['diagnosis']
print(target)

# NOTE(review): the scaler is fitted on every row of the frame. If a
# train/test split is made later, consider fitting on the training rows only.
ss = StandardScaler()
data_features_ss = ss.fit_transform(data_features)
print(data_features_ss)

# fit_transform returns a plain NumPy array, so re-attach the original column
# names and row index before joining it with the target. The previous version
# concatenated the *unscaled* `data_features`, which silently discarded the
# standardization and left `data` with the raw values.
data_features_scaled = pd.DataFrame(
    data_features_ss,
    columns=data_features.columns,
    index=data_features.index,
)
data = pd.concat([target, data_features_scaled], axis=1)
# head must be called; printing `data.head` shows the bound method's repr.
print(data.head())

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  sym

In [15]:
# Pickle data
# Serialize the frame to a file named 'data' (relative path).
pickle_path = 'data'
data.to_pickle(pickle_path)