## Wrangling

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
# Read the raw dataset from disk, using the `id` column as the row index,
# then preview the first rows.
csv_path = 'data/cancer_data.csv'
df = pd.read_csv(csv_path, index_col='id')
df.head(n=5)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_points_max,symmetry_max,fractal_dimension_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Dimensions of the loaded frame as (rows, columns).
n_rows, n_cols = df.shape
(n_rows, n_cols)

(569, 31)

In [4]:
# Keep only the columns whose name contains 'mean' or 'diagnosis'.
is_mean_col = df.columns.str.contains('mean')
is_diagnosis_col = df.columns.str.contains('diagnosis')
df = df.loc[:, is_mean_col | is_diagnosis_col]
df.head(n=5)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [5]:
# Build shorter column names by stripping a trailing '_mean'.
# The check is `endswith` rather than a substring test: slicing off the last
# characters is only correct when '_mean' really is the suffix, so a name
# that merely contains '_mean' in the middle is now left untouched instead
# of being truncated.
suffix = '_mean'
new_label = [
    col[:-len(suffix)] if col.endswith(suffix) else col
    for col in df  # iterating a DataFrame yields its column labels
]
new_label

['diagnosis',
 'radius',
 'texture',
 'perimeter',
 'area',
 'smoothness',
 'compactness',
 'concavity',
 'concave_points',
 'symmetry',
 'fractal_dimension']

In [6]:
# Swap in the shortened column names and preview the relabelled frame.
df.columns = new_label
df.head(n=5)

Unnamed: 0_level_0,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,fractal_dimension
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [7]:
# Count rows that are exact repeats of an earlier row.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

5

In [8]:
# Discard repeated rows (the first occurrence is kept), then confirm that
# no duplicates remain.
df = df.drop_duplicates(keep='first')
remaining_duplicates = df.duplicated()
remaining_duplicates.sum()

0

## Exploring

In [9]:
# Class balance of the target, expressed as a fraction of all rows.
diagnosis_share = df['diagnosis'].value_counts(normalize=True)
diagnosis_share

B    0.62766
M    0.37234
Name: diagnosis, dtype: float64

> Diagnosis labels: **B** = benign (Indonesian: *jinak*), **M** = malignant (Indonesian: *ganas*).

In [10]:
# Encode the target as an integer flag: 'B' -> 0, 'M' -> 1.
# `Series.map` is used instead of `replace`: swapping strings for integers
# via `replace` depends on pandas silently downcasting the object column,
# which newer pandas versions deprecate. With `map` the result is numeric
# directly.
# NOTE: any label other than 'B' or 'M' would become NaN here rather than
# being passed through unchanged.
df['malignant'] = df['diagnosis'].map({'B': 0, 'M': 1})
df['malignant'].value_counts(normalize=True)

0    0.62766
1    0.37234
Name: malignant, dtype: float64

In [11]:
# Separate the encoded target from the feature columns.
y = df['malignant']
x = df.drop(columns=['malignant', 'diagnosis'])

# Hold out 20% of the rows for testing, stratified on the target so both
# splits keep the same class proportions; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((451, 10), (113, 10), (451,), (113,))

In [12]:
# Number of missing values in each feature column of the training split.
x_train.isnull().sum(axis=0)

radius                0
texture              20
perimeter             0
area                  0
smoothness           37
compactness           0
concavity             0
concave_points        0
symmetry             56
fractal_dimension     0
dtype: int64

In [16]:
# Numeric preprocessing: fill missing values with the column mean, then
# rescale with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
numerical_pipeline = Pipeline(steps=numeric_steps)

In [27]:
# Route every feature column through the numeric pipeline.
# The column selector has to be a flat list of column names. Wrapping the
# Index in another list (`[x_train.columns]`) produces a list whose single
# element is an Index, which ColumnTransformer cannot resolve to columns
# when the transformer is fitted.
preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, list(x_train.columns))
])

In [26]:
# Names of the feature columns in the training split.
feature_columns = x_train.columns
feature_columns

Index(['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness',
       'concavity', 'concave_points', 'symmetry', 'fractal_dimension'],
      dtype='object')

In [28]:
# Full modelling pipeline: preprocessing followed by KMeans clustering.
# KMeans starts from randomly chosen centroids, so an unseeded estimator
# can give different clusters on every run. The seed matches the one used
# for the train/test split to keep the notebook reproducible.
# Step names are unchanged, so parameters such as `algo__n_clusters` can
# still be addressed from a parameter grid.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', KMeans(random_state=42))
])

In [None]:
model = 