In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def disp(df, rows = 3):
    """Print the shape of ``df`` and render its first ``rows`` rows.

    Parameters
    ----------
    df : object exposing ``.shape`` and ``.head(n)``
        In this notebook, a pandas DataFrame.
    rows : int, default 3
        Number of leading rows requested from ``df.head``.

    Notes
    -----
    Uses the ``display`` callable that IPython/Jupyter makes available in
    notebook sessions; nothing is returned.
    """
    dimensions = df.shape
    print(dimensions)
    preview = df.head(rows)
    display(preview)

In [3]:
def show_image(image_name = 'ISIC_0027419', image_dir = 'skin-cancer-mnist-ham10000/ims/'):
    """Read ``<image_dir>/<image_name>.jpg`` and draw it with ``plt.imshow``.

    Parameters
    ----------
    image_name : str, default 'ISIC_0027419'
        File name without the ``.jpg`` extension.
    image_dir : str, default 'skin-cancer-mnist-ham10000/ims/'
        Directory holding the JPEG files. Exposed as a parameter so the
        function is not tied to one dataset location; the default keeps
        existing calls working unchanged.

    Returns
    -------
    None
        The image is drawn on the current matplotlib axes as a side effect.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # os.path.join copes with image_dir given with or without a trailing separator.
    image_path = os.path.join(image_dir, image_name + '.jpg')
    image = plt.imread(image_path)
    plt.imshow(image)

In [4]:
# Read the metadata CSV into a DataFrame and preview its first rows.
meta = pd.read_csv(filepath_or_buffer='skin-cancer-mnist-ham10000/HAM10000_metadata.csv')
disp(meta, rows=3)

(10015, 7)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp


In [5]:
# Number of metadata rows for each value of the `dx` column.
meta['dx'].value_counts()

nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: dx, dtype: int64

In [6]:
# How many rows repeat an earlier lesion_id / image_id (printed in that order).
print(*(meta[column].duplicated().sum() for column in ('lesion_id', 'image_id')))

2545 0


In [7]:
# Keep a single row per lesion_id (pandas retains the first occurrence by
# default), then recount the rows per `dx` value.
uniq_meta = meta.drop_duplicates(subset='lesion_id')
uniq_meta['dx'].value_counts()

nv       5403
bkl       727
mel       614
bcc       327
akiec     228
vasc       98
df         73
Name: dx, dtype: int64

In [8]:
# Build a class-balanced 'nv' vs 'mel' subset: every 'mel' row of uniq_meta plus
# the same number of 'nv' rows. The 'nv' rows are the first ones in table order,
# not a random sample.
is_mel = uniq_meta.dx == 'mel'
is_nv = uniq_meta.dx == 'nv'
# Derive the per-class size from the data rather than hard-coding it, so the
# subset stays balanced if the metadata file changes.
n_per_class = int(is_mel.sum())

data = uniq_meta.loc[is_nv, :]
data = data.iloc[0:n_per_class, :]
data = pd.concat((data, uniq_meta.loc[is_mel, :]))
# Fresh 0..n-1 index for the combined frame.
data = data.reset_index(drop = True)
# Placeholder column, cast to object dtype so each cell can later hold an
# arbitrary Python object.
data.loc[:, 'pixel_values'] = np.nan
data['pixel_values'] = data['pixel_values'].astype(object)
disp(data)

(1228, 8)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,pixel_values
0,HAM_0001751,ISIC_0024698,nv,consensus,70.0,male,face,
1,HAM_0000559,ISIC_0024693,nv,follow_up,45.0,female,upper extremity,
2,HAM_0004932,ISIC_0032212,nv,follow_up,45.0,female,foot,


In [9]:
# Read each row's JPEG and store the decoded array in that row's 'pixel_values' cell.
# Iterate over the Series' own (index label, image_id) pairs: `.at` addresses rows
# by label, so using enumerate() positions would write to the wrong rows (or append
# new ones) whenever the index is not exactly 0..n-1.
for ix, image in data.image_id.items():
    # '\r' rewrites the same output line, giving a simple in-place progress counter.
    print('\r' + str(ix), end='')
    im = plt.imread('skin-cancer-mnist-ham10000/ims/' + image + '.jpg')
    data.at[ix, 'pixel_values'] = im

1227

In [10]:
def categorize(dx):
    """Map a diagnosis code to its two-element one-hot label.

    Parameters
    ----------
    dx : str
        Diagnosis code; only 'nv' and 'mel' are supported.

    Returns
    -------
    numpy.ndarray
        ``[1, 0]`` for 'nv' and ``[0, 1]`` for 'mel'. A new array is created
        on every call, so callers may modify the result freely.

    Raises
    ------
    ValueError
        If ``dx`` is any other value. Failing here is deliberate: an implicit
        ``None`` label would only surface later as a confusing error when the
        labels are stacked into an array.
    """
    if dx == 'nv':
        return np.array([1,0])
    if dx == 'mel':
        return np.array([0,1])
    raise ValueError("categorize() only supports 'nv' and 'mel', got {!r}".format(dx))

In [11]:
# Attach the one-hot label produced by categorize() for each row's `dx` value.
data.loc[:, 'y'] = data['dx'].apply(categorize)

In [12]:
# Hold out the same number of rows from each class for testing.
# NOTE: the per-class size is 10% of the *whole* frame, so with two classes the
# held-out set is roughly 20% of all rows.
n_test_per_class = int(len(data) * 0.1)

# Draw from a dedicated, seeded generator. Without a seed the split changes on
# every run, and re-running this cell would add further 'test' rows on top of
# the previous draw; with a fixed seed the same rows are selected each time.
split_rng = np.random.RandomState(123)
mel_test = split_rng.choice(data.loc[data.dx=='mel'].index, size=n_test_per_class, replace = False)
nv_test = split_rng.choice(data.loc[data.dx=='nv'].index, size=n_test_per_class, replace = False)
test_ix = np.concatenate((mel_test, nv_test))
# Only held-out rows are labelled; every other row counts as training data.
data.loc[test_ix, 'split'] = 'test'

In [13]:
# Shape of the array of held-out index labels.
np.shape(test_ix)

(244,)

In [14]:
# Preview the frame now that it carries the extra columns.
disp(data, rows=3)

(1228, 10)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,pixel_values,y,split
0,HAM_0001751,ISIC_0024698,nv,consensus,70.0,male,face,"[[[163, 137, 166], [163, 137, 162], [164, 138,...","[1, 0]",
1,HAM_0000559,ISIC_0024693,nv,follow_up,45.0,female,upper extremity,"[[[229, 146, 164], [229, 146, 162], [233, 149,...","[1, 0]",
2,HAM_0004932,ISIC_0032212,nv,follow_up,45.0,female,foot,"[[[235, 157, 155], [236, 157, 153], [237, 159,...","[1, 0]",


In [15]:
# Boolean mask for the held-out rows; every other row (including rows whose
# `split` is missing) is treated as training data.
is_test = data['split'] == 'test'

# Each cell of 'pixel_values' / 'y' holds one array; combine them into a single
# array per split with a leading sample axis.
x_tr = np.array(list(data.loc[~is_test, 'pixel_values'].values))
y_tr = np.array(list(data.loc[~is_test, 'y'].values))
x_ts = np.array(list(data.loc[is_test, 'pixel_values'].values))
y_ts = np.array(list(data.loc[is_test, 'y'].values))

# Convert the inputs to float32 and divide by 255 in place
# (presumably 8-bit pixel values being scaled to [0, 1] — confirm against the images).
x_tr = x_tr.astype('float32')
x_tr /= 255
x_ts = x_ts.astype('float32')
x_ts /= 255

x_tr.shape, y_tr.shape, x_ts.shape, y_ts.shape

((984, 450, 600, 3), (984, 2), (244, 450, 600, 3), (244, 2))

In [16]:
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

Using TensorFlow backend.


In [23]:
np.random.seed(123)

# Layer stack declared in one place: two 3x3 convolutions, one pooling step,
# then a dense head ending in a 2-way softmax.
model = Sequential([
    Conv2D(32, kernel_size=3, activation='relu',
           data_format="channels_last", input_shape=x_tr.shape[1:]),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(10,10)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])

In [24]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Train on the training arrays for 5 epochs in batches of 30.
model.fit(x=x_tr, y=y_tr, batch_size=30, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fca5bd1e5c0>

In [26]:
# Evaluate the trained model on the held-out arrays.
score = model.evaluate(x=x_ts, y=y_ts, verbose=2)

In [27]:
# Show the value returned by model.evaluate on the held-out arrays; presumably
# [loss, accuracy] given the compile settings — confirm with model.metrics_names.
score

[0.16660494479488153, 0.926229507219596]