In [18]:
import pandas as pd
import numpy as np
import os
import shutil
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [19]:

df_data = pd.read_csv('train_labels.csv')
df_data[df_data['id'] != 'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2']
df_data[df_data['id'] != '9369c7278ec8bcc6c880d99194de09fc2bd4efbe']
print(df_data.shape)

(220025, 2)


In [20]:
df_data.head()

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


In [21]:
df_data['label'].value_counts()

0    130908
1     89117
Name: label, dtype: int64

In [22]:
SAMPLE_SIZE = 89117

df_0 = df_data[df_data['label'] == 0].sample(SAMPLE_SIZE, random_state = 101)
df_1 = df_data[df_data['label'] == 1].sample(SAMPLE_SIZE, random_state = 101)

df_data = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)
df_data = shuffle(df_data)

df_data['label'].value_counts()
y = df_data['label']

df_train, df_val = train_test_split(df_data, test_size=0.10, random_state=101, stratify=y)

print(df_train.shape)
print(df_val.shape)

df_train['label'].value_counts()
df_val['label'].value_counts()

(160410, 2)
(17824, 2)


1    8912
0    8912
Name: label, dtype: int64

In [15]:
# create train_dir and validation_dir

base_dir = 'data/base_dir'
os.mkdir(base_dir)

# train_dir
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

# val_dir
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)


# create new folders inside train_dir
no_tumor_tissue = os.path.join(train_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(train_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)


# create new folders inside val_dir
no_tumor_tissue = os.path.join(val_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(val_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)

print(os.listdir('data/base_dir'))

['val_dir', 'train_dir']


In [16]:
df_data.set_index('id', inplace=True)
df_data.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
c60161c656a2ff6bd7756bb38a5fdfcd2ad5e828,0
c5ecefce958d0b0e2d9878d6cdd9455ce17be998,1
6f6f8da2a0eeeaa2cb868a66c9cd446eaf35d16c,0
ecfe16aa1358b16357e1e06c5034174795a236c8,1
58e26b94378bc778b8859783fc56527b535a0a9b,1


In [None]:
# Get a list of train and val images
train_list = list(df_train['id'])
val_list = list(df_val['id'])

# Transfer the train images
for image in train_list:
    
    # the id in the csv file does not have the .tif extension therefore we add it here
    fname = image + '.tif'
    # get the label for a certain image
    target = df_data.loc[image,'label']
    
    # these must match the folder names
    if target == 0:
        label = 'a_no_tumor_tissue'
    if target == 1:
        label = 'b_has_tumor_tissue'
    
    # source path to image
    src = os.path.join('train', fname)
    # destination path to image
    dst = os.path.join(train_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)


# Transfer the val images
for image in val_list:
    # the id in the csv file does not have the .tif extension therefore we add it here
    fname = image + '.tif'
    # get the label for a certain image
    target = df_data.loc[image,'label']
    
    # these must match the folder names
    if target == 0:
        label = 'a_no_tumor_tissue'
    if target == 1:
        label = 'b_has_tumor_tissue'
    # source path to image
    src = os.path.join('/home/lee/KAGGLE--练习/histopathologic-cancer-detection/train', fname)
    # destination path to image
    dst = os.path.join(val_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)

In [None]:
print(len(os.listdir('base_dir/train_dir/a_no_tumor_tissue')))
print(len(os.listdir('base_dir/train_dir/b_has_tumor_tissue')))
print(len(os.listdir('base_dir/val_dir/a_no_tumor_tissue')))
print(len(os.listdir('base_dir/val_dir/b_has_tumor_tissue')))

In [24]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator 

num_train_samples = len(df_train)
num_val_samples = len(df_val)
train_batch_size = 128
val_batch_size = 128

IMAGE_SIZE = 100
IMAGE_CHANNELS = 3

train_path = 'data/base_dir/train_dir'
valid_path = 'data/base_dir/val_dir'
test_path = 'test'

def get_data_gen():
    

    train_steps = np.ceil(num_train_samples / train_batch_size)
    val_steps = np.ceil(num_val_samples / val_batch_size)

    datagen = ImageDataGenerator(rescale=1.0/255.)

    train_gen = datagen.flow_from_directory(train_path,
                                            target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                            batch_size=train_batch_size,
                                            class_mode='binary')

    val_gen = datagen.flow_from_directory(valid_path,
                                            target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                            batch_size=val_batch_size,
                                            class_mode='binary')

    test_gen = datagen.flow_from_directory(valid_path,
                                            target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                            batch_size=1,
                                            class_mode='binary',
                                            shuffle=False)
    
    return train_gen,val_gen,test_gen

In [25]:
from tensorflow import keras 
from tensorflow.keras import layers,Input
from tensorflow.keras.models import Model,Sequential,load_model
from tensorflow.keras.initializers import TruncatedNormal

In [None]:
#0.87
def get_cnn_model_1(): 


    model = models.Sequential()

    model.add(layers.Conv2D(32,(3,3),padding='same',activation='relu',input_shape=(100,100,3),kernel_initializer='TruncatedNormal'))
    model.add(layers.MaxPooling2D((3,3)))
    model.add(layers.Conv2D(64,(3,3),padding='same',activation='relu',kernel_initializer='TruncatedNormal'))
    model.add(layers.MaxPooling2D((3,3)))
    
    model.add(layers.Flatten())
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(512,activation='relu',kernel_initializer='TruncatedNormal'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1,activation='sigmoid'))

    return model  

In [None]:
#0.90
def get_cnn_model_2():
    ###  model  
    input_tensor = Input(shape=(100,100,3))
    x1 = layers.Conv2D(32,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(input_tensor)
    x1 = layers.Conv2D(64,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((2,2))(x1)
    x1 = layers.Conv2D(128,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((4,4))(x1)
    x1 = layers.Conv2D(526,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((4,4))(x1)

    x1 = layers.Flatten()(x1)
    x1 = layers.Dropout(0.5)(x1)
    x1 = layers.Dense(512,activation='sigmoid',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.Dropout(0.5)(x1)

    output_tensor = layers.Dense(1,activation='sigmoid')(x1)
    model = Model(input_tensor,output_tensor) 
    return model

In [None]:
#0.93
def get_cnn_model_4():
    

    input_tensor = Input(shape=(100,100,3))
    x1 = layers.Conv2D(32,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(input_tensor)
    x1 = layers.Conv2D(64,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((2,2))(x1)
    x1 = layers.Conv2D(128,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((2,2))(x1)
    x1 = layers.Conv2D(256,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((2,2))(x1)
    x1 = layers.Conv2D(512,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((2,2))(x1)
    x1 = layers.Conv2D(1024,(2,2),activation='relu',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.MaxPooling2D((2,2))(x1)
    
    x1 = layers.Flatten()(x1)
    x1 = layers.Dropout(0.5)(x1)
    x1 = layers.Dense(512,activation='sigmoid',kernel_initializer='TruncatedNormal')(x1)
    x1 = layers.Dropout(0.5)(x1)



    output_tensor = layers.Dense(1,activation='sigmoid')(x1)
    model = Model(input_tensor,output_tensor) 

    return model

In [None]:
def get_cnn_model_8_1():
  
    input_tensor = Input(shape=(100,100,3))

    x1 = layers.Conv2D(32,3,padding='same')(input_tensor)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation('relu')(x1)
    x1 = layers.Conv2D(32,3,padding='same')(x1)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation('relu')(x1)
    x1 = layers.Conv2D(32,3,padding='same')(x1)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation('relu')(x1)
    x1 = layers.MaxPool2D(2,strides=2)(x1) #output (50,50,32)



    x2 = layers.Conv2D(64,3,padding='same')(x1)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation('relu')(x2)
    x2 = layers.Conv2D(64,3,padding='same')(x2)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation('relu')(x2)
    x2 = layers.Conv2D(64,3,padding='same')(x2)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation('relu')(x2)
    x2 = layers.MaxPool2D(2,strides=2)(x2)

    residual_x1 = layers.Conv2D(64,1,strides=2,padding='same')(x1)
    x2 = layers.add([x2,residual_x1]) #output (25,25,64)



    x3 = layers.Conv2D(128,3,padding='same')(x2)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Activation('relu')(x3)
    x3 = layers.Conv2D(128,3,padding='same')(x2)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Activation('relu')(x3)
    x3 = layers.Conv2D(128,3,padding='same')(x2)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Activation('relu')(x3)

    residual_x2 = layers.Conv2D(128,1,padding='same')(x2)
    x3 = layers.add([x3,residual_x2]) #output (25,25,128)



    x4 = layers.Conv2D(128,3,padding='same')(x3)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Activation('relu')(x4)
    x4 = layers.Conv2D(128,3,padding='same')(x4)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Activation('relu')(x4)
    x4 = layers.Conv2D(128,3,padding='same')(x4)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Activation('relu')(x4)

    x4 = layers.add([x4,x3])     #output (25,25,128)


    x5 = layers.Conv2D(256,3,padding='same')(x4)
    x5 = layers.BatchNormalization()(x5)
    x5 = layers.Activation('relu')(x5)
    x5 = layers.Conv2D(256,3,padding='same')(x5)
    x5 = layers.BatchNormalization()(x5)
    x5 = layers.Activation('relu')(x5)
    x5 = layers.Conv2D(256,3,padding='same')(x5)
    x5 = layers.BatchNormalization()(x5)
    x5 = layers.Activation('relu')(x5)

    residual_x4 = layers.Conv2D(256,1,padding='same')(x4)
    x5 = layers.add([x5,residual_x4]) # output (25,25,128)

    x5 = layers.MaxPool2D(5,strides=5)(x5)
    x5 = layers.Flatten()(x5)
    x5 = layers.Dropout(0.5)(x5)

    x = layers.Dense(512)(x5)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.5)(x)

    output_tensor = layers.Dense(1,activation='sigmoid')(x)

    model = Model(input_tensor,output_tensor)
    model.summary()

In [26]:
# 95%
def get_cnn_model_8():
    
    input_tensor = Input(shape=(100,100,3))

    x1 = layers.Conv2D(32,3,padding='same')(input_tensor)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation('relu')(x1)
    x1 = layers.Conv2D(32,3,padding='same')(x1)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation('relu')(x1)
    x1 = layers.Conv2D(32,3,padding='same')(x1)
    x1 = layers.BatchNormalization()(x1)
    x1 = layers.Activation('relu')(x1)
    x1 = layers.MaxPool2D(2,strides=2)(x1) #output (50,50,32)



    x2 = layers.Conv2D(64,3,padding='same')(x1)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation('relu')(x2)
    x2 = layers.Conv2D(64,3,padding='same')(x2)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation('relu')(x2)
    x2 = layers.Conv2D(64,3,padding='same')(x2)
    x2 = layers.BatchNormalization()(x2)
    x2 = layers.Activation('relu')(x2)
    x2 = layers.MaxPool2D(2,strides=2)(x2)

    residual_x1 = layers.Conv2D(64,1,strides=2,padding='same')(x1)
    x2 = layers.add([x2,residual_x1]) #output (25,25,64)


    x2_s = layers.SeparableConv2D(64,3,padding='same')(x2)
    x2_s = layers.BatchNormalization()(x2_s)
    x2_s = layers.Activation('relu')(x2_s)
    x2_s = layers.SeparableConv2D(64,3,padding='same')(x2_s)
    x2_s= layers.BatchNormalization()(x2_s)
    x2_s= layers.Activation('relu')(x2_s)

    x2_s = layers.add([x2_s,x2]) 


    x3 = layers.Conv2D(128,3,padding='same')(x2_s)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Activation('relu')(x3)
    x3 = layers.Conv2D(128,3,padding='same')(x3)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Activation('relu')(x3)
    x3 = layers.Conv2D(128,3,padding='same')(x3)
    x3 = layers.BatchNormalization()(x3)
    x3 = layers.Activation('relu')(x3)

    residual_x2 = layers.Conv2D(128,1,padding='same')(x2_s)
    x3 = layers.add([x3,residual_x2]) #output (25,25,128)



    x4 = layers.Conv2D(128,3,padding='same')(x3)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Activation('relu')(x4)
    x4 = layers.Conv2D(128,3,padding='same')(x4)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Activation('relu')(x4)
    x4 = layers.Conv2D(128,3,padding='same')(x4)
    x4 = layers.BatchNormalization()(x4)
    x4 = layers.Activation('relu')(x4)

    x4 = layers.add([x4,x3])     #output (25,25,128)


    x4_s = layers.SeparableConv2D(128,3,padding='same')(x4)
    x4_s = layers.BatchNormalization()(x4_s)
    x4_s = layers.Activation('relu')(x4_s)
    x4_s = layers.SeparableConv2D(128,3,padding='same')(x4_s)
    x4_s= layers.BatchNormalization()(x4_s)
    x4_s= layers.Activation('relu')(x4_s)

    x4_s = layers.add([x4_s,x4]) 

    x5 = layers.Conv2D(256,3,padding='same')(x4_s)
    x5 = layers.BatchNormalization()(x5)
    x5 = layers.Activation('relu')(x5)
    x5 = layers.Conv2D(256,3,padding='same')(x5)
    x5 = layers.BatchNormalization()(x5)
    x5 = layers.Activation('relu')(x5)
    x5 = layers.Conv2D(256,3,padding='same')(x5)
    x5 = layers.BatchNormalization()(x5)
    x5 = layers.Activation('relu')(x5)

    residual_x4 = layers.Conv2D(256,1,padding='same')(x4_s)
    x5 = layers.add([x5,residual_x4]) 

    x5_1 = layers.Conv2D(256,3,padding='same')(x5)
    x5_1 = layers.BatchNormalization()(x5_1)
    x5_1 = layers.Activation('relu')(x5_1)
    x5_1 = layers.Conv2D(256,3,padding='same')(x5_1)
    x5_1 = layers.BatchNormalization()(x5_1)
    x5_1 = layers.Activation('relu')(x5_1)
    x5_1 = layers.Conv2D(256,3,padding='same')(x5_1)
    x5_1 = layers.BatchNormalization()(x5_1)
    x5_1 = layers.Activation('relu')(x5_1)
    x5_1 = layers.add([x5,x5_1])

    x5_s = layers.SeparableConv2D(256,3,padding='same')(x5_1)
    x5_s = layers.BatchNormalization()(x5_s)
    x5_s = layers.Activation('relu')(x5_s)
    x5_s = layers.SeparableConv2D(256,3,padding='same')(x5_s)
    x5_s= layers.BatchNormalization()(x5_s)
    x5_s= layers.Activation('relu')(x5_s)
    x5_s = layers.SeparableConv2D(256,3,padding='same')(x5_s)
    x5_s= layers.BatchNormalization()(x5_s)
    x5_s= layers.Activation('relu')(x5_s)
    x5_s = layers.add([x5_s,x5_1]) 


    x = layers.MaxPool2D(5,strides=5)(x5_s)
    x = layers.Flatten()(x)
    x = layers.Dropout(0.3)(x)

    x = layers.Dense(512)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.3)(x)

    output_tensor = layers.Dense(1,activation='sigmoid')(x)

    model = Model(input_tensor,output_tensor)
    return model

In [28]:
model = get_cnn_model_8()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100, 100, 3)  0                                            
__________________________________________________________________________________________________
conv2d_21 (Conv2D)              (None, 100, 100, 32) 896         input_2[0][0]                    
__________________________________________________________________________________________________
batch_normalization_26 (BatchNo (None, 100, 100, 32) 128         conv2d_21[0][0]                  
__________________________________________________________________________________________________
activation_26 (Activation)      (None, 100, 100, 32) 0           batch_normalization_26[0][0]     
__________________________________________________________________________________________________
conv2d_22 

In [None]:
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau,TensorBoard

In [None]:
train_gen,val_gen,test_gen = get_data_gen()

In [None]:
#complile model
model.compile(loss='binary_crossentropy',
             optimizer=optimizers.RMSprop(lr=1e-5),
             metrics=['accuracy'],
             )

checkpoint = ModelCheckpoint(filepath='weight_dir/weights.{epoch:02d}-{val_loss:.2f}.hdf5',
                             monitor='val_loss', 
                             save_weights_only=True,
                             verbose=1,
                             save_best_only=True, 
                             period=1)
earlystopping = EarlyStopping(monitor='acc',
                              patience=3)
reducelr = ReduceLROnPlateau(monitor='val_loss', 
                            factor=0.5,
                            patience=10)


# training model
history = model.fit_generator(generator=train_gen,
                             steps_per_epoch=train_steps,              
                             validation_data=val_gen,
                             validation_steps=val_steps,
                             callbacks=[checkpoint,earlystopping,reducelr], 
                             epochs=30,
                             )