In [None]:
import os
import cv2
import glob
import ntpath
import shutil
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load data
src_path = "G:/LiverCancer/nucleus/image/"

# Samples live exactly two directory levels below src_path. A single glob
# pattern (instead of nested os.listdir calls) ignores stray files that sit
# next to the folders and would otherwise make os.listdir raise
# NotADirectoryError. Sorting makes the sample order -- and therefore every
# index-based step in the following cells -- reproducible between runs.
nucleus_samples = sorted(glob.glob(os.path.join(src_path, "*", "*", "*.jpg")))

# Fail here with a clear message instead of deep inside the clustering step.
if not nucleus_samples:
    raise FileNotFoundError("No .jpg samples found two levels below " + src_path)

# One bucket of [path, grade] records per grade. A path is assigned to a grade
# when the text "grade<N>" occurs anywhere in it.
samples_by_grade = {grade: [] for grade in range(5)}

for nucleus_sample in nucleus_samples:
    for grade, bucket in samples_by_grade.items():
        if "grade%d" % grade in nucleus_sample:
            bucket.append([nucleus_sample, grade])

# Keep the per-grade names that the following cells refer to.
samples_grade_0, samples_grade_1, samples_grade_2, samples_grade_3, samples_grade_4 = (
    samples_by_grade[grade] for grade in range(5)
)

# Load the model
# NOTE: get_net and Model are provided by the cell that imports Keras and
# defines the network; that cell has to be executed before this one.
weight_path = "G:/LiverCancer/model/model_liver_CNN_Cluster.hd5"
model = get_net(load_weight_path=weight_path)

# Sub-model that shares the network's input and stops at the
# 'final_features_64' layer, returning that layer's output.
feature_layer = model.get_layer('final_features_64')
feature_extractor = Model(inputs=model.input, outputs=feature_layer.output)

In [None]:
# Extract features
# `features` ends up holding one feature vector per grade-4 sample, in the
# same order as samples_grade_4, so positions can be matched up afterwards.
features = []

for record in samples_grade_4:

    sample_path = record[0]

    # cv2.imread reports a missing or unreadable file by returning None rather
    # than raising; stop with the offending path instead of failing later with
    # an unrelated TypeError. (Skipping the file is not an option: it would
    # shift the feature positions relative to samples_grade_4.)
    sample_image = cv2.imread(sample_path)
    if sample_image is None:
        raise IOError("Could not read image: " + sample_path)

    # Per-image standardisation. A constant image has a standard deviation of
    # 0; divide by 1 in that case so the result is all zeros instead of NaN.
    sample_std = np.std(sample_image) or 1.0
    sample_image = (sample_image - np.average(sample_image)) / sample_std

    # Add the leading batch axis: (1, height, width, 3).
    sample_image = sample_image.reshape(1, sample_image.shape[0], sample_image.shape[1], 3)

    # predict() returns one row per batch entry; keep the single row.
    features.append(feature_extractor.predict(sample_image)[0])

In [None]:
# Clustering
# Stack the per-sample vectors into the 2-D (n_samples, n_features) matrix that
# scikit-learn expects. The full vectors are used: slicing out a single column
# would leave 1-D data, which KMeans/PCA reject. A new name is used instead of
# rebinding `features`, so this cell can be re-run without re-extracting.
feature_matrix = np.asarray(features)
if feature_matrix.ndim != 2:
    raise ValueError("Expected one feature vector per sample, got an array of shape %s"
                     % (feature_matrix.shape,))

kmeans = KMeans(n_clusters=3, random_state=0).fit(feature_matrix)

# Reduce to two dimensions for visualisation, coloured by cluster label
pca = PCA(n_components=2)
pca_res = pca.fit_transform(feature_matrix)
plt.scatter(pca_res[:, 0], pca_res[:, 1], c=kmeans.labels_)
plt.show()

# Distance from every sample to the centre: the centre is the (unweighted)
# mean of the cluster centres, the distance is the sum of absolute differences.
centre_points = np.mean(kmeans.cluster_centers_, axis=0)

distances = np.sum(np.abs(feature_matrix - centre_points), axis=1)

# Sort by distance and keep the closest 90%
distances_sorted = distances.argsort()

count = int(len(feature_matrix) * 0.9)

selected = distances_sorted[0:count]
# Set membership is O(1); testing `index in <ndarray>` scans the whole array
# for every sample.
selected_indices = set(selected.tolist())

# Save the data: selected samples go to ".../1/", the remaining ones to ".../0/"
kept_dir = "G:/LiverCancer/nucleus/image_filtered/grade4/1/"
dropped_dir = "G:/LiverCancer/nucleus/image_filtered/grade4/0/"

# shutil.copy does not create missing directories
os.makedirs(kept_dir, exist_ok=True)
os.makedirs(dropped_dir, exist_ok=True)

# NOTE(review): only the base name is kept, so samples from different source
# folders that share a file name would overwrite each other -- confirm that
# file names are unique across folders.
for index, record in enumerate(samples_grade_4):

    sample_path = record[0]

    if index in selected_indices:
        shutil.copy(sample_path, kept_dir + ntpath.basename(sample_path))
    else:
        shutil.copy(sample_path, dropped_dir + ntpath.basename(sample_path))

In [None]:
from keras.models import Model
from keras.optimizers import SGD
from keras.layers.advanced_activations import ReLU
from keras.metrics import categorical_accuracy, categorical_crossentropy
from keras.layers import Input,AveragePooling2D,Convolution2D,BatchNormalization,MaxPooling2D,Concatenate,GlobalMaxPooling2D,Dense

def get_net(input_shape=(128, 128, 3), load_weight_path=None) -> Model:
    """Build, optionally load weights into, and compile the classification CNN.

    The network consists of four layer groups with 16, 32, 64 and 128 filters.
    Each group is two 3x3 convolutions (each followed by batch normalisation
    and ReLU) and a 2x2 max-pooling. After every group the result is
    concatenated, along the channel axis, with average-pooled copies of the
    inputs of this and all earlier groups. A global max-pooling, a batch
    normalisation ('final_features'), a 64-unit dense layer
    ('final_features_64') and a 5-way softmax ('out_class') follow.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input image, channels last.
    load_weight_path : str or None
        Weight file passed to ``model.load_weights(..., by_name=False)``.
        Nothing is loaded when it is None.

    Returns
    -------
    Model
        The compiled model (SGD with Nesterov momentum, categorical
        cross-entropy loss). Its summary is printed as a side effect.
    """
    inputs = Input(shape=input_shape, name="input")
    x = inputs

    # Average-pooled copies of each group's input. They are halved again in
    # every later group so that they keep the spatial size of the main branch.
    shortcuts = []

    # Layers are created in the same order as in a hand-unrolled definition
    # (shortcut poolings first, then the convolutions), so auto-generated layer
    # names and the layer order used by load_weights stay the same.
    for group, n_filters in enumerate((16, 32, 64, 128), start=1):
        shortcuts = [
            AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid')(shortcut)
            for shortcut in shortcuts
        ]
        shortcuts.append(AveragePooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid')(x))

        for suffix in ('a', 'b'):
            x = Convolution2D(n_filters, (3, 3), activation=None, padding='same',
                              strides=(1, 1), name='conv%d%s' % (group, suffix))(x)
            x = BatchNormalization()(x)
            x = ReLU()(x)

        x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2), padding='valid',
                         name='pool%d' % group)(x)
        x = Concatenate(axis=3)([x] + shortcuts)

    x = GlobalMaxPooling2D()(x)
    x = BatchNormalization(name="final_features")(x)

    x = Dense(64, activation='relu', name='final_features_64')(x)
    out_class = Dense(5, activation='softmax', name='out_class')(x)

    model = Model(inputs=inputs, outputs=out_class)

    if load_weight_path is not None:
        model.load_weights(load_weight_path, by_name=False)

    optimizer = SGD(lr=0.001, momentum=0.9, nesterov=True)
    loss = {"out_class": "categorical_crossentropy"}
    metrics = {"out_class": [categorical_accuracy, categorical_crossentropy]}

    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    model.summary(line_length=140)

    return model