In [1]:
#Sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from itertools import product
from matplotlib import pyplot

#bibliotecas dos descritores
import numpy as np
import mahotas
import cv2
import os
import h5py
import warnings

import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [2]:
fixed_size = tuple((100, 100))
train_path = "../testes/cyst_IC/train"
h5_data = '../testes/output/data.h5'
h5_labels = '../testes/output/labels.h5'
bins = 8

## Extraindo os descritores das imagens

In [3]:
# Descritor 1: Hu Moments
def fd_hu_moments(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature

# Descritor 2: Haralick Texture
def fd_haralick(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    haralick = mahotas.features.haralick(gray).mean(axis=0)
    return haralick

# Descritor 3: Color Histogram
def fd_histogram(image, mask=None):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist  = cv2.calcHist([image], [0, 1, 2], None, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()

In [4]:
train_labels = os.listdir(train_path)

# classificação
train_labels.sort()
print(train_labels)

['K', 'R', 'S']


In [5]:
# vetores descritores
global_features = []
labels = []

In [6]:
for training_name in train_labels:
    dir = os.path.join(train_path, training_name)
    list = os.listdir(dir)
    number_files = len(list)

    current_label = training_name

    for x in range(1,number_files+1):
        file = dir + "/(" + str(x) + ").png"

        image = cv2.imread(file)
        image = cv2.resize(image, fixed_size)

        fv_hu_moments = fd_hu_moments(image)
        fv_haralick   = fd_haralick(image)
        fv_histogram  = fd_histogram(image)

        global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])

        labels.append(current_label)
        global_features.append(global_feature)

    print("Processando pasta: {}".format(current_label))

Processando pasta: K
Processando pasta: R
Processando pasta: S


In [7]:
# codificando descritores
targetNames = np.unique(labels)
le = LabelEncoder()
target = le.fit_transform(labels)

In [8]:
# normalizando a matriz de descritores
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(global_features)

In [9]:
# salvando os descritores no formato HDF5
h5f_data = h5py.File(h5_data, 'w')
h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))

h5f_label = h5py.File(h5_labels, 'w')
h5f_label.create_dataset('dataset_1', data=np.array(target))

h5f_data.close()
h5f_label.close()

In [10]:
# TREINANDO COM OS DESCRITORES
num_trees = 100
test_size = 0.50
seed = 9
test_path = "../testes/cyst_IC/test"
scoring = "accuracy"

train_labels = os.listdir(train_path)

train_labels.sort()

if not os.path.exists(test_path):
    os.makedirs(test_path)

In [11]:
results = []
names = []

# importando os descritores
h5f_data  = h5py.File(h5_data, 'r')
h5f_label = h5py.File(h5_labels, 'r')

global_features_string = h5f_data['dataset_1']
global_labels_string   = h5f_label['dataset_1']

global_features = np.array(global_features_string)
global_labels   = np.array(global_labels_string)

h5f_data.close()
h5f_label.close()

In [12]:
# separando os dados em teste e validação
(trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal) = train_test_split(np.array(global_features),
                                                                                          np.array(global_labels),
                                                                                          test_size=test_size,
                                                                                          random_state=seed)

## Classificando as imagens médicas

In [13]:
# Explorando o SVC
clf = SVC(random_state=seed)

# treinamento
clf.fit(trainDataGlobal, trainLabelsGlobal)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=9, shrinking=True, tol=0.001,
    verbose=False)

In [14]:
# verificando qual a taxa de acerto para o conjunto teste
clf.score(testDataGlobal, testLabelsGlobal)

0.8765432098765432

## Experimentando uma imagem teste

In [15]:
# selecionando uma imagem teste qualquer para verificar seus scores
n = 1440
scores = clf.decision_function(testDataGlobal[n:n+1])[0]
scores

array([ 2.22212669, -0.23506601,  1.09433067])

In [16]:
# verificando qual classe essa imagem pertence (K=0, R=1, S=2)
testLabelsGlobal[n]

2

In [17]:
# selecionando uma imagem de cada classe do nosso conjunto de treinamento
imgs_train = trainDataGlobal[2], trainDataGlobal[0], trainDataGlobal[1]
trainLabelsGlobal[2], trainLabelsGlobal[0], trainLabelsGlobal[1]

(0, 1, 2)

In [18]:
# função que calcula as distâncias
def calc_dist(img_test, imgs_train):
    return [np.sqrt(np.sum((img_test - imgs_train[0])**2)),
            np.sqrt(np.sum((img_test - imgs_train[1])**2)), 
            np.sqrt(np.sum((img_test - imgs_train[2])**2))]

In [19]:
# calculando a distância entre a imagem teste e as imagens de treinamento
dist = calc_dist(testDataGlobal[n], imgs_train)
dist

[0.8177133869572023, 1.4962028892998522, 0.3231796463478212]

In [20]:
# novos scores
np.absolute(scores/dist)

array([2.71748846, 0.15710838, 3.38613736])

## Aplicando ensemble no conjunto todo

In [21]:
# cálculo das imagens de cada conjunto com menor distância
dists_K = []
dists_R = []
dists_S = []
for i in range(len(trainDataGlobal)):
    dist = 0
    for j in range(len(trainDataGlobal)):
        if trainLabelsGlobal[i] == trainLabelsGlobal[j]:
            dist = dist + np.sqrt(np.sum((trainDataGlobal[i] - trainDataGlobal[j])**2))
    if trainLabelsGlobal[i] == 0:
        dists_K.append([i, dist])
    elif trainLabelsGlobal[i] == 1:
        dists_R.append([i, dist])
    elif trainLabelsGlobal[i] == 2:
        dists_S.append([i, dist])
        
K = [x[1] for x in dists_K]
n = K.index(min(K))
print("[n, distancia]", dists_K[n])

R = [x[1] for x in dists_R]
n = R.index(min(R))
print("[n, distancia]", dists_R[n])

S = [x[1] for x in dists_S]
n = S.index(min(S))
print("[n, distancia]", dists_S[n])

[n, distancia] [733, 566.3339320894613]
[n, distancia] [258, 479.1787664226994]
[n, distancia] [1360, 334.7079018406201]


In [22]:
# selecionando uma imagem de cada classe do nosso conjunto de treinamento
imgs_train = trainDataGlobal[733], trainDataGlobal[258], trainDataGlobal[1360]
trainLabelsGlobal[733], trainLabelsGlobal[258], trainLabelsGlobal[1360]

(0, 1, 2)

In [23]:
# função pra calcular os pesos
def weight(d):
    w = []
    for i in range(len(d)):
        s = 0
        for j in range(len(d)):
            if j != i:
                s = s + d[j]
        w.append((s/len(d))/d[i])
        
    return w

In [24]:
# aplicando a ideia de distância em todo o conjunto teste
allScores = clf.decision_function(testDataGlobal)

newScores = []
dists = []
for i in range(len(allScores)):
    dist = calc_dist(testDataGlobal[i], imgs_train)
    weights = weight(dist)
    dists.append(dist)
    newScores.append(allScores[i]*weights)

In [25]:
# obtendo as previsões para o conjunto teste
predict = []
for score in newScores: 
    score = score.tolist()
    predict.append(score.index(max(score)))

In [26]:
# calculando o desempenho dada a transformação
testOk = 0    
for i in range(len(testDataGlobal)):      
    if predict[i] == testLabelsGlobal[i]:
        testOk = testOk + 1
        
print(testOk/len(testDataGlobal))

0.8844855967078189


In [27]:
# Explorando o melhor algoritmo: RF
clf2 = RandomForestClassifier(n_estimators=num_trees, random_state=seed)

# treinamento
clf2.fit(trainDataGlobal, trainLabelsGlobal)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=9, verbose=0,
                       warm_start=False)

In [28]:
# verificando qual a taxa de acerto para o conjunto teste
clf2.score(testDataGlobal, testLabelsGlobal)

0.9876834473251029

In [29]:
# aplicando a ideia de distância em todo o conjunto teste
allScores_2 = clf2.predict_proba(testDataGlobal)

newScores_2 = []
dists_2 = []
for i in range(len(allScores_2)):
    weights = weight(dists[i])
    newScores_2.append(allScores_2[i]*weights)

In [30]:
# obtendo as previsoes para o conjunto teste
predict_2 = []
for score in newScores_2: 
    score = score.tolist()
    predict_2.append(score.index(max(score)))

In [31]:
# calculando o desempenho dada a transformação
testOk = 0    
for i in range(len(testDataGlobal)):      
    if predict_2[i] == testLabelsGlobal[i]:
        testOk = testOk + 1
        
print(testOk/len(testDataGlobal))

0.9880521262002744


## Usando uma rede neural como ensemble

In [32]:
# criando a entrada e a saida da rede neural
dataRN = []
for i in range(len(allScores)):      
    s = allScores[i].tolist() + dists[i]
    dataRN.append(np.array(s))
    
# separando os dados em teste e validação
(trainDataRN, testDataRN, trainLabelsRN, testLabelsRN) = train_test_split(np.array(dataRN),
                                                                          np.array(testLabelsGlobal),
                                                                          test_size=test_size,
                                                                          random_state=seed)

In [33]:
# Explorando a rede neural (+pesos calculados)
clfRN = MLPClassifier(hidden_layer_sizes=50)

# treinamento
clfRN.fit(trainDataRN, trainLabelsRN)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=50, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [34]:
# verificando qual a taxa de acerto para o conjunto teste
clfRN.score(testDataRN, testLabelsRN)

0.8820301783264746

In [35]:
# criando a entrada e a saida da rede neural
dataRN_2 = []
for i in range(len(allScores)):      
    s = allScores_2[i].tolist() + dists[i]
    dataRN_2.append(np.array(s))
    
# separando os dados em teste e validação
(trainDataRN_2, testDataRN_2, trainLabelsRN_2, testLabelsRN_2) = train_test_split(np.array(dataRN_2),
                                                                                  np.array(testLabelsGlobal),
                                                                                  test_size=test_size,
                                                                                  random_state=seed)

In [44]:
# Explorando a rede neural (+RF)
clfRN_2 = MLPClassifier(hidden_layer_sizes=50)

# treinamento
clfRN_2.fit(trainDataRN_2, trainLabelsRN_2)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=50, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [45]:
# verificando qual a taxa de acerto para o conjunto teste
clfRN_2.score(testDataRN_2, testLabelsRN_2)

0.9945130315500685

In [52]:
# Explorando a rede neural (+pesos calculados com 4 camadas)
clfRN_3 = MLPClassifier(hidden_layer_sizes=(50,50))

# treinamento
clfRN_3.fit(trainDataRN, trainLabelsRN)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 50), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [53]:
# verificando qual a taxa de acerto para o conjunto teste
clfRN_3.score(testDataRN, testLabelsRN)

0.8950292148225646

In [40]:
# Explorando a rede neural (+RF com 4 camada)
clfRN_4 = MLPClassifier(hidden_layer_sizes=(50,50))

# treinamento
clfRN_4.fit(trainDataRN_2, trainLabelsRN_2)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 50), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [41]:
# verificando qual a taxa de acerto para o conjunto teste
clfRN_4.score(trainDataRN_2, trainLabelsRN_2)

0.9958847736625515