In [1]:
import medmnist
from medmnist import INFO, Evaluator
import numpy as np
from matplotlib import pyplot as plt
import os 


In [2]:
data = np.load("breastmnist.npz", allow_pickle=True)
lst = data.files
for item in lst:
    print(item)
    print(data[item])


train_images
[[[139 139 138 ... 165 162 165]
  [ 84  92  94 ... 124 116 112]
  [ 84  82  87 ... 124 127 126]
  ...
  [ 63  70  83 ...  36  34  32]
  [ 59  70  73 ...  50  40  34]
  [ 62  65  61 ...  51  42  40]]

 [[ 82  85  83 ...  76  83  86]
  [ 80  82  68 ...  59  61  64]
  [ 56  59  60 ...  59  58  60]
  ...
  [ 24  23  19 ...  18  18  17]
  [ 23  21  16 ...  24  23  19]
  [ 22  24  18 ...  26  22  19]]

 [[157 155 151 ... 130 128 131]
  [160 162 156 ... 126 126 135]
  [129 119 122 ... 114 110 110]
  ...
  [ 67  66  63 ...  77  75  71]
  [ 74  70  72 ...  80  82  76]
  [ 85  81  73 ...  77  79  72]]

 ...

 [[159 160 167 ... 134 136 143]
  [144 147 144 ... 123 128 143]
  [120 129 140 ... 119 101  99]
  ...
  [ 54  55  53 ...  52  44  40]
  [ 50  53  51 ...  43  46  51]
  [ 57  52  51 ...  50  55  56]]

 [[ 55  57  61 ...  59  62  64]
  [ 63  72  78 ...  68  70  71]
  [ 51  50  64 ...  61  50  57]
  ...
  [ 12  12  13 ...  19  19  19]
  [ 13  12  14 ...  17  18  18]
  [ 12  12  15 

In [None]:
lst

In [3]:
x_train, y_train = data["train_images"], data["train_labels"]
x_val, y_val = data["val_images"], data["val_labels"]
x_test, y_test = data["test_images"], data["test_labels"]

x_train = x_train.reshape(-1, 28**2)
x_val = x_val.reshape(-1, 28**2)
x_test = x_test.reshape(-1, 28**2)

y_train = y_train.ravel()
y_val = y_val.ravel()
y_test = y_test.ravel()

x_train_val = np.concatenate((x_train, x_val))
y_train_val = np.concatenate((y_train, y_val))

In [None]:
print(f"Tamanho do conjunto de treino: {len(x_train)}")
print(f"Tamanho do conjunto de validação: {len(x_val)}")
print(f"Tamanho do conjunto de teste: {len(x_test)}")

In [None]:
# Plotando um exemplo de imagem do conjunto de treino
img_example = x_train[0].reshape(28,28)
plt.imshow(img_example, cmap="gray")
print(f"Dimensões de uma imagem do dataset: {img_example.shape}")

# Balanceamento do conjunto de treino

In [4]:
possible_classes = np.unique(y_train)
p_class = len(y_train[y_train == 1])
n_class = len(y_train[y_train == 0])

print(f"Possíveis {possible_classes}")
print(f"Classes positivas: {p_class} ({p_class/len(y_train) *100 } %)")
print(f"Classes negativas: {n_class} ({n_class/len(y_train) *100 } %)")
print("O dataset está DESBALANCEADO")

Possíveis [0 1]
Classes positivas: 399 (73.07692307692307 %)
Classes negativas: 147 (26.923076923076923 %)
O dataset está DESBALANCEADO


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


model = LogisticRegression(max_iter=5000, class_weight="balanced")
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
print(model.score(x_val, y_val))
print(model.score(x_test, y_test))



In [None]:
from sklearn.model_selection import GridSearchCV

solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
penalties = ["none", "l1", "l2", "elasticnet"]
c_values = np.logspace(-4, 4, 10)
lr_grid = dict(solver=solvers, penalty=penalties, C=c_values)
lr_model = LogisticRegression(max_iter=5000, class_weight="balanced")

lr_grid_search = GridSearchCV(estimator=lr_model, param_grid=lr_grid, n_jobs=os.cpu_count(), cv=5, verbose=3)
lr_grid_result = lr_grid_search.fit(x_train_val, y_train_val)

In [None]:
x_train * 4

In [None]:
x_train

In [None]:
lr_grid_result.best_params_


In [None]:
lr_best_model = LogisticRegression(penalty="l1", C=0.3593813663804626, solver='liblinear')
lr_best_model.fit(x_train_val, y_train_val)
lr_best_model.score(x_test,y_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
prediction = lr_best_model.predict(x_test)
cm = confusion_matrix(prediction, y_test)
display(cm)
report = classification_report(y_test, prediction)
print(report)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

## encontrando melhor valor de hiperparâmetros:

grid_params = { 'n_neighbors' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
for u in range(2, 71):
    knn = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 2, cv= u, n_jobs = -1)
    knn.fit(x_train, y_train)

    # melhor pontuação

    print(knn.best_score_)


In [52]:
#melhores parâmetros
knn = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 2, cv= 19, n_jobs = -1)
knn.fit(x_train, y_train)

# melhor pontuação

print(knn.best_score_)

knn.best_params_

Fitting 19 folds for each of 300 candidates, totalling 5700 fits
0.8116411718952555


{'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'distance'}

In [53]:
## treinando com os melhores hiperparâmetros

knn = KNeighborsClassifier(n_neighbors= 9, weights= 'distance', metric='minkowski')
knn.fit(x_train, y_train)
predic1= knn.predict(x_val)
print("pontuação nos dados de validação:", knn.score(x_val, y_val))

## treinando com dados de validação
knn.fit(x_train_val, y_train_val)
predic2= knn.predict(x_test)
print("pontuação nos dados de teste:", knn.score(x_test, y_test))


    


pontuação nos dados de validação: 0.8333333333333334
pontuação nos dados de teste: 0.8076923076923077


In [54]:
from sklearn.metrics import confusion_matrix, classification_report
prediction = knn.predict(x_test)
cm = confusion_matrix(prediction, y_test)
display(cm)
report = classification_report(y_test, prediction)
print(report)

array([[ 16,   4],
       [ 26, 110]], dtype=int64)

              precision    recall  f1-score   support

           0       0.80      0.38      0.52        42
           1       0.81      0.96      0.88       114

    accuracy                           0.81       156
   macro avg       0.80      0.67      0.70       156
weighted avg       0.81      0.81      0.78       156

