## Baixando o dataset do Kaggle

In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nymeria42","key":"<REDACTED>"}'}

In [None]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d tolgadincer/labeled-chest-xray-images
! mkdir labeled-chest-xray-images
! unzip -q labeled-chest-xray-images -d labeled-chest-xray-images

Downloading labeled-chest-xray-images.zip to /content
 99% 1.16G/1.17G [00:12<00:00, 132MB/s]
100% 1.17G/1.17G [00:12<00:00, 96.9MB/s]


In [None]:
import os
import random
import numpy as np
import io

# Root of the extracted dataset and its two predefined splits.
path = "/content/labeled-chest-xray-images/chest_xray"
path_train = f"{path}/train"
path_test = f"{path}/test"

## Pré-processamento dos dados de entrada

In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import tensorflow as tf

In [None]:
# Collect every training image path. Sorting makes the listing independent of
# the order in which the file system enumerates entries, so the seeded shuffle
# below yields the same order on every run. The loop names are deliberately
# not `files`, which would clobber the imported google.colab `files` module.
train_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path_train)
    for name in names
    if name.endswith(".jpeg")
)
if not train_files:
    # Fail here with a clear message instead of at the tuple unpacking below.
    raise FileNotFoundError(f"No .jpeg images found under {path_train!r}")

# The class label is the name of the directory that directly contains the image.
train_labels = np.array(
    [os.path.basename(os.path.dirname(file_path)) for file_path in train_files]
)

# Keep the encoded bytes in memory; decoding is deferred to the featurizer.
train_images = []
for file_path in train_files:
    with open(file_path, 'rb') as f:
        train_images.append(io.BytesIO(f.read()))

train_data = list(zip(train_images, train_labels))

# Fixed seed so that re-running the notebook reproduces the same sample order.
random.Random(42).shuffle(train_data)

images_train, labels_train = zip(*train_data)

In [None]:
# Collect every test image path. Sorting makes the listing independent of the
# order in which the file system enumerates entries, so the seeded shuffle
# below yields the same order on every run. The loop names are deliberately
# not `files`, which would clobber the imported google.colab `files` module.
test_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path_test)
    for name in names
    if name.endswith(".jpeg")
)
if not test_files:
    # Fail here with a clear message instead of at the tuple unpacking below.
    raise FileNotFoundError(f"No .jpeg images found under {path_test!r}")

# The class label is the name of the directory that directly contains the image.
test_labels = np.array(
    [os.path.basename(os.path.dirname(file_path)) for file_path in test_files]
)

# Keep the encoded bytes in memory; decoding is deferred to the featurizer.
test_images = []
for file_path in test_files:
    with open(file_path, 'rb') as f:
        test_images.append(io.BytesIO(f.read()))

test_data = list(zip(test_images, test_labels))

# Fixed seed so that re-running the notebook reproduces the same sample order.
random.Random(42).shuffle(test_data)

images_test, labels_test = zip(*test_data)

## Treinamento do modelo

In [None]:
class KerasFeaturizer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that maps encoded images to model outputs.

    Every input item is decoded with ``load_img``, resized to ``target_size``,
    scaled to the [0, 1] range and passed through ``model``. The model output
    of each image is flattened into one feature row.

    Parameters
    ----------
    model : object
        Object exposing a Keras-style ``predict(batch, batch_size=..., verbose=...)``
        method.
    target_size : tuple of int, default=(64, 64)
        ``(height, width)`` each image is resized to while loading. It has to
        agree with the input shape ``model`` was built for.
    batch_size : int, default=32
        Number of images sent through ``model`` per forward pass.
    """

    def __init__(self, model, target_size=(64, 64), batch_size=32):
        self.model = model
        self.target_size = target_size
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def _load_image(self, source):
        """Decode one image (path or file-like object) into a scaled array."""
        img = img_to_array(load_img(source, target_size=self.target_size))
        # NOTE(review): if `model` carries pretrained weights it may expect the
        # preprocessing function that belongs to those weights rather than
        # plain 1/255 scaling; the scaling is kept to preserve existing
        # behaviour. Confirm this is intended.
        return img / 255.0  # Normalize pixel values between 0 and 1

    def _predict_single_image(self, img):
        """Return the flattened model output for a single decoded image array."""
        img = np.expand_dims(img, axis=0)
        return self.model.predict(img, verbose=0).flatten()

    def transform(self, X):
        """Return one flattened feature row per item of ``X``.

        Parameters
        ----------
        X : iterable
            Items accepted by ``load_img`` (the notebook passes ``io.BytesIO``
            objects holding encoded image bytes).

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_images, n_features)``; an empty 1-D array when
            ``X`` contains no items.
        """
        images = [self._load_image(item) for item in X]
        if not images:
            # Same value the per-image loop produced for an empty input.
            return np.array([])

        # One batched predict call instead of one call per image: the model
        # sees the same pixels, but the per-call overhead is paid once.
        batch = np.stack(images)
        outputs = self.model.predict(batch, batch_size=self.batch_size, verbose=0)
        return np.asarray(outputs).reshape(len(images), -1)

# ImageNet-pretrained ResNet50 without its classification head; it is only used
# for inference, as a feature extractor for 64x64 RGB inputs.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(64, 64, 3))

keras_featurizer = KerasFeaturizer(base_model)

# Elastic-net logistic regression on top of the extracted features. The 'saga'
# solver shuffles the data internally, so random_state is pinned to make the
# fitted coefficients reproducible across runs.
logistic_regression = LogisticRegression(
    max_iter=10000,
    C=1/0.03,
    l1_ratio=0.5,
    penalty='elasticnet',
    solver='saga',
    random_state=42,
)

label_encoder = LabelEncoder()

pipeline = Pipeline([
    ('featurizer', keras_featurizer),
    ('logistic_regression', logistic_regression)
])

# The dataset ships with predefined train/test folders, so no further split is made.
X_train, X_test, y_train, y_test = images_train, images_test, labels_train, labels_test
# Folder-name labels are mapped to integer classes; the fitted encoder is
# reused at evaluation time.
y_train_encoded = label_encoder.fit_transform(y_train)
pipeline.fit(X_train, y_train_encoded)


## Avaliação

In [None]:
# Run the expensive featurizer over the test set only once. The accuracy that
# pipeline.score would report is the fraction of matching predictions, so it is
# derived from the predictions instead of featurizing the test set a second time.
y_test_encoded = label_encoder.transform(y_test)
predictions = pipeline.predict(X_test)
score = float(np.mean(predictions == y_test_encoded))

In [None]:
# Mean accuracy on the test set: the fraction of test images classified correctly.
score

0.8205128205128205

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes and columns the predicted classes, both ordered by
# the integer codes assigned by label_encoder.
true_classes = label_encoder.transform(y_test)
confusion_matrix(y_true=true_classes, y_pred=predictions)

array([[124, 110],
       [  8, 382]])