## Organização do dataset

In [None]:
def dicom2png(input_file, output_file):
    """Convert a DICOM image to an 8-bit greyscale PNG (best effort).

    The file is read with ``pydicom.dcmread``; its pixel array is clipped at
    zero, scaled so that the brightest pixel maps to 255, cast to ``uint8``
    and written with ``png.Writer``.

    Conversion failures do not raise: a message naming the input file and the
    error is printed instead, so a batch conversion can keep going.

    Args:
        input_file: Path of the DICOM file to read.
        output_file: Path of the PNG file to write.

    Returns:
        None.
    """
    try:
        ds = pydicom.dcmread(input_file)

        # Convert to float to avoid overflow or underflow losses.
        image_2d = ds.pixel_array.astype(float)
        shape = image_2d.shape

        # Rescale the grey levels to 0-255. Negative values are clipped to 0.
        clipped = np.maximum(image_2d, 0)
        peak = clipped.max()
        if peak > 0:
            image_2d_scaled = (clipped / peak) * 255.0
        else:
            # No positive pixel: dividing by the maximum would produce
            # NaN/inf, so emit an all-black image instead.
            image_2d_scaled = np.zeros_like(clipped)

        # Convert to uint
        image_2d_scaled = np.uint8(image_2d_scaled)

        # Write the PNG file (png.Writer takes width first, then height).
        with open(output_file, 'wb') as png_file:
            w = png.Writer(shape[1], shape[0], greyscale=True)
            w.write(png_file, image_2d_scaled)
    except Exception as exc:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # still stops a long conversion loop, and report why the file failed.
        print('Could not convert: ', input_file, '-', repr(exc))

In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive, forcing a remount.
DRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import shutil
import glob
from sklearn.model_selection import train_test_split
import os

In [None]:
# Load the study-level and image-level CSV files from the mounted drive.
study_level, image_level = (
    pd.read_csv(csv_path)
    for csv_path in (
        "gdrive/MyDrive/covid-dataset/train_study_level.csv",
        "gdrive/MyDrive/covid-dataset/train_image_level.csv",
    )
)

# Study ids carry a '_study' suffix; 'study_name' is the id without it.
study_level['study_name'] = [
    study_id.replace('_study', '') for study_id in study_level['id']
]

In [None]:
# One row per image: the image id without its '_image' suffix, plus the id of
# the study the image belongs to.
df = pd.DataFrame({
    'image_name': image_level['id'].apply(lambda x: x.replace('_image', '')),
    'study_name': image_level['StudyInstanceUID'],
})

# Attach the study-level columns to every image of that study.
merge = df.merge(study_level, on='study_name')

# For each appearance column: the class name where the column equals 1,
# False everywhere else.
r0, r1, r2 = (
    merge[column].apply(lambda value, name=name: name if value == 1 else False)
    for column, name in (
        ('Typical Appearance', 'typical'),
        ('Atypical Appearance', 'atypical'),
        ('Indeterminate Appearance', 'indeterminate'),
    )
)

# The first class that is set wins (typical, then atypical, then
# indeterminate); rows with none of them set are labelled 'not recognized'.
labels = [
    next((flag for flag in row_flags if flag != False), 'not recognized')
    for row_flags in zip(r0, r1, r2)
]

merge['label'] = labels

In [None]:
# Copy the image archive from the mounted drive into the working directory.
shutil.copy(src='gdrive/MyDrive/covid-dataset/nn_train_600.zip', dst='./')

'./nn_train_600.zip'

In [None]:
!unzip -qq nn_train_600.zip

In [None]:
# Every PNG below ./nn_train_600, at any depth.
paths = glob.glob('./nn_train_600/**/*.png', recursive=True)

# 'image_name' is the file name with the directories and '.png' removed.
img_df = pd.DataFrame({
    'path': paths,
    'image_name': [p.split('/')[-1].replace('.png', '') for p in paths],
})

# Keep only images that have both a label row and a PNG on disk.
fndf = merge.merge(img_df, on='image_name')

In [None]:
# Features are the image paths, targets are the label strings.
X = fndf['path']
y = fndf['label']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Create one folder per class under both the train and the test directory.
for split_dir in ('train', 'test'):
    for class_dir in ('typical', 'indeterminate', 'atypical'):
        os.makedirs(split_dir + '/' + class_dir, exist_ok=True)

In [None]:
def distribute_images(_paths, _labels, _folder):
  """Copy each image into the class sub-folder of ``_folder`` named by its label.

  Only labels that already have a ``_folder/<label>`` directory are copied.
  Images with any other label are skipped and counted. Without this check
  ``shutil.copy`` would treat the missing directory as a file name and every
  such image would overwrite one stray file inside ``_folder``.

  Args:
    _paths: Iterable of source image paths.
    _labels: Iterable of label strings, aligned with ``_paths``.
    _folder: Destination root that contains one directory per class.

  Returns:
    None.
  """
  skipped = 0
  for path, label in zip(_paths, _labels):
    target_dir = os.path.join(_folder, label)
    if not os.path.isdir(target_dir):
      # No class folder for this label: leave the image out explicitly.
      skipped += 1
      continue
    shutil.copy(path, target_dir)

  if skipped:
    print('Skipped', skipped, 'image(s) without a class folder in', _folder)

# Copy the train and test splits into their respective directory trees.
for split_paths, split_labels, split_folder in (
    (X_train, y_train, 'train'),
    (X_test, y_test, 'test'),
):
    distribute_images(split_paths, split_labels, split_folder)

## Fine-tuning EfficientNet

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers

In [None]:
# Choose the tf.distribute strategy used by the cells below: a TPUStrategy
# when a TPU cluster can be resolved, otherwise a MirroredStrategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
    # Connect to the resolved cluster and initialise the TPU system before
    # building the strategy on top of it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # A ValueError raised by any of the steps above is treated as
    # "no TPU runtime" and triggers the fallback strategy.
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy()

Not connected to a TPU runtime. Using CPU/GPU strategy
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [None]:
# Batch size and square input resolution used by the generators and models.
batch_size = 64
height = width = 456
input_shape = (height, width, 3)

In [None]:
with strategy.scope():
  # Augmentation applied to training images only.
  augmentation = dict(
      rotation_range=10,
      width_shift_range=0.1,
      height_shift_range=0.1,
      shear_range=0.1,
      zoom_range=0.1,
      horizontal_flip=True,
  )
  train_datagen = ImageDataGenerator(rescale=1, **augmentation)

  # Validation images are read as they are, without augmentation.
  test_datagen = ImageDataGenerator(rescale=1)

  # Options shared by both iterators: images are resized to (height, width)
  # and labels are one-hot, as needed by the categorical_crossentropy loss.
  flow_options = dict(
      target_size=(height, width),
      batch_size=batch_size,
      class_mode='categorical',
  )

  train_generator = train_datagen.flow_from_directory("train", **flow_options)

  # shuffle=False keeps a fixed order; later cells compare the predictions
  # against validation_generator.classes.
  validation_generator = test_datagen.flow_from_directory(
      "test", shuffle=False, **flow_options)

Found 3382 images belonging to 3 classes.
Found 846 images belonging to 3 classes.


In [None]:
with strategy.scope():
  # EfficientNetB7 with randomly initialised weights (weights=None) and its
  # own 3-class classification head.
  # NOTE: the output recorded for this cell stops with a
  # ResourceExhaustedError during the first epoch.
  model = models.Sequential()
  model.add(layers.Input(shape=(height, width, 3)))
  model.add(EfficientNetB7(include_top=True, weights=None, classes=3))
  model.compile(
      optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
  )

  model.summary()

  # Model.fit accepts generators directly (fit_generator is deprecated).
  # Step counts come from the generators instead of hard-coded image counts,
  # and validation covers every batch, including the final partial one.
  hist = model.fit(
        train_generator,
        steps_per_epoch=train_generator.samples // batch_size,
        epochs=20,
        validation_data=validation_generator,
        validation_steps=len(validation_generator),
        verbose=1,)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetb7 (Functional)  (None, 3)                 64105370  
Total params: 64,105,370
Trainable params: 63,794,643
Non-trainable params: 310,727
_________________________________________________________________




Epoch 1/20


ResourceExhaustedError: ignored

In [None]:
def build_model(num_classes, unfreeze_last=20, learning_rate=1e-4,
                top_dropout_rate=0.2):
    """Build and compile an EfficientNetB5-based classifier.

    An ImageNet-pretrained EfficientNetB5 without its top is frozen, then a
    new head (global average pooling, batch normalisation, dropout, softmax
    dense layer) is stacked on its output. Finally the last ``unfreeze_last``
    layers of the assembled model are made trainable again, except for
    BatchNormalization layers, which are left as they are.

    The input size is taken from the module-level ``height`` and ``width``.

    Args:
        num_classes: Number of units of the softmax output layer.
        unfreeze_last: How many trailing layers of the assembled model to
            make trainable; 0 or less leaves the backbone fully frozen.
        learning_rate: Learning rate of the Adam optimizer.
        top_dropout_rate: Dropout rate applied before the output layer.

    Returns:
        A compiled ``tf.keras.Model`` named "EfficientNet", using the
        categorical cross-entropy loss and the accuracy metric.
    """
    inputs = layers.Input(shape=(height, width, 3))
    base = EfficientNetB5(include_top=False, input_tensor=inputs, weights="imagenet")

    # Freeze the pretrained weights
    base.trainable = False

    # Rebuild top
    x = layers.GlobalAveragePooling2D(name="avg_pool")(base.output)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(top_dropout_rate, name="top_dropout")(x)
    outputs = layers.Dense(num_classes, activation="softmax", name="pred")(x)

    model = tf.keras.Model(inputs, outputs, name="EfficientNet")

    # Unfreeze the trailing layers, skipping BatchNormalization layers.
    # The guard matters because layers[-0:] would select every layer.
    if unfreeze_last > 0:
        for layer in model.layers[-unfreeze_last:]:
            if not isinstance(layer, layers.BatchNormalization):
                layer.trainable = True

    # Compile
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
    )
    return model


with strategy.scope():
  model2 = build_model(3)

  model2.summary()

  # Save weights only, and only when val_accuracy reaches a new maximum.
  checkpoint_filepath = 'gdrive/MyDrive/covid-dataset'
  model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
      filepath=checkpoint_filepath,
      save_weights_only=True,
      monitor='val_accuracy',
      mode='max',
      save_best_only=True)

  # Model.fit accepts generators directly (fit_generator is deprecated).
  # Step counts come from the generators instead of hard-coded image counts,
  # and validation covers every batch, including the final partial one, so
  # the monitored val_accuracy is computed on the whole validation set.
  hist = model2.fit(
        train_generator,
        steps_per_epoch=train_generator.samples // batch_size,
        epochs=50,
        validation_data=validation_generator,
        validation_steps=len(validation_generator),
        verbose=1, callbacks=[model_checkpoint_callback])

Model: "EfficientNet"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 456, 456, 3) 0                                            
__________________________________________________________________________________________________
rescaling_1 (Rescaling)         (None, 456, 456, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
normalization_1 (Normalization) (None, 456, 456, 3)  7           rescaling_1[0][0]                
__________________________________________________________________________________________________
stem_conv_pad (ZeroPadding2D)   (None, 457, 457, 3)  0           normalization_1[0][0]            
_______________________________________________________________________________________



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50

In [None]:
def build_model(num_classes, unfreeze_last=20, learning_rate=1e-4,
                top_dropout_rate=0.2):
    """Build and compile an EfficientNetB5-based classifier.

    An ImageNet-pretrained EfficientNetB5 without its top is frozen, then a
    new head (global average pooling, batch normalisation, dropout, softmax
    dense layer) is stacked on its output. Finally the last ``unfreeze_last``
    layers of the assembled model are made trainable again, except for
    BatchNormalization layers, which are left as they are.

    The input size is taken from the module-level ``height`` and ``width``.

    Args:
        num_classes: Number of units of the softmax output layer.
        unfreeze_last: How many trailing layers of the assembled model to
            make trainable; 0 or less leaves the backbone fully frozen.
        learning_rate: Learning rate of the Adam optimizer.
        top_dropout_rate: Dropout rate applied before the output layer.

    Returns:
        A compiled ``tf.keras.Model`` named "EfficientNet", using the
        categorical cross-entropy loss and the accuracy metric.
    """
    inputs = layers.Input(shape=(height, width, 3))
    base = EfficientNetB5(include_top=False, input_tensor=inputs, weights="imagenet")

    # Freeze the pretrained weights
    base.trainable = False

    # Rebuild top
    x = layers.GlobalAveragePooling2D(name="avg_pool")(base.output)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(top_dropout_rate, name="top_dropout")(x)
    outputs = layers.Dense(num_classes, activation="softmax", name="pred")(x)

    model = tf.keras.Model(inputs, outputs, name="EfficientNet")

    # Unfreeze the trailing layers, skipping BatchNormalization layers.
    # The guard matters because layers[-0:] would select every layer.
    if unfreeze_last > 0:
        for layer in model.layers[-unfreeze_last:]:
            if not isinstance(layer, layers.BatchNormalization):
                layer.trainable = True

    # Compile
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
    )
    return model

# Rebuild the 3-class model and load the weights stored at the checkpoint path.
model2 = build_model(num_classes=3)
model2.load_weights(filepath='gdrive/MyDrive/covid-dataset')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f050d132e10>

In [None]:
# Model outputs for the validation generator, displayed as the cell result.
val_probabilities = model2.predict(validation_generator)
val_probabilities

array([[0.10458558, 0.0721257 , 0.8232887 ],
       [0.1769225 , 0.15461828, 0.6684592 ],
       [0.02098048, 0.0517534 , 0.9272662 ],
       ...,
       [0.21524633, 0.37649578, 0.40825784],
       [0.05470058, 0.18880783, 0.7564916 ],
       [0.03345237, 0.17072712, 0.79582053]], dtype=float32)

In [None]:
import numpy as np


# Distinct class indices of the validation set and how often each occurs.
class_ids, class_counts = np.unique(validation_generator.labels, return_counts=True)
(class_ids, class_counts)

(array([0, 1, 2], dtype=int32), array([ 93, 207, 546]))

In [None]:
# Loss and accuracy (the compiled metric) on the validation generator.
val_loss, val_accuracy = model2.evaluate(validation_generator)
[val_loss, val_accuracy]



[0.8401544094085693, 0.6666666865348816]

In [None]:
# Keep the model outputs for the validation set for the metrics cells.
y_pred = model2.predict(x=validation_generator)

In [None]:
# Ground-truth class indices, in the generator's order.
y_true = validation_generator.classes

# Collapse the per-class scores to the index of the highest one. The ndim
# check makes the cell safe to re-run: once y_pred is already a 1-D array of
# class indices, argmax over axis=1 would raise.
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis=1)

In [None]:
from sklearn.metrics import accuracy_score


# Fraction of validation images whose predicted class equals the true class.
accuracy_score(y_true=y_true, y_pred=y_pred)

0.6666666666666666

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Invert the generator's {class name: index} mapping into {index: class name}.
indices_class = {v: k for k, v in validation_generator.class_indices.items()}

# Take the report names from that mapping, ordered by class index, so they
# cannot drift from the indices the generator actually assigned.
target_names = [indices_class[index] for index in sorted(indices_class)]
target_names

['atypical', 'indeterminate', 'typical']

In [None]:
# Heading and matrix on separate lines (rows: true class, columns: predicted).
print('Confusion Matrix', confusion_matrix(y_true, y_pred), sep='\n')

Confusion Matrix
[[  7  10  76]
 [  4  47 156]
 [  2  34 510]]


In [None]:
# Short reminder of what each column of the report means.
print('Precision: What proportion of positive identifications was actually correct?')
print('When it predicts a <Class> is true, it is correct <Precision> of the time.', '\n')

print('Recall: What proportion of actual positives was identified correctly?')
print('Correctly identifies <Recall> of all true <Class>.', '\n')

# Typo fixed in the printed text: "harmonic meany" -> "harmonic mean".
print('F1-SCORE: Combines the precision and recall of a classifier into a\nsingle metric by taking their harmonic mean.')


# Per-class precision, recall and F1 for the validation predictions.
print('Classification Report')
print(classification_report(y_true, y_pred, target_names=target_names))

Precision: What proportion of positive identifications was actually correct?
When it predicts a <Class> is true, it is correct <Precision> of the time. 

Recall: What proportion of actual positives was identified correctly?
Correctly identifies <Recall> of all true <Class>. 

F1-SCORE: Combines the precision and recall of a classifier into a
single metric by taking their harmonic meany.
Classification Report
               precision    recall  f1-score   support

     atypical       0.54      0.08      0.13        93
indeterminate       0.52      0.23      0.32       207
      typical       0.69      0.93      0.79       546

     accuracy                           0.67       846
    macro avg       0.58      0.41      0.41       846
 weighted avg       0.63      0.67      0.60       846

