# Import libraries

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import keras


from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from tensorflow.python.client import device_lib

In [2]:
# Set up CPU and GPU
def set_cpu_gpus(mixed_precision=True):
    """Detect the compute devices and build a distribution strategy.

    Prints the name and memory limit of every local device. If TensorFlow
    reports at least one GPU, XLA JIT compilation is switched on, memory
    growth is enabled on each GPU and, when requested, the global Keras dtype
    policy is set to ``mixed_float16``. Without a GPU the CPU devices are
    returned and none of that configuration is applied.

    Args:
        mixed_precision: When True (the default) and a GPU is present, set
            the global Keras policy to ``"mixed_float16"``. Has no effect on
            CPU-only machines.

    Returns:
        A ``(strategy, physical_devices)`` tuple: a
        ``tf.distribute.MirroredStrategy`` and the list of physical devices
        that was selected (GPUs if any, otherwise CPUs).

    Raises:
        ValueError: If no device is found or any step of the setup fails.
            The original exception is chained as ``__cause__``.
    """
    try:
        # Print the detected devices.
        local_devices = device_lib.list_local_devices()
        for dev in local_devices:
            print(dev.name, dev.memory_limit)

        # Ask TensorFlow for GPUs directly instead of inferring their
        # presence from the number of local devices.
        gpus = tf.config.list_physical_devices('GPU')
        physical_devices = gpus or tf.config.list_physical_devices('CPU')
        if not physical_devices:
            raise RuntimeError('TensorFlow reported neither GPU nor CPU devices')

        # For GPU devices, set growth memory constraint.
        if gpus:
            tf.config.optimizer.set_jit(True)
            if mixed_precision:
                keras.mixed_precision.set_global_policy("mixed_float16")
            # `device`, not `pd`: avoid shadowing the pandas alias.
            for device in gpus:
                tf.config.experimental.set_memory_growth(device, True)

        strategy = tf.distribute.MirroredStrategy()
        return (strategy, physical_devices)
    except Exception as err:
        # Keep the ValueError contract, but chain the real cause so that the
        # underlying failure is visible in the traceback.
        raise ValueError('No Device Detected!') from err

In [3]:
# Configure the devices once; `strategy` is reused when sizing the batches.
strategy, physical_devices = set_cpu_gpus()
(physical_devices, tf.__version__)

/device:CPU:0 268435456
/device:GPU:0 4176478208
INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce GTX 1660 Ti, compute capability 7.5
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


([PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')], '2.10.1')

In [4]:
class CFG:
    """Notebook-wide configuration namespace (read as ``CFG.data.<name>``)."""

    class data:
        """Dataset locations and input settings."""

        # Fold id used to split the metadata table into train / validation.
        fold = 0
        # NOTE(review): the data-loader cells use the module-level BATCH_SIZE
        # and INP_SIZE rather than these two values - confirm which is intended.
        batch_size = 8
        image_size = (512, 512)

        # Root directory of the project data. The paths below are derived
        # from it so that moving the data only requires changing this line.
        base_path = r"D:/Breast_Cancer_Detection"

        # CSV with one row per image, including the `fold` assignment.
        path_to_train = f"{base_path}/5_folds_data.csv"
        # Directory holding the image files named <patient_id>_<image_id>.png.
        path_to_train_images = f"{base_path}/train_images/dataset"

# Load data

In [6]:
# Load the fold-annotated metadata table from the path configured in CFG.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably an index
# that was written to the CSV - consider `index_col=0` once confirmed.
df = pd.read_csv(CFG.data.path_to_train)
print(f"train.shape = {df.shape}")

# Preview the first rows.
df.head()

train.shape = (54706, 15)


Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,fold
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,1
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,1
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,1
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False,1
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True,0


In [20]:
# Build the image path for every row: <images dir>/<patient_id>_<image_id>.png
df['img_path'] = (
    CFG.data.path_to_train_images
    + '/'
    + df['patient_id'].astype(str)
    + '_'
    + df['image_id'].astype(str)
    + '.png'
)

# Show two rows plus one full, untruncated path as a spot check.
display(df.head(2))
display(df.loc[0, 'img_path'])

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,fold,image_path,img_path
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,1,D:/Breast_Cancer_Detection/train_images/datase...,D:/Breast_Cancer_Detection/train_images/datase...
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,1,D:/Breast_Cancer_Detection/train_images/datase...,D:/Breast_Cancer_Detection/train_images/datase...


'D:/Breast_Cancer_Detection/train_images/dataset/10006_462822612.png'

In [21]:
# Rows of the configured fold form the validation set; all others are training.
in_valid_fold = df['fold'] == CFG.data.fold
train_df = df[~in_valid_fold].reset_index(drop=True)
valid_df = df[in_valid_fold].reset_index(drop=True)

(train_df.shape, valid_df.shape)

((43727, 17), (10979, 17))

In [22]:
# Sanity check: both splits should have (almost) the same cancer rate.
# The rates are computed from the frames themselves; the previous check used
# hard-coded counts and `round()`, which passes for any difference below 0.5.
train_pos_rate = train_df['cancer'].mean()
valid_pos_rate = valid_df['cancer'].mean()
assert abs(train_pos_rate - valid_pos_rate) < 1e-3, (
    f"cancer rate differs between splits: "
    f"train={train_pos_rate:.4f}, valid={valid_pos_rate:.4f}"
)

train_df.cancer.value_counts(), valid_df.cancer.value_counts()

(0    42802
 1      925
 Name: cancer, dtype: int64,
 0    10746
 1      233
 Name: cancer, dtype: int64)

# Data Loader

In [23]:
# Data-pipeline constants
AUTOTUNE = tf.data.AUTOTUNE  # let tf.data pick parallelism and prefetch depth
INP_SIZE = (512, 512)        # spatial size the decoder pins every image to
EPOCHS = 15

# Global batch size: 32 samples for each replica of the strategy.
BATCH_SIZE = 32 * strategy.num_replicas_in_sync

In [24]:
def image_decoder(with_labels):
    """Build the per-element decode function for the input pipeline.

    Args:
        with_labels: If True, return a function taking ``(path, label)`` and
            returning ``(image, label)``; otherwise return a function taking
            only ``path`` and returning the image.

    Returns:
        The decode callable matching ``with_labels``.
    """

    def decode(path):
        """Read one image file into a 3-channel tensor of shape (*INP_SIZE, 3)."""
        file_bytes = tf.io.read_file(path)
        # The image paths built in this notebook end in ".png", so detect the
        # format from the file content instead of assuming JPEG.
        # `expand_animations=False` keeps the result a single 3-D image.
        img = tf.io.decode_image(file_bytes, channels=3, expand_animations=False)
        # Pin the static shape. Unlike `tf.reshape`, `ensure_shape` raises on
        # an image of a different size instead of silently rearranging pixels
        # whenever the element count happens to match.
        img = tf.ensure_shape(img, [*INP_SIZE, 3])
        return img

    def decode_with_labels(path, label):
        """Decode the image at ``path`` and pass ``label`` through unchanged."""
        return decode(path), label

    return decode_with_labels if with_labels else decode

def create_dataset(
    df,
    batch_size  = 32,
    with_labels = False,
    shuffle     = False
):
    """Build a batched, prefetched ``tf.data`` pipeline from a metadata frame.

    Args:
        df: Table with an ``img_path`` column and, when ``with_labels`` is
            True, a ``cancer`` column.
        batch_size: Number of elements per batch.
        with_labels: If True the dataset yields ``(image, label)`` pairs,
            otherwise only images.
        shuffle: If True the records are reshuffled on every iteration and
            the final incomplete batch is dropped.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    # Image file decoder
    decode_fn = image_decoder(with_labels)

    # Create Dataset
    paths = df['img_path'].values
    if with_labels:
        dataset = tf.data.Dataset.from_tensor_slices(
            (paths, df['cancer'].values)
        )
    else:
        dataset = tf.data.Dataset.from_tensor_slices(paths)

    if shuffle:
        # Shuffle the lightweight path/label records *before* decoding. The
        # buffer then holds strings rather than decoded images, so it can
        # span the whole table and give a uniform shuffle. It also no longer
        # depends on the global BATCH_SIZE.
        dataset = dataset.shuffle(
            max(len(paths), 1), reshuffle_each_iteration=True
        )

    dataset = dataset.map(decode_fn, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=shuffle)
    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

In [25]:
# Training pipeline: labelled and shuffled (the last partial batch is dropped).
training_dataset = create_dataset(
    train_df, batch_size=BATCH_SIZE, with_labels=True, shuffle=True
)

# Validation pipeline: labelled, kept in file order, every sample retained.
valid_dataset = create_dataset(
    valid_df, batch_size=BATCH_SIZE, with_labels=True, shuffle=False
)

In [28]:
# Print the element spec (shapes and dtypes) of both pipelines.
for caption, pipeline in (
    ("Shape training data: ", training_dataset),
    ("Shape valid data: ", valid_dataset),
):
    print(caption, pipeline)

Shape training data:  <PrefetchDataset element_spec=(TensorSpec(shape=(32, 512, 512, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(32,), dtype=tf.int64, name=None))>
Shape valid data:  <PrefetchDataset element_spec=(TensorSpec(shape=(None, 512, 512, 3), dtype=tf.uint8, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


In [None]:
# Nghia: try pulling a few batches from the dataset to check that it yields the data we expect.