# Initial Libraries and Settings

In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil
import os
import concurrent.futures

In [None]:
# Folder that holds the master train.csv
path = './data/'
# Folder that receives the generated sample CSV files
output = './output/'
# Root of the full image dataset
base_path = 'F:/'
# Root under which the re-encoded sampled datasets are written
sampled_path = f'{base_path}sampled'

# Create Data Partition

In [None]:
def get_dataset(disease):
    """Load ``train.csv`` from ``path`` with missing ``disease`` labels set to 0.

    Only the ``disease`` column is filled; all other columns are returned
    exactly as read from the CSV.
    """
    raw = pd.read_csv(path + 'train.csv')
    # assign() on an existing column replaces it in place, keeping column order.
    return raw.assign(**{disease: raw[disease].fillna(0)})

def create_sample_dataset(df, disease, sample_size, random_state=42):
    """Draw a class-balanced, shuffled sample for ``disease``.

    Args:
        df: DataFrame whose ``disease`` column holds 0/1 labels.
        disease: Name of the label column.
        sample_size: Number of rows drawn from each class.
        random_state: Seed used for both draws and for the final shuffle.
            Defaults to 42.

    Returns:
        DataFrame with ``sample_size`` positive rows and ``sample_size``
        negative rows, in shuffled order.

    Raises:
        ValueError: Propagated from pandas when either class has fewer than
            ``sample_size`` rows.
    """
    df_positive = df[df[disease] == 1].sample(n=sample_size, random_state=random_state)
    df_negative = df[df[disease] == 0].sample(n=sample_size, random_state=random_state)
    df_sample = pd.concat([df_positive, df_negative])
    # Shuffle so the two classes are interleaved rather than stacked.
    return df_sample.sample(frac=1, random_state=random_state)

def save_dataset(df, disease, approach, sample_size, iteration):
    """Write one balanced sample to ``output`` as a two-column CSV.

    The file is named ``{disease}_{approach}_{sample_size}_{iteration}.csv``
    and contains only the ``Path`` and ``disease`` columns.

    Args:
        df: Full labelled DataFrame to sample from.
        disease: Name of the label column.
        approach: Label-policy tag used in the file name.
        sample_size: Number of rows drawn from each class.
        iteration: Integer index of this sample; used in the file name and
            to vary the sampling seed.
    """
    # A fixed seed would make every iteration's file contain the same rows,
    # so the seed is offset by the iteration number.
    df_sample = create_sample_dataset(
        df, disease, sample_size, random_state=42 + int(iteration)
    )
    # Only save relevant columns.
    df_sample = df_sample[['Path', disease]]
    # to_csv does not create missing directories.
    os.makedirs(output, exist_ok=True)
    df_sample.to_csv(f'{output}/{disease}_{approach}_{sample_size}_{iteration}.csv', index=False)

In [None]:
# Write every balanced sample file for one disease under both label policies.
disease = 'Atelectasis'
approaches = ['U-Zeros', 'U-Ones']
for approach in approaches:
    train_df = get_dataset(disease)
    # Uncertain labels (-1) become 0 under U-Zeros and 1 under U-Ones.
    train_df[disease] = train_df[disease].replace(
        to_replace=-1,
        value=0 if approach == 'U-Zeros' else 1,
    )
    # Per-class sample size: the number of positives, capped at 20000.
    sample_size = min(20000, train_df[disease].value_counts().loc[1])
    # Number of sample files to write for this approach.
    no_of_samples = int(train_df[disease].size // sample_size)
    for i in range(1, no_of_samples + 1):
        save_dataset(train_df, disease, approach, sample_size, i)

# Create New Sampled Dataset Folder

In [None]:
# Select which previously written sample file to materialise on disk.
disease = 'Atelectasis'
approach = 'U-Zeros'
sample_size = 33376
iteration = 4

# Folder that receives this sample's images and its train.csv.
path_for_sampled = f'{sampled_path}/{disease}_{approach}_{sample_size}_{iteration}'

df = pd.read_csv(f'{output}/{disease}_{approach}_{sample_size}_{iteration}.csv')
# Running count of processed rows; incremented by move_files.
rows_processed = 0
def move_files(row):
    """Re-encode one dataset image at JPEG quality 10 into the sampled folder.

    The image is read from ``base_path`` and written to the same relative
    location under ``path_for_sampled``; missing destination directories are
    created. Increments the module-level ``rows_processed`` counter and
    prints progress.

    Args:
        row: Record with a ``'Path'`` entry giving the image location
            relative to ``base_path``.
    """
    global rows_processed
    rel_path = row['Path']
    print("Path: ", rel_path)

    destination = f'{path_for_sampled}/{rel_path}'
    # dirname also copes with a relative path that contains no '/'.
    os.makedirs(os.path.dirname(destination), exist_ok=True)

    # Read the image, lower the JPEG quality, then store the smaller copy.
    image = tf.keras.preprocessing.image.load_img(f'{base_path}/{rel_path}')
    pixels = tf.keras.preprocessing.image.img_to_array(image)
    # encode_jpeg only accepts uint8 input, while img_to_array yields floats.
    encoded = tf.image.encode_jpeg(tf.cast(pixels, tf.uint8), quality=10)
    # The encoder output is already a JPEG byte string, so write it verbatim.
    tf.io.write_file(destination, encoded)

    rows_processed += 1
    print("Rows Processed: ", rows_processed)

# Process the sampled rows in order, echoing the running position before each one.
for index, (_, row) in enumerate(df.iterrows()):
    print("Index: ", index)
    move_files(row)
# Leave the counter equal to the number of rows visited.
index = len(df)

# Store the sample's CSV next to its images.
df.to_csv(f'{path_for_sampled}/train.csv', index=False)

# Create Data Generators

In [None]:
# Pick the sample file to train on; `disease` must already be defined.
approach, sample_size, iteration = 'U-Zeros', 33376, 4

dataset = pd.read_csv(f'{output}/{disease}_{approach}_{sample_size}_{iteration}.csv')

# Hold out 20% of the rows for validation, using a fixed seed.
train_df, val_df = train_test_split(dataset, random_state=42, test_size=0.2)

# Show the label counts of each split.
print(train_df[disease].value_counts(), val_df[disease].value_counts(), sep='\n')

In [None]:
def load_image(img_path, label):
    """Load the JPEG at ``base_path + img_path`` as a 224x224 3-channel float image.

    Pixel values are divided by 255. ``label`` is returned unchanged alongside
    the image.
    """
    raw_bytes = tf.io.read_file(base_path + img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [224, 224])
    scaled = tf.cast(resized, tf.float32) / 255.0
    return scaled, label

def dataset_from_df(df, disease, batch_size=64, shuffle=False):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        df: DataFrame with a ``Path`` column and a ``disease`` label column.
        disease: Name of the label column.
        batch_size: Number of examples per batch. Defaults to 64.
        shuffle: When true, shuffle the (path, label) pairs before the images
            are loaded. Defaults to False, which keeps DataFrame order.

    Returns:
        A ``tf.data.Dataset`` yielding batches produced by ``load_image``.
    """
    paths = df['Path'].values
    labels = df[disease].values
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    # Shuffle before map() so only path strings, not decoded images, are buffered.
    if shuffle and len(paths) > 0:
        ds = ds.shuffle(buffer_size=len(paths))
    ds = ds.map(load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.batch(batch_size)
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Run both splits through the same input pipeline: validation first, then training.
test_ds, train_ds = (
    dataset_from_df(val_df, disease),
    dataset_from_df(train_df, disease),
)

# Train DenseNet121

In [None]:
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

# ImageNet-pretrained DenseNet121 backbone without its classification top.
base_model = DenseNet121(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# New head: global average pooling -> 1024-unit ReLU layer -> one sigmoid output.
x = GlobalAveragePooling2D()(base_model.output)
x = Dense(1024, activation='relu')(x)
predictions = Dense(1, activation='sigmoid')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze every backbone layer so only the new head is trained.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ],
)
model.summary()

In [None]:
# Train for five epochs, evaluating on the held-out split after each one.
model.fit(x=train_ds, validation_data=test_ds, epochs=5)