In [1]:
#General import 
import pandas as pd
import zipfile
import numpy as np
import tensorflow as tf
import keras
from tqdm import tqdm
import os

#image augmentation
from keras_preprocessing.image import img_to_array, array_to_img, load_img, ImageDataGenerator

#image loading
import matplotlib.pyplot as plt
from skimage.io import imread

In [2]:
# Image and data locations.
# NOTE: these are absolute paths on one specific machine; adjust them to your
# own layout before running the notebook.
ziploc = "E:/School/UU/PATREC/CXR_project/"
# Forward slashes only: a backslash in front of a letter (such as "\P") is an
# invalid escape sequence inside a normal string literal.
imgloc = "E:/School/UU/PATREC/.darwin/datasets/v7-labs/covid-19-chest-x-ray-dataset/images/"
store_loc = "E:/School/UU/PATREC/.darwin/datasets/v7-labs/augmented_loc/"

# Read the CSV of each split out of its zip archive. The `with` blocks make
# sure the archive and the member stream are closed once the frame is loaded.
with zipfile.ZipFile(ziploc + 'train_data.zip') as train_zf:
    with train_zf.open('train_data.csv') as train_csv:
        train_df = pd.read_csv(train_csv)

with zipfile.ZipFile(ziploc + 'val_data.zip') as val_zf:
    with val_zf.open('val_data.csv') as val_csv:
        val_df = pd.read_csv(val_csv)

# NOTE(review): the member read from test_data.zip is named 'val_data.csv',
# not 'test_data.csv' -- confirm that this member really holds the test split.
with zipfile.ZipFile(ziploc + 'test_data.zip') as test_zf:
    with test_zf.open('val_data.csv') as test_csv:
        test_df = pd.read_csv(test_csv)

# Directories holding the augmented images, derived from store_loc so the
# base path is written down only once.
train_dir = store_loc + "train"
val_dir = store_loc + "val"

In [3]:
# Adds a class-label column to a data frame.
def label(df):
    """Add a single-letter ``label`` column to ``df`` and return ``df``.

    The label is derived from the ``type`` and ``Covid`` columns:

    * ``"A"`` -- ``type == "No Pneumonia"``
    * ``"B"`` -- ``type == "Bacterial Pneumonia"``
    * ``"D"`` -- ``type == "Viral Pneumonia"``
    * ``"C"`` -- ``Covid == True`` (applied last, so it overrides the
      labels above for Covid rows)

    Rows matching none of these rules keep NaN as their label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``type`` column and a ``Covid`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the ``label`` column is written
        onto it in place (any existing ``label`` column is overwritten).
    """
    # Start from an object-dtype column of NaN. A float NaN column would have
    # to be upcast when the string labels are written, which pandas reports
    # as an incompatible-dtype assignment.
    df["label"] = np.full(len(df), np.nan, dtype=object)
    df.loc[df["type"] == "No Pneumonia", 'label'] = "A"
    df.loc[df["type"] == "Bacterial Pneumonia", 'label'] = "B"
#     df.loc[df["type"] == "Fungal Pneumonia", 'label'] = "C"
    df.loc[df["type"] == "Viral Pneumonia", 'label'] = "D"
    # Must stay after the "type" rules so that Covid rows end up as "C".
    df.loc[df["Covid"] == True, 'label'] = "C"
#     df.loc[df["type"] == "Undefined Pneumonia", 'label'] = "F"
    return df

In [4]:
# Attach a class label to every split. label() writes the column onto the
# frame it receives and hands that same frame back.
train, val, test = (label(split) for split in (train_df, val_df, test_df))

# Keep only the rows that actually received a label.
train2 = train.dropna(subset=['label'])
val2 = val.dropna(subset=['label'])
test2 = test.dropna(subset=['label'])
train2['label'].value_counts()

B    2252
A    1284
D    1159
C     416
Name: label, dtype: int64

In [5]:
# Balance the training split: keep every Covid row ("C") and down-sample
# each of the other classes to the same number of rows.
covid_train = train2.loc[train2['label'] == "C"]
n_covid_train = len(covid_train)
no_train, viral_train, bac_train = (
    train2.loc[train2['label'] == cls].sample(n=n_covid_train, random_state=1)
    for cls in ("A", "D", "B")
)

# Same procedure for the validation split.
covid_val = val2.loc[val2['label'] == "C"]
n_covid_val = len(covid_val)
no_val, viral_val, bac_val = (
    val2.loc[val2['label'] == cls].sample(n=n_covid_val, random_state=1)
    for cls in ("A", "D", "B")
)

# Stack the per-class pieces in the order Covid, none, viral, bacterial.
all_train = pd.concat([covid_train, no_train, viral_train, bac_train])
all_val = pd.concat([covid_val, no_val, viral_val, bac_val])

In [6]:
# Per-class row counts of the balanced splits.
train_label_counts = all_train["label"].value_counts()
val_label_counts = all_val["label"].value_counts()
print("train dist \n", train_label_counts)
print("\n")
print("val dist \n", val_label_counts)

train dist 
 C    416
A    416
D    416
B    416
Name: label, dtype: int64


val dist 
 C    52
A    52
D    52
B    52
Name: label, dtype: int64


In [7]:
# Show the balanced training frame; as the last expression of the cell it is
# rendered by the notebook.
all_train

Unnamed: 0,filename,type,ogfilename,lung1,lung2,view,Covid,width,height,json_filename,label
4721,00007084.png,Viral Pneumonia,tpmd200203f2-c.png,"{'path': [{'x': 590.0, 'y': 109.0}, {'x': 581,...","{'path': [{'x': 842, 'y': 114}, {'x': 842, 'y'...",View/PA,True,1399.0,1264.0,tpmd200203f2-c.json,C
4722,00006635.png,Viral Pneumonia,296_2020_4584_Fig2_HTML-c.png,"{'path': [{'x': 231.0, 'y': 21}, {'x': 226.0, ...","{'path': [{'x': 353.0, 'y': 15}, {'x': 353.0, ...",View/AP,True,605.0,609.0,296_2020_4584_Fig2_HTML-c.json,C
4723,00006952.jpg,Viral Pneumonia,extubation-4.jpg,"{'path': [{'x': 174, 'y': 11.0}, {'x': 158.0, ...","{'path': [{'x': 144.45, 'y': 47.06}, {'x': 139...",View/PA,True,443.0,370.0,extubation-4.json,C
4724,00006852.png,Viral Pneumonia,covid-19-caso-91-1-12.png,"{'path': [{'x': 446.0, 'y': 91}, {'x': 442.0, ...","{'path': [{'x': 648, 'y': 99}, {'x': 648, 'y':...",View/PA,True,1161.0,1185.0,covid-19-caso-91-1-12.json,C
4725,00006932.jpg,Viral Pneumonia,covid-19-rapidly-progressive-acute-respiratory...,"{'path': [{'x': 453, 'y': 133}, {'x': 450, 'y'...","{'path': [{'x': 596, 'y': 109}, {'x': 596, 'y'...",View/PA,True,994.0,913.0,covid-19-rapidly-progressive-acute-respiratory...,C
...,...,...,...,...,...,...,...,...,...,...,...
1481,00002015.jpeg,Bacterial Pneumonia,person996_bacteria_2924.jpeg,"{'path': [{'x': 517, 'y': 114.0}, {'x': 514, '...","{'path': [{'x': 791, 'y': 116.0}, {'x': 791, '...",,False,1294.0,1018.0,person996_bacteria_2924.json,B
439,00002313.jpeg,Bacterial Pneumonia,person673_bacteria_2566.jpeg,"{'path': [{'x': 532, 'y': 54.0}, {'x': 531, 'y...","{'path': [{'x': 351.0, 'y': 55}, {'x': 349.0, ...",,False,848.0,552.0,person673_bacteria_2566.json,B
1608,00003250.jpeg,Bacterial Pneumonia,person307_bacteria_1442.jpeg,"{'path': [{'x': 412, 'y': 54}, {'x': 409.0, 'y...","{'path': [{'x': 689, 'y': 37}, {'x': 688, 'y':...",,False,1142.0,689.0,person307_bacteria_1442.json,B
628,00002910.jpeg,Bacterial Pneumonia,person446_bacteria_1931.jpeg,"{'path': [{'x': 505.0, 'y': 92}, {'x': 502.0, ...","{'path': [{'x': 717, 'y': 82}, {'x': 717, 'y':...",,False,1176.0,856.0,person446_bacteria_1931.json,B


### Offline augmentation and storing the values

In [8]:
def augment_and_save_imgs(imgloc, storeloc, df, n_augmentations=6, target_size=256, seed=1):
    """Write randomly augmented copies of every image listed in ``df`` to disk.

    Each file named in ``df['ogfilename']`` is loaded from ``imgloc``, resized
    with padding to ``target_size`` x ``target_size`` and fed through an
    ``ImageDataGenerator`` (rotation, shifts, zoom, horizontal flip,
    brightness). ``n_augmentations`` augmented versions per source image are
    saved as PNG files in ``storeloc`` using the prefix
    ``"aug_" + <ogfilename>``.

    Parameters
    ----------
    imgloc : str
        Directory containing the source images.
    storeloc : str
        Directory the augmented images are written to; created if missing.
    df : pandas.DataFrame
        Frame with an ``ogfilename`` column naming the images to augment.
    n_augmentations : int, optional
        Number of augmented copies written per source image (default 6).
    target_size : int, optional
        Height and width the image is padded/resized to (default 256).
    seed : int, optional
        Seed handed to the generator for every image (default 1).

    Returns
    -------
    bool
        ``True`` once every image has been processed.
    """
    datagenerator = ImageDataGenerator(
        rotation_range=10,            # rotation
        width_shift_range=0.2,        # horizontal shift
        height_shift_range=0.2,       # vertical shift
        zoom_range=0.2,               # zoom
        horizontal_flip=True,         # horizontal flip
        brightness_range=[0.2, 1.2],  # brightness
    )
    # Make sure the output directory exists before the generator writes to it.
    os.makedirs(storeloc, exist_ok=True)

    for f in tqdm(df['ogfilename']):
        # os.path.join works whether or not imgloc ends with a separator.
        img = load_img(os.path.join(imgloc, f))
        x = img_to_array(img)
        x = x.reshape((1, ) + x.shape)  # add a leading batch axis
        x = tf.image.resize_with_pad(x, target_size, target_size, antialias=False)
        # NOTE(review): the same seed is used for every image, so each image
        # presumably receives the same sequence of random transforms --
        # confirm this is intended.
        augmented = datagenerator.flow(x, batch_size=1, save_to_dir=storeloc,
                                       seed=seed, shuffle=False,
                                       save_prefix="aug_" + f, save_format="png")
        # flow() keeps yielding batches for as long as it is asked; drawing a
        # batch is what writes one augmented file, so draw exactly as many
        # batches as copies are wanted.
        for _ in range(n_augmentations):
            next(augmented)
    return True
    

In [9]:
# Batched variant of augment_and_save_imgs, meant as a speed-up.
def augment_and_save_imgs2(imgloc, storeloc, df, n_passes=6):
    """Augment the images listed in ``df`` via ``flow_from_dataframe``.

    The images named in ``df['ogfilename']`` are read from ``imgloc`` and the
    augmented versions are written to ``storeloc`` as PNG files with the
    prefix ``"aug"``.

    NOTE(review): this variant was marked as not working by its author. The
    saved files only carry the fixed prefix "aug", so it looks like they
    cannot be traced back to their source image (and label) by file name the
    way make_aug_df does it -- verify before relying on this function.

    Parameters
    ----------
    imgloc : str
        Directory containing the source images.
    storeloc : str
        Directory the augmented images are written to; created if missing.
    df : pandas.DataFrame
        Frame with ``ogfilename`` and ``label`` columns.
    n_passes : int, optional
        Number of full passes over ``df``; each pass writes one augmented
        image per row. The default of 6 matches the six copies per image
        produced by augment_and_save_imgs.

    Returns
    -------
    bool
        ``True`` once all passes have been written.
    """
    datagenerator = ImageDataGenerator(
        rotation_range=10,            # rotation
        width_shift_range=0.2,        # horizontal shift
        height_shift_range=0.2,       # vertical shift
        zoom_range=0.2,               # zoom
        horizontal_flip=True,         # horizontal flip
        brightness_range=[0.2, 1.2],  # brightness
    )
    # Make sure the output directory exists before the generator writes to it.
    os.makedirs(storeloc, exist_ok=True)

    batches = datagenerator.flow_from_dataframe(
        dataframe = df, # what df to work with
        directory = imgloc, # file location
        x_col = "ogfilename", # which file to get
        y_col = "label", # so it knows the label of new augmented image
        class_mode = "categorical", #converts abcde into categorical
        batch_size = 1,
        seed = 1,
        shuffle = False,
        target_size = (224,224), # changed value for vgg16
        # NOTE(review): keep_aspect_ratio may not be honoured by every Keras
        # version -- confirm against the installed one.
        keep_aspect_ratio = True,
        validate_filenames= True,
        save_to_dir = storeloc,
        save_prefix = "aug",
        save_format = 'png')

    # The iterator cycles over the data indefinitely, so a plain `for` loop
    # over it never finishes. len(batches) batches make up one pass over the
    # frame; draw exactly n_passes passes and stop.
    for _ in range(len(batches) * n_passes):
        next(batches)
    return True

In [10]:
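# Uncomment to (re)generate the augmented training images. They are written
# to train_dir, which is where make_aug_df looks for them, and the result is
# stored under its own name so the `test` DataFrame is not overwritten.
#train_saved = augment_and_save_imgs(imgloc, train_dir, all_train)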

In [11]:
#all_train[all_train["ogfilename"] == "296_2020_4584_Fig2_HTML-c.png"]

In [12]:
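# Uncomment to (re)generate the augmented validation images. The result is
# stored under its own name so the `val` DataFrame is not overwritten.
#val_saved = augment_and_save_imgs(imgloc, val_dir, all_val)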

In [13]:
# Build a frame of the augmented file names in a directory and their labels.
def make_aug_df(df, directory):
    """List the files in ``directory`` and label them from ``df``.

    A file receives the label of a row of ``df`` when that row's
    ``ogfilename`` occurs literally inside the file's name. If several rows
    match the same file, the last matching row wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ogfilename`` and ``label`` columns for the source images.
    directory : str
        Directory whose entries are listed with ``os.listdir``.

    Returns
    -------
    pandas.DataFrame
        One row per directory entry with the columns ``ogfilename`` (the
        entry's name) and ``label`` (NaN when no row of ``df`` matched).
    """
    new_df = pd.DataFrame({"ogfilename": os.listdir(directory)})
    # Create the column up front, as object dtype, so it exists even when
    # nothing matches and string labels can be written without an upcast.
    new_df["label"] = np.full(len(new_df), np.nan, dtype=object)
    for og_name, og_label in zip(df["ogfilename"], df["label"]):
        # regex=False: file names contain characters such as "." or "(" that
        # must be matched literally rather than interpreted as a pattern.
        matches = new_df["ogfilename"].str.contains(og_name, regex=False)
        new_df.loc[matches, "label"] = og_label
    return new_df

In [14]:
# Label the augmented files on disk from the balanced source frames.
final_train = make_aug_df(df=all_train, directory=train_dir)
final_val = make_aug_df(df=all_val, directory=val_dir)

In [15]:
# Show the labelled augmented validation files (rendered as the cell output).
final_val

Unnamed: 0,ogfilename,label
0,aug_02b76460.jpg_0_1447.png,C
1,aug_02b76460.jpg_0_2543.png,C
2,aug_02b76460.jpg_0_2778.png,C
3,aug_02b76460.jpg_0_3462.png,C
4,aug_02b76460.jpg_0_7687.png,C
...,...,...
1243,aug_yxppt-2020-02-19_00-51-27_287214-day8.jpg_...,C
1244,aug_yxppt-2020-02-19_00-51-27_287214-day8.jpg_...,C
1245,aug_yxppt-2020-02-19_00-51-27_287214-day8.jpg_...,C
1246,aug_yxppt-2020-02-19_00-51-27_287214-day8.jpg_...,C


In [16]:
# Show the labelled augmented training files (rendered as the cell output).
final_train

Unnamed: 0,ogfilename,label
0,aug_000001-19.jpg_0_1447.png,B
1,aug_000001-19.jpg_0_2543.png,B
2,aug_000001-19.jpg_0_2778.png,B
3,aug_000001-19.jpg_0_3462.png,B
4,aug_000001-19.jpg_0_7687.png,B
...,...,...
9979,aug_yxppt-2020-02-19_00-51-27_287214-day10.jpg...,C
9980,aug_yxppt-2020-02-19_00-51-27_287214-day10.jpg...,C
9981,aug_yxppt-2020-02-19_00-51-27_287214-day10.jpg...,C
9982,aug_yxppt-2020-02-19_00-51-27_287214-day10.jpg...,C


In [17]:
# Spot check: all augmented files derived from one source image.
is_from_source = final_train["ogfilename"].str.contains("000001-19.jpg", case=False)
final_train.loc[is_from_source]

Unnamed: 0,ogfilename,label
0,aug_000001-19.jpg_0_1447.png,B
1,aug_000001-19.jpg_0_2543.png,B
2,aug_000001-19.jpg_0_2778.png,B
3,aug_000001-19.jpg_0_3462.png,B
4,aug_000001-19.jpg_0_7687.png,B
5,aug_000001-19.jpg_0_7844.png,B


In [18]:
# Spot check: the source row of one image in the balanced training frame.
is_source_row = all_train["ogfilename"].eq("yxppt-2020-02-19_00-51-27_287214-day10.jpg")
all_train.loc[is_source_row]

Unnamed: 0,filename,type,ogfilename,lung1,lung2,view,Covid,width,height,json_filename,label
5094,00007092.jpg,Viral Pneumonia,yxppt-2020-02-19_00-51-27_287214-day10.jpg,"{'path': [{'x': 214.0, 'y': 86}, {'x': 213.0, ...","{'path': [{'x': 141, 'y': 80}, {'x': 140, 'y':...",View/AP_Supine,True,333.0,325.0,yxppt-2020-02-19_00-51-27_287214-day10.json,C


### Checking if the operation was successful by grabbing n samples

In [19]:
# Label counts before ("og") and after ("aug") augmentation, per split.
for caption, frame in (
    ("og train distribution \n", all_train),
    ("og val distribution \n", all_val),
    ("aug train distribution \n", final_train),
    ("aug val| distribution \n", final_val),
):
    print(caption, frame["label"].value_counts())

og train distribution 
 C    416
A    416
D    416
B    416
Name: label, dtype: int64
og val distribution 
 C    52
A    52
D    52
B    52
Name: label, dtype: int64
aug train distribution 
 B    2496
D    2496
C    2496
A    2496
Name: label, dtype: int64
aug val| distribution 
 C    312
A    312
D    312
B    312
Name: label, dtype: int64


### Save the new train and val data as CSV

In [20]:
# Store new dataframes in csv, once validated

In [21]:
# Write each labelled file table as a single CSV inside its own zip archive.
tcompression_opts = {'method': 'zip', 'archive_name': 'final_train_data.csv'}
vcompression_opts = {'method': 'zip', 'archive_name': 'final_val_data.csv'}
final_train.to_csv(
    'E:/School/UU/PATREC/CXR_project/final_train_data.csv.zip',
    index=False,
    compression=tcompression_opts,
)
final_val.to_csv(
    'E:/School/UU/PATREC/CXR_project/final_val_data.csv.zip',
    index=False,
    compression=vcompression_opts,
)
