# Resize Images

We need to resize all images to a smaller size (e.g., 128\*128 pixels) so that everyone can download a copy of the dataset to their local machine. 

In [None]:
# import packages

import os
import pandas as pd 
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

%matplotlib inline
%load_ext autoreload
%autoreload 2

The scripts below were run locally; if you want to run them yourself, please change the file paths!

In [None]:

# Each annotation file is space-separated with no header row. Its six columns
# are named below; only 'filename' and 'label' are kept, the four box
# coordinate columns are dropped.
_ANNOTATION_COLUMNS = ['filename', 'label', 'xmin', 'ymin', 'xmax', 'ymax']
_BBOX_COLUMNS = ['xmin', 'ymin', 'xmax', 'ymax']


def _load_split(txt_path):
    """Read one split's annotation file into a (filename, label) DataFrame."""
    frame = pd.read_csv(txt_path, sep=" ", header=None)
    frame.columns = _ANNOTATION_COLUMNS
    return frame.drop(columns=_BBOX_COLUMNS)


# Read train.txt, val.txt and test.txt
train_df = _load_split('./RawData/train_COVIDx_CT-2A.txt')
val_df = _load_split('./RawData/val_COVIDx_CT-2A.txt')
test_df = _load_split('./RawData/test_COVIDx_CT-2A.txt')


The original images range from 256\*256 pixels to 1024\*1024 pixels with only 1 channel; the largest image takes about 600 KB of disk space. We resize them all to 128\*128 pixels with only 1 channel, so each resized image takes about 10 KB of disk space. The total disk size is reduced from ~29 GB to ~2 GB for 194,922 images.

**NOTE: Since the given images are square, calling `Image.resize()` with 128\*128 does not change the aspect ratio and is sufficient for this situation. If the given images do not share a consistent aspect ratio, consider `Image.thumbnail()`.**

In [None]:
image_path = './RawData/2A_images/'
new_image_path = './Data/2A_images/'

new_size = (128,128)


def _resize_split(filenames, split_name):
    """Resize the listed images to ``new_size`` and save them for one split.

    Args:
        filenames: Iterable of image file names, relative to ``image_path``.
        split_name: Sub-folder of ``new_image_path`` to write into
            ('train', 'val' or 'test').
    """
    target_dir = os.path.join(new_image_path, split_name)
    # Create the destination up front so the first save() does not fail on a
    # fresh checkout where the output folders do not exist yet.
    os.makedirs(target_dir, exist_ok=True)
    for filename in filenames:
        # The context manager closes each source file explicitly instead of
        # leaving that to garbage collection across the whole dataset.
        with Image.open(os.path.join(image_path, filename)) as img:
            img.resize(new_size).save(os.path.join(target_dir, filename))


# Same processing order as before: train, then val, then test.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    _resize_split(split_df['filename'], split_name)

Sample image: In our original dataset, the sample size is very large. The training set contains 143,778 images, which poses a huge challenge for our computational power. We wanted to use the whole dataset at the beginning, and we tried to process and load the images into TensorFlow. However, we found that this triggers an error about exceeding 10% of free system memory. Therefore, we think sampling is a necessary step in data preprocessing for this project. 

In [None]:
new_image_path = '/Users/chenzhengyi/Desktop/CU-Applied ML/project/Data/2A_images_sample/'

# Class label -> name of the destination sub-folder.
_CLASS_DIRS = {0: 'Normal', 1: 'Pneumonia', 2: 'COVID-19'}

# (split folder, annotation frame, number of images drawn per class)
_SAMPLE_PLAN = (
    ('train', train_df, 3000),
    ('val', val_df, 600),
    ('test', test_df, 600),
)


def _copy_sample(split_df, split_name, label, n_images):
    """Randomly pick ``n_images`` of one class and save them to its folder.

    Args:
        split_df: Annotation frame with at least 'filename' and 'label'.
        split_name: Split sub-folder ('train', 'val' or 'test').
        label: Integer class label; must be a key of ``_CLASS_DIRS``.
        n_images: Number of rows to draw without replacement.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when the class has fewer
            than ``n_images`` rows.
    """
    class_dir = os.path.join(new_image_path, split_name, _CLASS_DIRS[label])
    os.makedirs(class_dir, exist_ok=True)

    sampled = split_df[split_df['label'] == label].sample(n_images)
    # The frames built in this notebook only carry 'filename' and 'label', so
    # row['filepath'] raised KeyError. Use the column when an earlier step has
    # added it; otherwise build the path from the image directory.
    # NOTE(review): the fallback reads from `image_path` (set in the resize
    # cell) -- confirm this is the directory the sample should come from.
    has_filepath = 'filepath' in sampled.columns
    for _, row in sampled.iterrows():
        filename = row['filename']
        source = row['filepath'] if has_filepath else os.path.join(image_path, filename)
        with Image.open(source) as img:
            img.save(os.path.join(class_dir, filename))


# Same order as the original loops: for each split, classes 0, 1, 2.
for split_name, split_df, n_images in _SAMPLE_PLAN:
    for label in _CLASS_DIRS:
        _copy_sample(split_df, split_name, label, n_images)