In [1]:
import os
import pandas as pd
import torch
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import img_to_array
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

2023-01-29 15:16:42.500048: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Total number of images held out for validation; split() draws half of
# them from each class so that the validation set is balanced.
valid_size = 40

# A closer look at our data

In [3]:
# Load the label table: one row per image, with its file name (Image_path)
# and its Condition flag (1 = damaged, 0 = undamaged, as used further below).
label = pd.read_csv("data/raw_data/labels.csv")
label.head()

Unnamed: 0,Image_path,Condition
0,img_4513976.jpg,0
1,img_7764995.jpg,1
2,img_451308.jpg,0
3,img_7768372.jpg,1
4,img_7765274.jpg,1


In [4]:
# Inspect column dtypes and non-null counts to check for missing values.
label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1399 entries, 0 to 1398
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Image_path  1399 non-null   object
 1   Condition   1399 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 22.0+ KB


In [5]:
# Share of images labelled as damaged (Condition == 1), rounded to two decimals
damaged_total = sum(label["Condition"])
round(damaged_total / label.shape[0], 2)

0.93

* The data set is heavily imbalanced: about 93% of the images show damaged cars. To address this, we create a balanced validation set by selecting 20 damaged and 20 undamaged images. We then apply image augmentation to the undamaged training images so that the training set is balanced as well.

# Split Data

In [6]:
def split(data, valid_size, if_train, seed=123):
    """
    Split a label table into a training set and a class-balanced validation set.

    Because the data are imbalanced, the validation rows are drawn per class:
    half of ``valid_size`` from the damaged rows (``Condition == 1``) and half
    from the undamaged rows (``Condition == 0``). All remaining rows of each
    class form the training set.

    Parameters
    ----------
    data : pandas.DataFrame
        Label table with a ``Condition`` column (1 = damaged, 0 = undamaged).
    valid_size : int
        Total number of validation rows wanted. An odd value is rounded up
        per class, so the validation set then holds ``valid_size + 1`` rows.
    if_train : bool
        ``True`` to return the training rows, ``False`` to return the
        validation rows.
    seed : int, default 123
        Random state passed to ``train_test_split`` for reproducibility.
        Calling the function twice with the same seed yields complementary
        training and validation sets.

    Returns
    -------
    pandas.DataFrame
        Damaged rows followed by undamaged rows of the requested subset.

    Raises
    ------
    ValueError
        Propagated from ``train_test_split`` when a class does not have
        enough rows to provide the requested validation count.
    """
    # Pass an absolute per-class count instead of a float ratio: a ratio is
    # multiplied back by the class size and ceiled inside train_test_split,
    # so floating-point round-off could yield one row too many.
    per_class = int(-(-valid_size // 2))  # ceil(valid_size / 2)

    damaged = data[data.Condition == 1]
    undamaged = data[data.Condition == 0]

    train_damaged, valid_damaged = train_test_split(
        damaged, test_size=per_class, random_state=seed
    )
    train_undamaged, valid_undamaged = train_test_split(
        undamaged, test_size=per_class, random_state=seed
    )

    if if_train:
        return pd.concat([train_damaged, train_undamaged])
    return pd.concat([valid_damaged, valid_undamaged])

In [7]:
# Build the complementary training / validation tables (same default seed),
# then report how many damaged rows there are per undamaged row.
train = split(data=label, valid_size=valid_size, if_train=True)
validation = split(data=label, valid_size=valid_size, if_train=False)
n_valid_damaged = sum(validation["Condition"])
n_valid_undamaged = sum(validation["Condition"] == 0)
print(f'validation class ratio: {n_valid_damaged/n_valid_undamaged}')

validation class ratio: 1.0


In [8]:
def move_image(raw_data_folder, label, damaged_folder, non_damaged_folder):
    '''
    Copy the images listed in ``label`` into per-class folders.

    Each image is opened with PIL and re-saved under its original file name,
    in ``damaged_folder`` when its condition is 1 and in
    ``non_damaged_folder`` otherwise. The source files are left in place.

    Nothing is written when either target folder already contains files;
    the message 'Data has been processed' is printed instead.

    Parameters
    ----------
    raw_data_folder : str
        Folder holding the source images.
    label : pandas.DataFrame
        Table whose first column is the image file name and whose second
        column is the condition flag (1 = damaged).
    damaged_folder : str
        Destination for images with condition 1. Created if missing.
    non_damaged_folder : str
        Destination for all other images. Created if missing.
    '''
    # Create the targets up front so os.listdir cannot fail on a fresh checkout.
    os.makedirs(damaged_folder, exist_ok=True)
    os.makedirs(non_damaged_folder, exist_ok=True)

    if os.listdir(damaged_folder) or os.listdir(non_damaged_folder):
        print('Data has been processed')
        return

    # Columns are addressed by position, as in the original per-row lookups.
    for file_name, condition in zip(label.iloc[:, 0], label.iloc[:, 1]):
        target_folder = damaged_folder if condition == 1 else non_damaged_folder
        # The context manager closes the file handle that Image.open keeps open.
        with Image.open(os.path.join(raw_data_folder, file_name)) as image:
            image.save(os.path.join(target_folder, file_name))
        
        
# Source images and the per-class destinations for the validation set
raw_data_folder = 'data/raw_data/Images'
valid_root = 'data/processed_data/valid'
damaged_folder = f'{valid_root}/damaged'
non_damaged_folder = f'{valid_root}/non-damaged'
move_image(
    raw_data_folder,
    validation,
    damaged_folder,
    non_damaged_folder,
)

Data has been processed


# Generate new images 

In [9]:
# Image augmentation function
datagen = ImageDataGenerator(
    rotation_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

In [10]:
def repulicate(raw_data_folder, label, n_replicates, damaged_folder, non_damaged_folder):
    '''
    Populate the training folders, oversampling the undamaged class.

    Images with condition 1 are re-saved unchanged into ``damaged_folder``
    under their original file name. Every other image is passed through the
    module-level ``datagen`` and its augmented copies are written to
    ``non_damaged_folder`` as JPEGs; the original undamaged image itself is
    not copied.

    Nothing is written when either target folder already contains files;
    the message 'Data has been processed' is printed instead.

    Parameters
    ----------
    raw_data_folder : str
        Folder holding the source images.
    label : pandas.DataFrame
        Table whose first column is the image file name and whose second
        column is the condition flag (1 = damaged).
    n_replicates : int
        Augmentation factor. ``n_replicates + 1`` augmented copies are
        written per undamaged image (at least one).
    damaged_folder : str
        Destination for damaged images. Created if missing.
    non_damaged_folder : str
        Destination for the augmented undamaged images. Created if missing.
    '''
    # Create the targets up front so os.listdir cannot fail on a fresh checkout.
    os.makedirs(damaged_folder, exist_ok=True)
    os.makedirs(non_damaged_folder, exist_ok=True)

    if os.listdir(damaged_folder) or os.listdir(non_damaged_folder):
        print('Data has been processed')
        return

    # Columns are addressed by position, as in the original per-row lookups.
    for file_name, condition in zip(label.iloc[:, 0], label.iloc[:, 1]):
        # The context manager closes the file handle that Image.open keeps open.
        with Image.open(os.path.join(raw_data_folder, file_name)) as image:
            if condition == 1:
                image.save(os.path.join(damaged_folder, file_name))
                continue
            image_tensor = img_to_array(image)

        # datagen.flow expects a batch axis in front of the image dimensions.
        image_tensor = image_tensor.reshape((1,) + image_tensor.shape)
        stem = os.path.splitext(os.path.basename(file_name))[0]

        # Keras builds each saved file name from the prefix, the sample index
        # (always 0 for a one-image batch) and a small random number. With one
        # shared prefix, later writes could silently overwrite earlier ones,
        # so every write gets a prefix unique to this image and copy number.
        copy_index = 0
        while True:
            augmented = datagen.flow(
                image_tensor,
                batch_size=1,
                save_to_dir=non_damaged_folder,
                save_prefix=f'aug_{stem}_{copy_index}',
                save_format='jpg',
            )
            next(augmented)  # drawing one batch writes one augmented image
            copy_index += 1
            if copy_index > n_replicates:
                break


In [11]:
# Augmentation factor: how many damaged training images exist per undamaged one
n_train_damaged = sum(train["Condition"])
n_train_undamaged = sum(train["Condition"] == 0)
n_replicates = round(n_train_damaged / n_train_undamaged)

# Source images and the per-class destinations for the training set
raw_data_folder = 'data/raw_data/Images'
train_root = 'data/processed_data/train'
damaged_folder = f'{train_root}/damaged'
non_damaged_folder = f'{train_root}/non-damaged'
repulicate(
    raw_data_folder,
    train,
    n_replicates,
    damaged_folder,
    non_damaged_folder,
)

# Class balance measured from the files actually present on disk
class_ratio = len(os.listdir(damaged_folder)) / len(os.listdir(non_damaged_folder))
print(f'training data class ratio: {round(class_ratio,2)} ')


training data class ratio: 1.01 


* We now have balanced training and validation data sets.