In [1]:
#Import packages
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import pandas as pd

In [2]:
#Root folder of the data; the train and test image folders sit directly beneath it
data_path='../Data/'
train_path, test_path = (data_path+subfolder for subfolder in ('train/', 'test/'))

In [57]:
#Read the study-results summary table into a dataframe
summary_csv_path=data_path+'Study Results/Summary.csv'
df=pd.read_csv(summary_csv_path)

In [64]:
#Separate the dataset into train and test set based on the images in each set

def _rows_for_image_folder(results, split_path):
    """Return the rows of `results` whose 'file' value names an image stored under `split_path`.

    `split_path` is expected to contain one sub-folder per category. Categories are
    visited in sorted order and, inside each category, rows are ordered by file name.
    This mirrors how flow_from_directory(shuffle=False) enumerates images (sorted
    classes, then sorted file names), which matters because later cells assign the
    generated file names to these rows purely by position.

    Parameters:
        results: dataframe with a 'file' column holding image file names.
        split_path: folder path ending with a separator (e.g. train_path).

    Returns:
        A new dataframe with a fresh 0..n-1 index; it is empty (same columns as
        `results`) when no category folder is found.
    """
    per_category=[]
    for category in sorted(os.listdir(split_path)):
        #Skip stray files (e.g. hidden OS files); only category folders hold images
        if not os.path.isdir(split_path+category):
            continue
        in_category=results[results['file'].isin(os.listdir(split_path+category))]
        per_category.append(in_category.sort_values('file', kind='stable'))
    if not per_category:
        return pd.DataFrame(columns=results.columns)
    #Concatenate once instead of growing a dataframe inside the loop
    return pd.concat(per_category).reset_index(drop=True)

train_df=_rows_for_image_folder(df, train_path)
test_df=_rows_for_image_folder(df, test_path)

In [11]:
#Count the entries inside each category sub-folder of the train and test folders
train_neg_len, train_neut_len, train_pos_len = (
    len(os.listdir(train_path+category_name))
    for category_name in ('Negative', 'Neutral', 'Positive')
)
test_neg_len, test_neut_len, test_pos_len = (
    len(os.listdir(test_path+category_name))
    for category_name in ('Negative', 'Neutral', 'Positive')
)

#Totals per split
train_len=sum((train_neg_len, train_neut_len, train_pos_len))
test_len=sum((test_neg_len, test_neut_len, test_pos_len))

It is important to know the order in which the subdirectories of the train and test folders are listed, because the generated images will later be moved into category folders based on that order.

In [67]:
#List the category sub-folders of each split, then report the order in which they were returned
train_category_order=os.listdir(train_path)
test_category_order=os.listdir(test_path)
print('Order of train picture category:', train_category_order)
print('Order of test picture category:', test_category_order)

Order of train picture category: ['Negative', 'Neutral', 'Positive']
Order of test picture category: ['Negative', 'Neutral', 'Positive']


In [12]:
#Report the total and per-category picture counts of each split
train_breakdown=("Train: # of negative pics:",train_neg_len,", neutral pics:",train_neut_len,", positive pics:",train_pos_len)
test_breakdown=("Test: # of negative pics:",test_neg_len,", neutral pics:",test_neut_len,", positive pics:",test_pos_len)

print("# of train pictures: ",train_len)
print(*train_breakdown)
print("# of test pictures: ",test_len)
print(*test_breakdown)

# of train pictures:  585
Train: # of negative pics: 416 , neutral pics: 72 , positive pics: 97
# of test pictures:  145
Test: # of negative pics: 104 , neutral pics: 17 , positive pics: 24


In [41]:
#Create temporary folders to store new images.
#exist_ok=True makes this safe to re-run, so the cell no longer has to be commented out after the first run.
os.makedirs(data_path+'Temp train', exist_ok=True)
os.makedirs(data_path+'Temp test', exist_ok=True)

In [43]:
#Create subdirectories that reflect the original train and test directories.
#The temporary folder locations are spelled out from data_path here because the
#temp_train_path/temp_test_path variables are only assigned in a later cell.
#os.makedirs(..., exist_ok=True) also creates the parent folder if needed and is safe to re-run.
for split_path, temp_root in ((train_path, data_path+'Temp train/'),
                              (test_path, data_path+'Temp test/')):
    for category in os.listdir(split_path):
        #Only mirror category folders; skip any stray files
        if os.path.isdir(split_path+category):
            os.makedirs(temp_root+category, exist_ok=True)

In [15]:
#Temporary output folders, one per split
temp_train_path, temp_test_path = (data_path+folder for folder in ('Temp train/', 'Temp test/'))

#Category sub-folders inside the temporary train folder
temp_train_neg, temp_train_neut, temp_train_pos = (
    temp_train_path+category_folder for category_folder in ('Negative/', 'Neutral/', 'Positive/')
)

#Category sub-folders inside the temporary test folder
temp_test_neg, temp_test_neut, temp_test_pos = (
    temp_test_path+category_folder for category_folder in ('Negative/', 'Neutral/', 'Positive/')
)

Because horizontal_flip and vertical_flip flip images only at random, rather than flipping every image, flipped copies are not saved through ImageDataGenerator; instead, flipping can be specified during the actual training of the model.

In [166]:
#Generator whose output images are written to disk by the cells below.
#Every option listed here is commented out, so as written the generator is built with its default settings.
#NOTE(review): presumably one option is enabled per run and save_prefix is renamed to match - confirm.
datagen = ImageDataGenerator(#rescale=1./255,
                             #      zoom_range=0.3,
                             #      shear_range=45,
                             #      fill_mode='reflect',   (trailing comma added: required if enabled together with later options)
                             #      brightness_range=(1.5,2.2),
                             #      width_shift_range=0.3,
                             #      height_shift_range=0.3
                             )

In [167]:
#Pass every train image through the generator once; because save_to_dir is set,
#each produced image is written into the temporary train folder as a jpg.
train_flow=datagen.flow_from_directory(directory=train_path,
                                       shuffle=False,
                                       target_size=(256,256),
                                       batch_size=1,
                                       save_prefix='', #Rename prefix according to augment method
                                       save_to_dir=temp_train_path,
                                       save_format='jpg')

#The iterator yields batches endlessly, so stop after exactly one pass.
#len(train_flow) is the number of batches the iterator itself found (one image per batch here),
#which avoids over-iterating if the directory listing count differs from it, and avoids
#rebinding the imported `image` module as a loop variable.
for batch_number in range(len(train_flow)):
    next(train_flow)

Found 585 images belonging to 3 classes.


In [168]:
#Pass every test image through the generator once; because save_to_dir is set,
#each produced image is written into the temporary test folder as a jpg.
test_flow=datagen.flow_from_directory(directory=test_path,
                                      shuffle=False,
                                      target_size=(256,256),
                                      batch_size=1,
                                      save_prefix='', #Rename prefix according to augment method
                                      save_to_dir=temp_test_path,
                                      save_format='jpg')

#The iterator yields batches endlessly, so stop after exactly one pass.
#len(test_flow) is the number of batches the iterator itself found (one image per batch here),
#which avoids over-iterating if the directory listing count differs from it, and avoids
#rebinding the imported `image` module as a loop variable.
for batch_number in range(len(test_flow)):
    next(test_flow)

Found 145 images belonging to 3 classes.


In [169]:
#Gather the names of everything directly inside the temporary folders that is not a sub-folder
#Train set images
temp_train_images=list(filter(lambda entry: not os.path.isdir(temp_train_path+entry),
                              os.listdir(temp_train_path)))

#Test set images
temp_test_images=list(filter(lambda entry: not os.path.isdir(temp_test_path+entry),
                             os.listdir(temp_test_path)))

#Sanity check
temp_train_images[:10]

['vert shift_0_3653867.jpg',
 'vert shift_100_875558.jpg',
 'vert shift_101_2985736.jpg',
 'vert shift_102_8754996.jpg',
 'vert shift_103_623368.jpg',
 'vert shift_104_6317447.jpg',
 'vert shift_105_6986049.jpg',
 'vert shift_106_4082185.jpg',
 'vert shift_107_4118948.jpg',
 'vert shift_108_9700.jpg']

Since os.listdir does not return the files in numeric order (above, the names come back as "..._0", "..._100", "..._101", ...), moving the images based on their current position in the list would not match the order in which the images were read from the category folders. Each subdirectory of the train and test folder corresponds to a contiguous range of picture numbers (e.g., pictures 0-415 belong to the Negative folder of the train set, which holds 416 images).

In order to move the images based on the picture number, the list that contains the image names has to be sorted by picture number. The picture number is stored in the filename between two underscores, "_", and can be extracted via the split() method.

In [170]:
def _picture_number(filename):
    """Return the picture number embedded in a saved image file name.

    The saved names seen in this notebook look like "<prefix>_<number>_<digits>.jpg"
    (e.g. "vert shift_0_3653867.jpg" or "_580_9312817.jpg"). The number is taken as the
    second-to-last "_"-separated field, so it is still found when the prefix itself
    contains an underscore.

    Raises:
        IndexError / ValueError: if the name does not follow that pattern.
    """
    return int(filename.rsplit('_', 2)[-2])


def _sort_by_picture_number(filenames):
    """Return a new list of `filenames` ordered by picture number (ties broken by name)."""
    return sorted(filenames, key=lambda name: (_picture_number(name), name))


#Order the saved file names by picture number instead of the os.listdir order
sorted_temp_train_images=_sort_by_picture_number(temp_train_images)
sorted_temp_test_images=_sort_by_picture_number(temp_test_images)

In [171]:
#Sanity check: show the first five file names of each sorted list
for ordered_names in (sorted_temp_train_images, sorted_temp_test_images):
    print(ordered_names[:5])

['vert shift_0_3653867.jpg', 'vert shift_1_1673921.jpg', 'vert shift_2_6740855.jpg', 'vert shift_3_3119711.jpg', 'vert shift_4_5624235.jpg']
['vert shift_0_3262519.jpg', 'vert shift_1_5744505.jpg', 'vert shift_2_283581.jpg', 'vert shift_3_5801477.jpg', 'vert shift_4_8485031.jpg']


In [172]:
def _move_into_categories(filenames, source_dir, neg_dir, neut_dir, pos_dir, neg_count, neut_count):
    """Move each file out of `source_dir` into a category folder chosen by its position in `filenames`.

    The first `neg_count` names go to `neg_dir`, the next `neut_count` names go to
    `neut_dir`, and every remaining name goes to `pos_dir`. `filenames` must therefore
    already be in picture-number order.

    Raises:
        OSError: propagated from os.rename (e.g. the file was already moved).
    """
    for position, filename in enumerate(filenames):
        if position < neg_count:
            target_dir=neg_dir
        elif position < neg_count+neut_count:
            target_dir=neut_dir
        else:
            target_dir=pos_dir
        #os.path.join avoids the doubled "/" that f'{folder}/{name}' produced,
        #since the folder variables already end with a separator
        os.rename(os.path.join(source_dir, filename), os.path.join(target_dir, filename))


#Moving train images
_move_into_categories(sorted_temp_train_images, temp_train_path,
                      temp_train_neg, temp_train_neut, temp_train_pos,
                      train_neg_len, train_neut_len)

#Moving test images
_move_into_categories(sorted_temp_test_images, temp_test_path,
                      temp_test_neg, temp_test_neut, temp_test_pos,
                      test_neg_len, test_neut_len)

In [173]:
#Build copies of the split dataframes whose 'file' column holds the newly saved file names
#(values are matched to rows by position)
new_train_df=train_df.assign(file=sorted_temp_train_images)
new_test_df=test_df.assign(file=sorted_temp_test_images)

In [74]:
#Save temporary train dataframe to a csv file (First time only)
#new_train_df.to_csv(data_path+'Study Results/Train df.csv', index=False)

#Save temporary test dataframe to a csv file (First time only)
#new_test_df.to_csv(data_path+'Study Results/Test df.csv', index=False)

In [174]:
def _load_saved_results(csv_path):
    """Return the dataframe stored at `csv_path`, or an empty dataframe if the file does not exist yet.

    Returning an empty dataframe lets the notebook run on a first pass, before any
    results have been saved, without manually toggling cells.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    return pd.DataFrame()


#Load previously saved train dataframe if already created
temp_train_df=_load_saved_results(data_path+'Study Results/Train df.csv')

#Load previously saved test dataframe if already created
temp_test_df=_load_saved_results(data_path+'Study Results/Test df.csv')

In [175]:
#Stack the new rows on top of the previously saved rows and renumber the index from 0.
#Note: the result is stored back into the same names, so re-running this cell appends the saved rows again.

#Train set
new_train_df=pd.concat([new_train_df, temp_train_df], ignore_index=True)

#Test set
new_test_df=pd.concat([new_test_df, temp_test_df], ignore_index=True)

In [176]:
#Sanity check: display the combined train dataframe (new rows first, previously saved rows after them)
new_train_df

Unnamed: 0,file,Valence,Arousal,Internal norms,External norms,image type,labeled valence category,calculated valence category
0,vert shift_0_3653867.jpg,2.553,74.995,12.289429,16.409175,Animal mistreatment,Negative,Negative
1,vert shift_1_1673921.jpg,45.698,56.948,75.412445,78.690491,Animal mistreatment,Negative,Negative
2,vert shift_2_6740855.jpg,36.197,55.567,40.215111,63.345528,Animal mistreatment,Negative,Negative
3,vert shift_3_3119711.jpg,43.439,48.905,51.038688,40.093340,Animal mistreatment,Negative,Negative
4,vert shift_4_5624235.jpg,9.945,80.940,11.803228,6.986861,Animal mistreatment,Negative,Negative
...,...,...,...,...,...,...,...,...
3505,_580_9312817.jpg,53.348,51.338,,,Spider,Negative,Positive
3506,_581_8926806.jpg,39.626,54.421,,,Spider,Negative,Negative
3507,_582_5250039.jpg,40.406,44.130,,,Spider,Negative,Negative
3508,_583_834428.jpg,55.637,44.821,,,Spider,Negative,Positive


In [177]:
#Sanity check: display the combined test dataframe (new rows first, previously saved rows after them)
new_test_df

Unnamed: 0,file,Valence,Arousal,Internal norms,External norms,image type,labeled valence category,calculated valence category
0,vert shift_0_3262519.jpg,27.544,67.897,22.594799,31.434631,Animal mistreatment,Negative,Negative
1,vert shift_1_5744505.jpg,12.554,68.251,16.115085,20.963218,Animal mistreatment,Negative,Negative
2,vert shift_2_283581.jpg,13.449,68.205,15.984217,23.872630,Animal mistreatment,Negative,Negative
3,vert shift_3_5801477.jpg,26.328,54.657,33.143322,36.414956,Animal mistreatment,Negative,Negative
4,vert shift_4_8485031.jpg,19.328,61.913,24.674352,25.339161,Animal mistreatment,Negative,Negative
...,...,...,...,...,...,...,...,...
865,_140_6187611.jpg,36.412,55.752,,,Spider,Negative,Negative
866,_141_9509437.jpg,44.716,63.822,,,Spider,Negative,Negative
867,_142_8736350.jpg,36.900,59.526,,,Spider,Negative,Negative
868,_143_4952263.jpg,32.774,61.638,,,Spider,Negative,Negative


In [178]:
#Write the combined train and test dataframes back to their csv files (index column omitted)
for combined_frame, csv_name in ((new_train_df, 'Study Results/Train df.csv'),
                                 (new_test_df, 'Study Results/Test df.csv')):
    combined_frame.to_csv(data_path+csv_name, index=False)