In [56]:
import pandas as pd
import subprocess
import os
pd.set_option("max_colwidth", None)
pd.set_option('display.max_columns', None)

In [57]:
cwd = os.getcwd()

In [58]:
folder_version = 'transformed_v1/transformed_v1'

In [59]:
col_transforms = ['','_horizontal','_horizontal_rotation_down', '_horizontal_rotation_up','_rotation_down','_rotation_up']

In [60]:
col_categories = ['radiographs','gaze_map_quantized','mask','teeth_mask','maxillomandibular']

In [61]:
# # make sure all files have similar file format (all upper case)
# for col_transform in col_transforms:
#     for col_category in col_categories:
#         subprocess.run(['mkdir',f'/Users/maxhoff/Downloads/transformed_v2/{col_category+col_transform}'])
#         for file in os.listdir(f'/Users/maxhoff/Downloads/transformed_v1/{col_category+col_transform}/'):
#             subprocess.run(["cp", 
#                             f"/Users/maxhoff/Downloads/transformed_v1/{col_category+col_transform}/{file}", 
#                             f"/Users/maxhoff/Downloads/transformed_v2/{col_category+col_transform}/{file.lower()}"])


In [62]:
# read in image labels
df = pd.read_csv("/Users/maxhoff/Documents/MIDS/w210/image_labels.csv")

In [63]:
# Find which images are in all folders/missing
agg_file_list = pd.DataFrame(os.listdir(f'/Users/maxhoff/Downloads/transformed_v2/radiographs/'),
                             columns=['image_ID']) # create initial image ID list
agg_file_list['image_ID'] = agg_file_list['image_ID'].str.lower() # capitalize all image IDs - for standarization purposes
for col_category in col_categories:
    for col_transform in col_transforms:
        # create a list of image_ID's for each folder
        category_list = pd.DataFrame(os.listdir(f'/Users/maxhoff/Downloads/transformed_v2/{col_category+col_transform}/'),
                                     columns=['image_ID'])
        # create indicator column for each folder
        category_list[col_category+col_transform] = 1
        # standardize image IDs to have only lower cases - currently not standard
        category_list['image_ID'] = category_list['image_ID'].str.lower()
        agg_file_list = agg_file_list.merge(category_list, how='outer', on='image_ID')

In [64]:
# count number of times an image ID appears across all folders
agg_file_list['folder_count'] = 0
for row in range(0, len(agg_file_list)-1):
    agg_file_list.iloc[row,len(agg_file_list.columns)-1] = agg_file_list.iloc[row,1:].sum()

In [65]:
images_within_all_folders = list(agg_file_list.loc[agg_file_list['folder_count'] == 30,'image_ID'])

In [66]:
df['image_ID'] = df['image_ID'].str.lower()

In [67]:
df = df.loc[df['image_ID'].isin(images_within_all_folders),:]

In [68]:
# read in exclusion list and remove
exclude_images = pd.read_csv(f'{cwd}/exclude_images.csv', header=None)
consider_exclude_images = pd.read_csv(f'{cwd}/consider_exclude_images.csv', header=None)
exclude_imgs = []
for img in list(exclude_images[0]):
    exclude_imgs.append(str(img)+'.JPG')
df = df[~df['image_ID'].isin(exclude_imgs)]
consider_exclude_imgs = []
for img in list(consider_exclude_images[0]):
    consider_exclude_imgs.append(str(img)+'.JPG')
df = df[~df['image_ID'].isin(consider_exclude_imgs)]

In [69]:
# keep only cavity cases - since those are more clear cut
df = df.loc[df['label'].isin(['periapical','none']),:]

In [70]:
# one hot encode labels
df.loc[df['label']!='none','label'] = 1
df.loc[df['label']=='none','label'] = 0

In [71]:
agg_df = pd.DataFrame()
for col_transform in col_transforms:
    # create df for each type of column - to concat at end
    globals()[f"df_{col_transform}"] = df.copy()
    for image_id in globals()[f"df_{col_transform}"]['image_ID']:
        for col_category in col_categories:
            # add image link for each category
            globals()[f"df_{col_transform}"].loc[globals()[f"df_{col_transform}"]['image_ID'] == image_id,f'{col_category}_image_link'] = f'https://w210-32iq.s3.amazonaws.com/{col_category+col_transform}/{image_id}' #f'https://storage.cloud.google.com/w210-32iq/{col_category+col_transform}/{image_id}'
    # create a new unique image id - since image id will be duplicated across categories
    globals()[f"df_{col_transform}"]['image_ID_new'] = globals()[f"df_{col_transform}"]['image_ID'].str.upper() + col_transform
    # create category field
    globals()[f"df_{col_transform}"]['category'] = col_transform
    # filter to only images that are within every folder
    globals()[f"df_{col_transform}"] = globals()[f"df_{col_transform}"].loc[globals()[f"df_{col_transform}"]['image_ID'].isin(images_within_all_folders),:]
    # concat dfs
    agg_df = pd.concat([agg_df, globals()[f"df_{col_transform}"]])
agg_df = agg_df.reset_index()
agg_df.drop(columns='index',inplace=True)

In [72]:
agg_df['image_links'] = ''
for row in range(0,len(agg_df)):
    agg_df.at[row,'image_links'] = [agg_df.loc[row,f'{col}_image_link'] for col in col_categories]

In [73]:
agg_df_dis = agg_df[agg_df['label'] == 1]
agg_df_no_dis = agg_df[agg_df['label'] == 0]

In [74]:
agg_df_no_dis = agg_df_no_dis.loc[agg_df_no_dis.index < len(agg_df_no_dis),:]

In [75]:
agg_df_balanced = pd.concat([agg_df_dis, agg_df_no_dis])

In [76]:
# output_df = agg_df[['label','radiographs_image_link','gaze_map_quantized_image_link','mask_image_link','teeth_mask_image_link','maxillomandibular_image_link']]
output_df = agg_df_balanced[['label','radiographs_image_link','gaze_map_quantized_image_link','mask_image_link','teeth_mask_image_link','maxillomandibular_image_link']]

In [77]:
# output_df.to_csv('output.csv',index=False)
output_df.to_csv(f'{cwd}/outputs/aws-output-balanced-liberal-clean.csv',index=False)

Inspect individual images

In [78]:
len(output_df)

4272

In [79]:
agg_df[agg_df['image_ID'] == '42.JPG'][['radiographs_image_link','mask_image_link']]

Unnamed: 0,radiographs_image_link,mask_image_link


In [80]:
output_df.sort_values(by='radiographs_image_link').head(50)

Unnamed: 0,label,radiographs_image_link,gaze_map_quantized_image_link,mask_image_link,teeth_mask_image_link,maxillomandibular_image_link
112,1,https://w210-32iq.s3.amazonaws.com/radiographs/1.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1.jpg,https://w210-32iq.s3.amazonaws.com/mask/1.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1.jpg
865,0,https://w210-32iq.s3.amazonaws.com/radiographs/1000.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1000.jpg,https://w210-32iq.s3.amazonaws.com/mask/1000.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1000.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1000.jpg
876,1,https://w210-32iq.s3.amazonaws.com/radiographs/1001.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1001.jpg,https://w210-32iq.s3.amazonaws.com/mask/1001.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1001.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1001.jpg
852,1,https://w210-32iq.s3.amazonaws.com/radiographs/1002.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1002.jpg,https://w210-32iq.s3.amazonaws.com/mask/1002.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1002.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1002.jpg
841,1,https://w210-32iq.s3.amazonaws.com/radiographs/1004.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1004.jpg,https://w210-32iq.s3.amazonaws.com/mask/1004.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1004.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1004.jpg
845,0,https://w210-32iq.s3.amazonaws.com/radiographs/1007.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1007.jpg,https://w210-32iq.s3.amazonaws.com/mask/1007.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1007.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1007.jpg
870,1,https://w210-32iq.s3.amazonaws.com/radiographs/1009.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1009.jpg,https://w210-32iq.s3.amazonaws.com/mask/1009.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1009.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1009.jpg
869,1,https://w210-32iq.s3.amazonaws.com/radiographs/1009.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1009.jpg,https://w210-32iq.s3.amazonaws.com/mask/1009.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1009.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1009.jpg
412,0,https://w210-32iq.s3.amazonaws.com/radiographs/101.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/101.jpg,https://w210-32iq.s3.amazonaws.com/mask/101.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/101.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/101.jpg
847,1,https://w210-32iq.s3.amazonaws.com/radiographs/1010.jpg,https://w210-32iq.s3.amazonaws.com/gaze_map_quantized/1010.jpg,https://w210-32iq.s3.amazonaws.com/mask/1010.jpg,https://w210-32iq.s3.amazonaws.com/teeth_mask/1010.jpg,https://w210-32iq.s3.amazonaws.com/maxillomandibular/1010.jpg
