In [1]:
import pandas as pd
from PIL import Image # used for loading images 
import numpy as np
import os # used for navigating to image path
import imageio # used for writing images
import natsort
from natsort import natsorted
import re # for matching image file name classes
import matplotlib.pyplot as plt
import random
import ntpath
import shutil
import csv
from timeit import default_timer as timer
import platform

In [2]:
RAW_IMAGE_DIR = '../../data/raw/Pictures for AI'
RAW_IMAGE_DIR_SUMMER = '../../data/raw/Summer 2021 AI Photos'
RAW_IMAGE_DIR_FALL = '../../data/raw/Likelihood of Failure Images/'
TIDY_IMAGE_TEST_DIR = '../../data/tidy/labeled-images-test/'
TIDY_IMAGE_DIR = '../../data/tidy/labeled-images/'
INDEX_DIR = '../../results/index-raw/'
INDEX_TEST_DIR = '../../results/index-raw-test/'
TRIM_LIST = RAW_IMAGE_DIR_FALL+'likelihood.csv'
NO_TRAILING = RAW_IMAGE_DIR_FALL+'Likelihood_of_failure_no_trailing.csv'

In [3]:
def getListOfFiles(dirName):
    """Returns single list of the filepath of each of the image files"""
    # source: https://thispointer.com/python-how-to-get-list-of-files-in-directory-and-sub-directories/
    allFiles = list()
    if 'Pictures for AI' or 'Likelihood of Failure Images' in dirName: # 
        listOfFile = os.listdir(dirName)
        # Iterate over all the entries
        for entry in listOfFile:
            # Create full path
            fullPath = os.path.join(dirName, entry)
            # If entry is a directory then get the list of files in this directory 
            if os.path.isdir(fullPath):
                allFiles = allFiles + getListOfFiles(fullPath)
            else:
                allFiles.append(fullPath)
    else:
        level_one = [f.path for f in os.scandir(dirName) if f.is_dir()] # get all of the immediate subdirectories in the parent folder downloaded from SharePoint
        for subdir in level_one:
            level_two = [f.path for f in os.scandir(subdir) if f.is_dir()] # get all of the immediate subdirectories for each arborist
            print(level_two)
            for subdir2 in level_two:
                if ('Away' in subdir2): # do not include images taken away powerlines
                    print('Ignoring images away from power lines')
                    #break
                else:
                    listOfFile = os.listdir(subdir2) # retain only images taken near power lines
                    # Iterate over all the entries
                    for entry in listOfFile:
                        # Create full path
                        fullPath = os.path.join(subdir2, entry)
                        # If entry is a directory then get the list of files in this directory 
                        if os.path.isdir(fullPath):
                            allFiles = allFiles + getListOfFiles(fullPath)
                        else:
                            allFiles.append(fullPath)
    return allFiles

In [4]:
def splitIndexDescrArb(images):
    """Split the filepaths of the images taken in Summer 2021 a list of indices, descriptions and arborists"""
    index_list = []
    description_list = []
    arborist_list = []
    for fn in images:
        print('fn: ' + fn)
        if (platform.system() == 'Windows'):
            if ('_' in fn):
                index = fn.split('_')[0].split('\\')[-1]
                description = fn.split('_')[1]
                arborist = fn.split('_')[0].split('\\')[1].split('/')[-1]
            else:
                index = fn.split('-')[0].split('\\')[-1].rstrip(' ')
                description = fn.split('-')[1]
                arborist = fn.split('-')[0].split('\\')[1].split('/')[-1]        
        else: #Unix/Linux systems
            if ('_' in fn):
                index = fn.split('_')[0].split('/')[-1]
                print("Index" + index)
                description = fn.split('_')[1]
                arborist = fn.split('_')[0].split('/')[-3] #.split('/')[-2]
                print(arborist)
            else:
                index = fn.split('-')[0].split('/')[-1].rstrip(' ')
                description = fn.split('-')[1]
                arborist = fn.split('-')[0].split('/')[-3]                    
        index = int(index)
        index_list.append(index)
        description_list.append(description)
        arborist_list.append(arborist)
    return index_list, description_list, arborist_list

In [5]:
def getUniqueImages(images, arb_list, idx_label, descr):
    """Get a list of unique Summer 2021 images by ignoring close-ups. Only selects an image with the classification present in the label."""
    df = pd.DataFrame([images, arb_list, idx_label, descr])
    df = df.transpose()
    df.columns=['Filename','Arborist', 'Index_Label', 'Description']
    unique_image_list = []
    for arborist in df.Arborist.unique():
        print(arborist)
        dfsub = df.loc[df.Arborist == arborist]
        for index in dfsub.Index_Label.unique():
            dfsub2 = dfsub.loc[dfsub.Index_Label == index]
            if len(dfsub2) == 1:
                selected_filename = dfsub2.Filename.tolist()[0]
            else:
                good_index_list = []
                for r in dfsub2.index:
                    if 'closeup' not in dfsub2.loc[r, 'Description']:
                        l = ['probable', 'possible', 'improbable']
                        if [i for i in l if re.findall(i, dfsub2.loc[r, 'Description'], re.IGNORECASE)]: # check if a classification is included in the file name
                            good_index_list.append(r)
                print(good_index_list)
                if len(good_index_list) == 1:
                    selected_filename = dfsub2.loc[good_index_list, 'Filename'].tolist()[0]
                else:
                    testdf = dfsub2.loc[good_index_list]
                    testdf['DescriptionLength'] = testdf.Description.str.len()
                    for i in good_index_list:
                        selected_filename = testdf.loc[testdf.DescriptionLength.idxmin(), 'Filename']
            print(selected_filename)                
            unique_image_list.append(selected_filename)
    return unique_image_list

In [6]:
def trimTrailingChars(data):
    """Converts original excel file containing classification categories for Fall 2021 images to csv"""
    read_file = pd.read_excel(data)
    read_file.to_csv(TRIM_LIST, index = None, header=True)
    read_file = pd.read_csv(TRIM_LIST)
    read_file['Likelihood of Failure Rating'] = read_file['Likelihood of Failure Rating'].str.replace('\xa0','')
    df = read_file.rename({"Image":"image", "File Name":"file_name", "Likelihood of Failure Rating": "likelihood_of_failure_rating"}, axis='columns')
    df.to_csv(NO_TRAILING, index = None, header=True)
    print(df.head())
    print(df['likelihood_of_failure_rating'].unique())
    if os.path.isfile(TRIM_LIST):
        os.remove(TRIM_LIST)

In [7]:
def encrypt(file_name):
    """Cryptographically encrypts each failure likelihood category using Python's built-in hash function"""
    if any(re.findall(r'improbable', file_name, re.IGNORECASE)):
        h = hash('improbable')
    elif any(re.findall(r'probable', file_name, re.IGNORECASE)):
        h = hash('probable')
    elif any(re.findall(r'possible', file_name, re.IGNORECASE)):
        h = hash('possible')
    else:
        h = hash('unknown')
    return h

In [8]:
def updateNameCount(word, d):
    """Creates a uniform label for all saved images and updates the global count"""
    h  = encrypt(word)
    save_name = ''
    for key in d.keys():
        if h == hash(key):
            save_name = TIDY_IMAGE_TEST_DIR + key + '-' + str(d[key]) + '.jpg'
            d[key] += 1
    return save_name

In [9]:
def saveImageFiles(image_file_list):
    """Serially labels all images by class:  and saves them to the designated tidy image directory."""
    counts = {'improbable':1,
                'possible':1,
                'probable':1,
                'unknown':1}
    csv_col_index = ['Labeled Image', 'Original File Path']
    index = {}
    shutil.rmtree(TIDY_IMAGE_TEST_DIR, ignore_errors=True) # Deletes the directory containing any existing labeled images
    shutil.rmtree(INDEX_TEST_DIR, ignore_errors=True)
    if not os.path.exists(TIDY_IMAGE_TEST_DIR):
        os.makedirs(TIDY_IMAGE_TEST_DIR)
    for filename in image_file_list:
        if '.JPG'.casefold() in filename.casefold():
            save_name = updateNameCount(filename, counts)
            imageio.imwrite(save_name, np.array(Image.open(filename)))
            index[save_name] = filename
        if '.HEIC'.casefold() in filename.casefold():
            df = pd.read_csv(NO_TRAILING)
            for row in df.itertuples():
                image_name = row.file_name
                rating = row.likelihood_of_failure_rating
                if image_name == filename.split("Failure Images/",1)[1]:
                    save_name = updateNameCount(rating, counts)
                    shutil.copy2(filename, save_name)
                    continue                    
            index[save_name] = filename
    if not os.path.exists(INDEX_TEST_DIR):
        os.makedirs(INDEX_TEST_DIR)
    with open(INDEX_TEST_DIR + 'labels_index.csv', 'w', newline='') as f: # TODO: separate by tab not comma
        writer = csv.DictWriter(f, fieldnames=csv_col_index)
        writer.writeheader()
        for key in natsort.natsorted(index.keys()): # iterate through the alphanumeric keys in a natural order
            key_name = key.replace(TIDY_IMAGE_TEST_DIR,'')
            val_name = index[key]
            f.write("%s,%s\n"%(key_name,val_name))
    
    print('Number of improbable images:', counts['improbable']-1)    
    print('Number of possible images:', counts['possible']-1)
    print('Number of probable images:', counts['probable']-1)
    print('Number of unknown images:', counts['unknown']-1)
    print('Total number of classified images:', counts['improbable']+counts['possible']+counts['probable']-3)

In [None]:
### PREVIOUS IMPLEMENTATION
def saveImageFiles(image_file_list): 
    """Serially labels all images by class:  and saves them to the designated tidy image directory."""
    
    improbable_counter = 1
    possible_counter = 1
    probable_counter = 1
    if not os.path.exists(TIDY_IMAGE_DIR):
        os.makedirs(TIDY_IMAGE_DIR)
    for filename in image_file_list:
        if '.JPG' in filename or '.jpg' in filename:        
            if any(re.findall(r'improbable', filename, re.IGNORECASE)):
                save_name = TIDY_IMAGE_DIR + 'improbable' + '-' + str(improbable_counter) + '.jpg'
                improbable_counter += 1
            elif any(re.findall(r'probable', filename, re.IGNORECASE)):
                save_name = TIDY_IMAGE_DIR + 'probable' + '-' + str(probable_counter) + '.jpg'
                probable_counter += 1 
            elif any(re.findall(r'possible', filename, re.IGNORECASE)):
                save_name = TIDY_IMAGE_DIR + 'possible' + '-' + str(possible_counter) + '.jpg'
                possible_counter += 1 
            imageio.imwrite(save_name, np.array(Image.open(filename)))
    print('Number of improbable images saved:', improbable_counter - 1)    
    print('Number of possible images saved:', possible_counter - 1)
    print('Number of probable images saved:', probable_counter - 1)

In [10]:
trimTrailingChars(RAW_IMAGE_DIR_FALL+'Likelihood of Failure Images.xlsx')
original_photos = getListOfFiles(RAW_IMAGE_DIR)
summer_photos = getListOfFiles(RAW_IMAGE_DIR_SUMMER)
fall_photos = getListOfFiles(RAW_IMAGE_DIR_FALL)
index_list, description_list, arborist_list = splitIndexDescrArb(summer_photos)
unique_image_list_summer = getUniqueImages(summer_photos, arborist_list, index_list, description_list)

   image      file_name likelihood_of_failure_rating
0  936.0  IMG_0936.HEIC                     Possible
1  937.0  IMG_0937.HEIC                     Probable
2  938.0  IMG_0938.HEIC                     Probable
3  939.0  IMG_0939.HEIC                   Improbable
4  940.0  IMG_0940.HEIC                   Improbable
['Possible' 'Probable' 'Improbable' 'probable' '2 improbable'
 'Improbable (catalpa)' 'Imminent'
 'From left to right: Improbable, Possible'
 'From left to right: improbable, probable, imminent' '2 probable'
 'Imminent (same as IMG_1134.HEIC)' nan]
fn: ../../data/raw/Summer 2021 AI Photos\Amanda\Near power lines\15_probable.jpg
fn: ../../data/raw/Summer 2021 AI Photos\Amanda\Near power lines\15_probable_crack.jpg
fn: ../../data/raw/Summer 2021 AI Photos\Amanda\Near power lines\16_possible.jpg
fn: ../../data/raw/Summer 2021 AI Photos\Amanda\Near power lines\17_improbable.jpg
fn: ../../data/raw/Summer 2021 AI Photos\Amanda\Near power lines\18_possible.jpg
fn: ../../data/raw/S

In [11]:
saveImageFiles(original_photos+unique_image_list_summer+fall_photos)

Number of improbable images: 515
Number of possible images: 178
Number of probable images: 155
Number of unknown images: 40
Total number of classified images: 848
