In [1]:
import pandas as pd
import os
import shutil

In [2]:
imageDir = '../Images/'
csvDir = '../CSV'
basePathPFR = '../PFR_Data/'
basePathFuel = '../FuelTypeData/'
splitCsv = 'csv'
TRAIN = 'train'
TEST = 'test'
VALD = 'vald'

In [3]:
def dataLoader(imageDir,csvFile,classBased=True,sampleRatio=0.60,splitClass="PFRType"):
    
    trainSplitFactor = sampleRatio
    testSplitFactor = valdSplitFactor = (1-sampleRatio)/2
    
    train = pd.DataFrame()
    test = pd.DataFrame()
    vald = pd.DataFrame()
    
    csv = os.path.join(csvDir,csvFile)
    if os.path.isfile(csv) == False:
        exit(1)
    if os.path.isdir(imageDir) == False:
        exit(1)
    
    splits = ['train','test','vald']
    
    
    csvData = pd.read_csv(csv)
    dataShuffled = csvData.sample(frac=1).reset_index(drop=True) #Shuffle data
    
    if classBased:
        
        if splitClass not in ['PFRType','FuelType']:
            print('ERROR: {sc} is not a valid spliClass input'.format(sc=splitClass))
            exit(1)
            
        if splitClass == "PFRType":
            basePath = basePathPFR
        elif splitClass == "FuelType":
            basePath = basePathFuel
            
        #Class based train test split
        classes = csvData[splitClass].unique()

        for typeClass in classes:
            
            print('\n\nProcessing Class {cl}'.format(cl=typeClass))
            
            imageTrainPath = os.path.join(basePath,TRAIN)
            imageTestPath = os.path.join(basePath,TEST)
            
            classData = dataShuffled[dataShuffled[splitClass] == typeClass]
            classData = classData.sample(frac=1).reset_index(drop=True) #Shuffle data
            
            total = classData.shape[0]
            trainLen = int(total*trainSplitFactor)
            valdLen = int(total*valdSplitFactor)
            testLen = int(total*testSplitFactor)
            
            print("\tTotal: {ttl}, Train: {tr}, Test: {te}, Vald: {vld}".format(ttl=total,tr=trainLen,te=testLen,vld = valdLen))
            trainData = classData[0:trainLen]
            valdData = classData[trainLen+1: (trainLen+valdLen)]
            testData = classData[(trainLen+valdLen)+1:(trainLen+valdLen+testLen)]
            
            imagesTrain = trainData['Image'].tolist()
            imagesVald = valdData['Image'].tolist()
            imagesTest = testData['Image'].tolist()
            
            for sp in splits:
                print('\t\tCopying files to',sp)
                
                if sp =='train':
                    images = imagesTrain
                    tLen = trainLen
                    data = trainData
                    train = pd.concat([train,data])
                elif sp =='test':
                    images = imagesTest
                    tLen = testLen
                    data = testData
                    test = pd.concat([test,data])
                elif sp == 'vald':
                    images = imagesVald
                    vLen = valdLen
                    data = valdData
                    vald = pd.concat([vald,data])
                    
                cpCount = 0
                for image in images:
                    copyFrom = os.path.join(imageDir,image+'.jpg')
                    copyTo = os.path.join(basePath,sp+'/',typeClass+'/',image+'.jpg')
                    
                    nestedDir = os.path.join(basePath,sp+'/',typeClass+'/')
                    if not os.path.exists(nestedDir):
                        os.makedirs(nestedDir)
                        
                    shutil.copy(copyFrom, copyTo)
                    print('\t\tCopied {cp} of {ttl}'.format(cp=cpCount,ttl=tLen),end="\r")
                    cpCount += 1
        
        CSVPath = os.path.join(basePath,splitCsv)
        if not os.path.exists(CSVPath+'/'):
            os.makedirs(CSVPath+'/')      
        
        train.to_csv(os.path.join(CSVPath+'/','train.csv'))
        test.to_csv(os.path.join(CSVPath+'/','test.csv'))
        vald.to_csv(os.path.join(CSVPath+'/','vald.csv'))
            
                        
    else:
        #means user doesnt want sampling for each class, just randomize data and split according to sampleRatio
        pass

In [4]:
dataLoader(imageDir,'cleanedData.csv',True,sampleRatio=0.70,splitClass="PFRType")



Processing Class 0
	Total: 5776, Train: 4043, Test: 866, Vald: 866
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 864 of 866

Processing Class 6-9
	Total: 1027, Train: 718, Test: 154, Vald: 154
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 152 of 154

Processing Class 10-12
	Total: 1066, Train: 746, Test: 159, Vald: 159
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 157 of 159

Processing Class 1-3
	Total: 518, Train: 362, Test: 77, Vald: 77
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 75 of 77

Processing Class 60+
	Total: 267, Train: 186, Test: 40, Vald: 40
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 38 of 40

Processing Class 30-40
	Total: 471, Train: 329, Test: 70, Vald: 70
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 68 of 70

Processing Class 20-25
	Total: 430, Tr

In [5]:
dataLoader(imageDir,'cleanedData.csv',True,sampleRatio=0.70,splitClass="FuelType")



Processing Class 20H280NG
	Total: 2322, Train: 1625, Test: 348, Vald: 348
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 346 of 348

Processing Class F2
	Total: 1051, Train: 735, Test: 157, Vald: 157
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 155 of 157

Processing Class Ethlyne
	Total: 1932, Train: 1352, Test: 289, Vald: 289
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 287 of 289

Processing Class NG
	Total: 4979, Train: 3485, Test: 746, Vald: 746
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 744 of 746

Processing Class F1
	Total: 415, Train: 290, Test: 62, Vald: 62
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 60 of 62

In [6]:
checkpoint_template = os.path.join('../Models/PFRModel/checkpoint/', "pfrNet_{epoch:03d}_{val_loss:.2f}.hdf5")

In [7]:
checkpoint_template


'../Models/PFRModel/checkpoint/pfrNet_{epoch:03d}_{val_loss:.2f}.hdf5'