In [2]:
import pandas as pd
import os
import shutil

In [3]:
imageDir = '../Images/'
csvDir = '../CSV'
basePathPFR = '../PFR_Data/'
basePathFuel = '../FuelTypeData/'
splitCsv = 'csv'
TRAIN = 'train'
TEST = 'test'
VALD = 'vald'

In [6]:
def dataLoader(imageDir,csvFile,classBased=True,sampleRatio=0.60,splitClass="PFRType"):
    
    trainSplitFactor = sampleRatio
    testSplitFactor = valdSplitFactor = (1-sampleRatio)/2
    
    train = pd.DataFrame()
    test = pd.DataFrame()
    vald = pd.DataFrame()
    
    csv = os.path.join(csvDir,csvFile)
    if os.path.isfile(csv) == False:
        exit(1)
    if os.path.isdir(imageDir) == False:
        exit(1)
    
    splits = ['train','test','vald']
    
    
    csvData = pd.read_csv(csv)
    dataShuffled = csvData.sample(frac=1).reset_index(drop=True) #Shuffle data
    
    if classBased:
        
        if splitClass not in ['PFRType','FuelType']:
            print('ERROR: {sc} is not a valid spliClass input'.format(sc=splitClass))
            exit(1)
            
        if splitClass == "PFRType":
            basePath = basePathPFR
        elif splitClass == "FuelType":
            basePath = basePathFuel
            
        #Class based train test split
        classes = csvData[splitClass].unique()

        for typeClass in classes:
            
            print('\n\nProcessing Class {cl}'.format(cl=typeClass))
            
            imageTrainPath = os.path.join(basePath,TRAIN)
            imageTestPath = os.path.join(basePath,TEST)
            
            classData = dataShuffled[dataShuffled[splitClass] == typeClass]
            classData = classData.sample(frac=1).reset_index(drop=True) #Shuffle data
            
            total = classData.shape[0]
            trainLen = int(total*trainSplitFactor)
            valdLen = int(total*valdSplitFactor)
            testLen = int(total*testSplitFactor)
            
            print("\tTotal: {ttl}, Train: {tr}, Test: {te}, Vald: {vld}".format(ttl=total,tr=trainLen,te=testLen,vld = valdLen))
            trainData = classData[0:trainLen]
            valdData = classData[trainLen+1: (trainLen+valdLen)]
            testData = classData[(trainLen+valdLen)+1:(trainLen+valdLen+testLen)]
            
            imagesTrain = trainData['Image'].tolist()
            imagesVald = valdData['Image'].tolist()
            imagesTest = testData['Image'].tolist()
            
            for sp in splits:
                print('\t\tCopying files to',sp)
                
                if sp =='train':
                    images = imagesTrain
                    tLen = trainLen
                    data = trainData
                    train = pd.concat([train,data])
                elif sp =='test':
                    images = imagesTest
                    tLen = testLen
                    data = testData
                    test = pd.concat([test,data])
                elif sp == 'vald':
                    images = imagesVald
                    vLen = valdLen
                    data = valdData
                    vald = pd.concat([vald,data])
                    
                cpCount = 0
                for image in images:
                    copyFrom = os.path.join(imageDir,image+'.jpg')
                    copyTo = os.path.join(basePath,sp+'/',typeClass+'/',image+'.jpg')
                    
                    nestedDir = os.path.join(basePath,sp+'/',typeClass+'/')
                    if not os.path.exists(nestedDir):
                        os.makedirs(nestedDir)
                        
                    shutil.copy(copyFrom, copyTo)
                    print('\t\tCopied {cp} of {ttl}'.format(cp=cpCount,ttl=tLen),end="\r")
                    cpCount += 1
        
        CSVPath = os.path.join(basePath,splitCsv)
        if not os.path.exists(CSVPath+'/'):
            os.makedirs(CSVPath+'/')      
        
        train.to_csv(os.path.join(CSVPath+'/','train.csv'))
        test.to_csv(os.path.join(CSVPath+'/','test.csv'))
        vald.to_csv(os.path.join(CSVPath+'/','vald.csv'))
            
                        
    else:
        #means user doesnt want sampling for each class, just randomize data and split according to sampleRatio
        pass

In [7]:
dataLoader(imageDir,'cleanedData.csv',True,sampleRatio=0.80,splitClass="PFRType")



Processing Class 0
	Total: 5776, Train: 4620, Test: 577, Vald: 577
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 575 of 577

Processing Class 6-9
	Total: 1027, Train: 821, Test: 102, Vald: 102
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 100 of 102

Processing Class 10-12
	Total: 1066, Train: 852, Test: 106, Vald: 106
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 104 of 106

Processing Class 1-3
	Total: 518, Train: 414, Test: 51, Vald: 51
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 49 of 51

Processing Class 60+
	Total: 267, Train: 213, Test: 26, Vald: 26
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 24 of 26

Processing Class 30-40
	Total: 471, Train: 376, Test: 47, Vald: 47
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 45 of 47

Processing Class 20-25
	Total: 430, Tr

In [8]:
dataLoader(imageDir,'cleanedData.csv',True,sampleRatio=0.80,splitClass="FuelType")



Processing Class 20H280NG
	Total: 2322, Train: 1857, Test: 232, Vald: 232
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 230 of 232

Processing Class F2
	Total: 1051, Train: 840, Test: 105, Vald: 105
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 103 of 105

Processing Class Ethlyne
	Total: 1932, Train: 1545, Test: 193, Vald: 193
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 191 of 193

Processing Class NG
	Total: 4979, Train: 3983, Test: 497, Vald: 497
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 495 of 497

Processing Class F1
	Total: 415, Train: 332, Test: 41, Vald: 41
		Copying files to train
		Copying files to test
		Copying files to vald
		Copied 39 of 41