In [1]:
import os, json, re, logging, time, datetime, sys
from os.path import join, getsize
## Windows stuff
import win32api
import win32con
import win32security
import win32file

In [2]:
## A function to get a file user on windows
def getOwner(filename):
    """Return the account name that owns ``filename``.

    The file is first opened for reading as an accessibility probe; only if
    that succeeds is the owner looked up through ``win32security``.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The owner's account name, or the placeholder ``'NILACS'`` when the file
        cannot be opened or the owner lookup fails for any reason.
    """
    try:
        ## Probe that the file can be opened for reading; the handle is released straight away
        with open(filename, "r"):
            pass
        sd = win32security.GetFileSecurity(filename, win32security.OWNER_SECURITY_INFORMATION)
        owner_sid = sd.GetSecurityDescriptorOwner()
        ## The lookup yields a (name, domain, type) triple; only the name is used
        name, _domain, _sid_type = win32security.LookupAccountSid(None, owner_sid)
    except Exception:
        ## Best effort: any ordinary failure maps to the placeholder owner.
        ## KeyboardInterrupt / SystemExit are deliberately NOT caught so a long scan can be stopped.
        name = 'NILACS'
    return name

In [3]:
## A function to get a file size on Windows
def getFileSize(filename):
    """Return the size of ``filename`` as reported by ``win32file.GetFileSize``.

    Args:
        filename: Path of an existing file (opened with OPEN_EXISTING, read access,
            shared for reading).

    Returns:
        The value returned by ``win32file.GetFileSize`` for the opened handle.

    Raises:
        Whatever ``win32file.CreateFile`` or ``win32file.GetFileSize`` raise; the
        handle is closed before the error propagates.
    """
    source = win32file.CreateFile(filename,win32file.GENERIC_READ,win32file.FILE_SHARE_READ,None,win32file.OPEN_EXISTING,0,None)
    try:
        return win32file.GetFileSize(source)
    finally:
        ## Always release the handle, even when GetFileSize fails
        source.Close()

In [4]:
def getFileTime(filename):
    """Return the first timestamp of ``filename`` from ``win32file.GetFileTime`` as a string.

    Args:
        filename: Path of an existing file (opened with OPEN_EXISTING, read access,
            shared for reading).

    Returns:
        ``str()`` of element 0 of the ``win32file.GetFileTime`` result.
        NOTE(review): presumably this is the creation time - confirm against the
        pywin32 documentation.

    Raises:
        Whatever ``win32file.CreateFile`` or ``win32file.GetFileTime`` raise; the
        handle is closed before the error propagates.
    """
    source = win32file.CreateFile(filename,win32file.GENERIC_READ,win32file.FILE_SHARE_READ,None,win32file.OPEN_EXISTING,0,None)
    try:
        return str(win32file.GetFileTime(source)[0])
    finally:
        ## Always release the handle, even when GetFileTime fails
        source.Close()
#print(getFileTime('R:\\11. HeadsUp\\3. Other\\fake MBS\SYNTHETIC_MBS_1516.CSV'))

In [5]:
def getFileExtension(filename):
    """Return the extension of ``filename`` (leading dot included), or ``''`` when it has none."""
    _stem, extension = os.path.splitext(filename)
    return extension

In [6]:
## Setup the logger for debugging
logger = logging.getLogger('dir-size')
logger.setLevel(logging.DEBUG)
## logging.getLogger returns the same logger object on every call, so re-running this
## cell would otherwise stack a second file handler and write every log line twice.
## Detach and close whatever is already attached before adding a fresh handler.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
## Create file handler which logs even debug messages
fh = logging.FileHandler('dir-size.json')
fh.setLevel(logging.DEBUG)
# create formatter and add it to the handlers
# NOTE: the message is interpolated as-is; a message containing a double quote or a
# backslash produces a line that is not valid JSON.
formatter = logging.Formatter('{"time": "%(asctime)s" ,"name": "%(name)s" ,"level": "%(levelname)s" ,"message": "%(message)s"}')
fh.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.info('Start dir-size log')

In [7]:
## Current working directory of the kernel (the relative ./json-data/ and ./csvs/
## output paths used by getDirSize are resolved against it)
print(os.getcwd())

c:\GitHub\mortie23\dir-size


## Get Directory Sizes Recursive

In [8]:
def getDirSize(dirstart,fileInfoFlag,debugFlag):
    """Walk ``dirstart`` recursively and record one JSON summary per directory.

    For every directory yielded by ``os.walk`` a dict is built with the keys
    ``dirs`` (1-based position of the directory in walk order), ``numsep``
    (number of ``os.path.sep`` characters in its path), ``directory``,
    ``fileset``, ``bytes`` (sum of the collected file sizes) and ``files``
    (number of files directly inside the directory).

    Unless ``debugFlag`` is truthy the dict is written to
    ``./json-data/<contract>.<index>.json`` and the line
    ``<index>,<contract>,<json path>`` is appended to ``./csvs/<contract>.csv``,
    where ``<contract>`` is ``dirstart`` with every non-alphanumeric character
    removed. Both output directories are created when they do not exist.

    Args:
        dirstart: Root directory to walk.
        fileInfoFlag: When truthy (and not debugging) collect name, extension,
            size, owner and time for every file. A file whose details cannot be
            read is recorded as ``{"name": <name>, "exception": 1}``.
        debugFlag: When truthy only walk and log; no file info is gathered and
            nothing is written to disk.

    Returns:
        None. Start/end markers and the final counts go to the module ``logger``.
    """
    dirstart_contract = re.sub('[^A-Za-z0-9]+', '', dirstart)
    print('getDirSize: ',dirstart_contract)
    ## A couple of log lines to get timing of runs
    logger.info('START - getDirSize')
    logger.info(dirstart_contract)
    if not debugFlag:
        ## Make sure the output locations exist before the first write
        os.makedirs("./json-data/", exist_ok=True)
        os.makedirs("./csvs/", exist_ok=True)
    ## dir_count is the 1-based index of the directory being processed; after the
    ## loop it therefore equals the number of directories visited (0 if none)
    dir_count = 0
    file_count = 0
    for dir_count, (root, _dirs, files) in enumerate(os.walk(dirstart), start=1):
        numsep = root.count(os.path.sep)
        jsonLine={}
        jsonLine["dirs"]=dir_count
        jsonLine["numsep"]=numsep
        jsonLine["directory"]=root
        ## Initialise the file info
        fileset = []
        bytes_size=0
        ## Get the file info only when not in debug mode.
        ## 32bit Python windows does not like really long paths: there is no explicit
        ## length check, a file that cannot be read is simply recorded as an exception
        if fileInfoFlag and not debugFlag:
            for name in files:
                file_count += 1
                print('dircnt: {0}, filecnt: {1}, dir: {2}, file: {3}                                                        \r'
                      .format(dir_count, file_count, dirstart_contract, name), end="")
                try:
                    path = join(root, name)
                    size = getFileSize(path)
                    owner = getOwner(path)
                    file_time = getFileTime(path)
                    ext = getFileExtension(path)
                    fileset.append({"name":name, "ext": ext, "size":size, "owner": owner, "datetime": file_time})
                    bytes_size += size
                except Exception:
                    ## Best effort: keep walking, but flag the file that failed
                    fileset.append({"name":name, "exception": 1})
        jsonLine["fileset"]=fileset
        jsonLine["bytes"]=bytes_size
        jsonLine["files"]=len(files)
        if not debugFlag:
            ## Create new file for this directory and register it in the per-run CSV index
            json_path = "./json-data/" + dirstart_contract + '.' + str(dir_count) + '.json'
            with open(json_path, "w") as fProc:
                fProc.write(json.dumps(jsonLine))
            with open("./csvs/" + dirstart_contract + '.csv', "a+") as fCsv:
                fCsv.write(','.join([str(dir_count),dirstart_contract,json_path+"\n"]))
    ## A couple of log lines to get timing of runs
    print()
    message = 'getDirSize('+dirstart_contract+') END dir_count('+str(dir_count)+') file_count('+str(file_count)+')'
    logger.info(message)

In [15]:
## Unit test on small directory structure
## Arguments: fileInfoFlag=1 (collect per-file details), debugFlag=0 (write the JSON/CSV output)
dirstart = "C:\\Users\\chris\\Downloads\\dir-size-test"
getDirSize(dirstart,1,0)

getDirSize:  CUserschrisDownloadsdirsizetest
dircnt: 3, filecnt: 3, dir: CUserschrisDownloadsdirsizetest, file: test3.txt                                                        


# Loop Files
A function to loop over every entry (file or sub-directory) directly inside a directory and call the getDirSize function on each one

In [11]:
def loopFiles(dirname,debugFlag):
    """Call ``getDirSize`` on every entry directly inside ``dirname``.

    Every name returned by ``os.listdir`` is passed on, plain files included.
    File info collection is always requested (``fileInfoFlag=1``).

    Args:
        dirname: Directory whose immediate entries are processed; a trailing
            path separator is tolerated.
        debugFlag: When truthy, print each entry name and run ``getDirSize`` in
            debug mode (``debugFlag=1``); otherwise run it with ``debugFlag=0``.

    Returns:
        None.
    """
    for filename in os.listdir(dirname):
        if debugFlag:
            print('loopFiles: ',filename)
        ## os.path.join uses the platform separator and does not double it when
        ## dirname already ends with one, keeping getDirSize's separator count consistent
        getDirSize(os.path.join(dirname, filename), 1, 1 if debugFlag else 0)

In [13]:
## Run for whole user directory in debug mode
## This will log the number of directories under each top-level entry
## (in debug mode no file info is collected and no output files are written)
dirname = "C:\\Users\\chris\\Downloads"
#loopFiles(dirname,1)

## Now run
This will run a full collection of the section drive

In [16]:
## Run for whole team directory (debugFlag=0: file info is collected and JSON/CSV output is written)
## NOTE(review): the path below is a user's Downloads folder rather than a team
## directory - confirm the intended target before enabling the call
dirname = "C:\\Users\\chris\\Downloads\\"
#loopFiles(dirname,0)