# Setup

In [1]:
import os
from datetime import datetime
import hashlib
import csv

Next, set up information for where to find the files (directory path)

In [2]:
# Root folder of the sample files, joined portably so the separator matches the OS.
walk_this_directory = os.path.join(
    'data',
    'webfiles-samples',
)

# Show the resulting relative path as a quick sanity check.
print(walk_this_directory)

data/webfiles-samples


Next, gather the file information for each file in a single directory, mostly as we did last week. Note that this time each file's information is stored in a dictionary:

In [4]:
# Folder that holds only the audio samples.
path_to_audio_dir = os.path.join('data', 'webfiles-samples', 'audio')

# os.scandir() yields one os.DirEntry per item; the os.path helpers accept it directly.
with os.scandir(path_to_audio_dir) as entries:
    for entry in entries:
        name = os.path.basename(entry)
        suffix = os.path.splitext(entry)[1]   # second item is the extension, dot included
        byte_count = os.path.getsize(entry)
        full_path = os.path.abspath(entry)

        # One dictionary per file (the modify_datetime field is left out in this cell).
        file_info = dict(
            filename=name,
            extension=suffix,
            size=byte_count,
            absolute_path=full_path,
        )
        print(file_info)


{'filename': '11-3250JohnsonvFolinoEtAl.wma', 'extension': '.wma', 'size': 21423499, 'absolute_path': '/Users/jajohnst/Desktop/networked-services-labs-2022/data/webfiles-samples/audio/11-3250JohnsonvFolinoEtAl.wma'}
{'filename': 'NEWSLINE_802AF71F439D401585C6FCB02F358307.mp3', 'extension': '.mp3', 'size': 961195, 'absolute_path': '/Users/jajohnst/Desktop/networked-services-labs-2022/data/webfiles-samples/audio/NEWSLINE_802AF71F439D401585C6FCB02F358307.mp3'}
{'filename': 'mj_telework_exchange_final_100710.mp3', 'extension': '.mp3', 'size': 3471488, 'absolute_path': '/Users/jajohnst/Desktop/networked-services-labs-2022/data/webfiles-samples/audio/mj_telework_exchange_final_100710.mp3'}
{'filename': '000727.ram', 'extension': '.ram', 'size': 79, 'absolute_path': '/Users/jajohnst/Desktop/networked-services-labs-2022/data/webfiles-samples/audio/000727.ram'}


This can similarly be accomplished using `os.walk()`, which also visits every subfolder:

In [None]:
walk_this_directory = os.path.join('data','webfiles-samples')

file_count = 0

# Visit every folder under walk_this_directory and get information about each of the files.
for folderName, subfolders, filenames in os.walk(walk_this_directory):

    for filename in filenames:
        file_count += 1
        folder = folderName
        # os.walk() yields bare file names, so join with the folder to get a usable path
        path = os.path.join(folderName, filename)
        # resolve the joined path: abspath(filename) alone would be resolved against the
        # current working directory and drop the folder the file actually lives in
        absolutePath = os.path.abspath(path)
        size = os.path.getsize(path)
        # uncomment to print file information, or check that things look correct
        #print('Found:', filename, folder, path, size,'\n',absolutePath)

print(f'Counted {file_count} files')
# Note: os.walk() never lists the special entries . and .. ;
# hidden files such as .DS_Store are still visited and counted.

Next, how to get the checksum. This one is a bit tricky because you need to hash the file's contents, read as bytes in binary mode, and not the string that holds the file path. You can reuse or adapt the following function:

In [5]:
def get_checksum(filePath, checksum_type):
    '''This is a helper function to create a checksum. 
    In this example we will focus on MD5, which can be used to check data integrity.
    
    The filePath value argument be a string representing a valid path.
    The checksum_type argument should be a valid type of checksum.
    
    The function returns the string of characters for an MD5 or SHA256 checksum.
    The is function only allows you to create MD5 or SHA 256 and will result in an error for other types.'''
    checksum_type = checksum_type.lower().replace(' ', '')

    with open(filePath, 'rb') as f:
        bytes = f.read()
        if checksum_type == 'md5':
            hash_string = hashlib.md5(bytes).hexdigest()
        elif checksum_type == 'sha256':
            hash_string = hashlib.sha256(bytes).hexdigest()
        else:
            Raise('{} is not a hash function supported by this program. You must ask for MD5.')
    return hash_string

In [7]:
# Same directory scan as before, now with the modification time and both checksums.
path_to_audio_dir = os.path.join('data', 'webfiles-samples', 'audio')

with os.scandir(path_to_audio_dir) as entries:
    for entry in entries:
        name = os.path.basename(entry)
        suffix = os.path.splitext(entry)[1]
        byte_count = os.path.getsize(entry)
        # getmtime() returns seconds since the epoch; convert to a local datetime,
        # then format it as an ISO-8601 style string
        modified = datetime.fromtimestamp(os.path.getmtime(entry))
        modified_text = modified.strftime("%Y-%m-%dT%H:%M:%S")
        full_path = os.path.abspath(entry)
        md5_value = get_checksum(entry, 'md5')
        sha256_value = get_checksum(entry, 'sha256')

        file_info = dict(
            filename=name,
            extension=suffix,
            size=byte_count,
            modify_datetime=modified_text,
            absolute_path=full_path,
            md5_checksum=md5_value,
            sha256_checksum=sha256_value,
        )
        print(file_info)

{'filename': '11-3250JohnsonvFolinoEtAl.wma', 'extension': '.wma', 'size': 21423499, 'modify_datetime': '2022-09-05T21:59:36', 'absolute_path': '/Users/jajohnst/Desktop/networked-services-labs-2022/data/webfiles-samples/audio/11-3250JohnsonvFolinoEtAl.wma', 'md5_checksum': '0822287ef2af6e97ead980c771bb8f97', 'sha256_checksum': '2e90f5615ad4eaf273c1dfa9c4631e14a39da7f5a9a09edc5467b14f19dacd41'}
{'filename': 'NEWSLINE_802AF71F439D401585C6FCB02F358307.mp3', 'extension': '.mp3', 'size': 961195, 'modify_datetime': '2022-09-05T21:59:19', 'absolute_path': '/Users/jajohnst/Desktop/networked-services-labs-2022/data/webfiles-samples/audio/NEWSLINE_802AF71F439D401585C6FCB02F358307.mp3', 'md5_checksum': 'ad49bb75ecd85c86d8d4a2b418ce83cf', 'sha256_checksum': '5d523832cd29efe1971bf88a1e9698021dda828547c1d774301424060eb10730'}
{'filename': 'mj_telework_exchange_final_100710.mp3', 'extension': '.mp3', 'size': 3471488, 'modify_datetime': '2022-09-05T21:59:13', 'absolute_path': '/Users/jajohnst/Desktop/

### Version 1: using csv.DictWriter()

This is the example from class, completed.

Creating small dictionaries of the file metadata for each file, then writing using the header names as indexes using the `.DictWriter()` method.

In [32]:
# Finally, write this information out to a csv using the csv.DictWriter() function
import csv

file_list = list()

# set up headers (these must match the dictionary keys used below)
headers = ['filename', 'extension', 'size', 'modify_datetime', 'absolute_path', 'md5_checksum', 'sha256_checksum']

# add file metadata to a list of dictionaries
for folderName, subfolders, filenames in os.walk(walk_this_directory):
    for file in filenames:
        # os.walk() yields bare file names; build the full path once and reuse it
        file_path = os.path.join(folderName, file)
        file_info = {
            'filename' : file,
            'extension' : os.path.splitext(file_path)[1],
            'size' : os.path.getsize(file_path),
            'modify_datetime' : datetime.strftime(datetime.fromtimestamp(os.path.getmtime(file_path)), "%Y-%m-%dT%H:%M:%S"),
            'absolute_path' : os.path.abspath(file_path),
            'md5_checksum' : get_checksum(file_path, 'md5'),
            'sha256_checksum' : get_checksum(file_path, 'sha256')
        }
        file_list.append(file_info)
        #print(file_info)

# set up filecount
file_count = len(file_list)
print(file_count)

# write out the info using csv.DictWriter()
# newline='' lets the csv module control line endings (otherwise Windows gets blank
# rows between records); utf-8 keeps non-ASCII file names writable on every platform
with open('file-metadata-manifest-from-dict.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fileManifest = csv.DictWriter(csvfile, fieldnames=headers)
    print('writing file manifest CSV')
    fileManifest.writeheader()
    for file_info in file_list:
        print('adding', file_info['filename'])
        # each dictionary's keys are matched to the columns named in headers
        fileManifest.writerow(file_info)
    print('Wrote manifest')

23
writing file manifest CSV
adding .DS_Store
adding web-files-small-metadata.csv
adding vlwhcssc.asx
adding 04-04-21full.asf
adding glmp_cig.EQ.wm.p20.t12z
adding oct17cc.asx
adding 01-1480.pdf
adding file.pdf
adding Chapter03.pdf
adding PFCHEJ.pdf
adding HR2021 commtext.pdf
adding 13080t.jpg
adding orca.via_.moc_.noaa_.jpg
adding k7989-7x.jpg
adding m237a2f.gif
adding 1005107061.tif
adding 11-3250JohnsonvFolinoEtAl.wma
adding NEWSLINE_802AF71F439D401585C6FCB02F358307.mp3
adding mj_telework_exchange_final_100710.mp3
adding 000727.ram
adding BudgetandGrants012710.ppt
adding ADAEMPLOYMENTTaxIncentives.ppt
adding Non-FTE-Trainee-Activities-060109.ppt
Wrote manifest


### Version 2: using csv.writer()

Creating short lists of the file metadata for each file, then writing to a csv using the `.writer()` method.

In [31]:
# Finally, write this information out to a csv using the csv.writer() function
import csv

file_list = list()

# set up headers (same order as the values in each file_info list below)
headers = ['filename', 'extension', 'size', 'modify_datetime', 'absolute_path', 'md5_checksum', 'sha256_checksum']

# add file metadata to a list of lists
for folderName, subfolders, filenames in os.walk(walk_this_directory):
    for file in filenames:
        # os.walk() yields bare file names; build the full path once and reuse it
        file_path = os.path.join(folderName, file)
        # this part creating the individual values
        filename = file
        extension = os.path.splitext(file_path)[1]
        size = os.path.getsize(file_path)
        modify_datetime = datetime.strftime(datetime.fromtimestamp(os.path.getmtime(file_path)), "%Y-%m-%dT%H:%M:%S")
        absolute_path = os.path.abspath(file_path)
        md5_checksum = get_checksum(file_path, 'md5')
        sha256_checksum = get_checksum(file_path, 'sha256')
        # add those values to the file_info list for each file
        # (csv.writer() writes by position, so the order must match headers)
        file_info = [
            filename,
            extension,
            size,
            modify_datetime,
            absolute_path,
            md5_checksum,
            sha256_checksum
        ]
        # append that list to the file_list
        file_list.append(file_info)
        #print(file_info)

# set up filecount
file_count = len(file_list)
print(file_count)

# write out the info using csv.writer()
# newline='' lets the csv module control line endings (otherwise Windows gets blank
# rows between records); utf-8 keeps non-ASCII file names writable on every platform
with open('file-metadata-manifest-from-list.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fileManifest = csv.writer(csvfile)
    print('writing file manifest CSV')
    fileManifest.writerow(headers)
    for file_info in file_list:
        # position 0 holds the file name
        print('adding', file_info[0])
        fileManifest.writerow(file_info)
    print('Wrote manifest')

23
writing file manifest CSV
adding .DS_Store
adding web-files-small-metadata.csv
adding vlwhcssc.asx
adding 04-04-21full.asf
adding glmp_cig.EQ.wm.p20.t12z
adding oct17cc.asx
adding 01-1480.pdf
adding file.pdf
adding Chapter03.pdf
adding PFCHEJ.pdf
adding HR2021 commtext.pdf
adding 13080t.jpg
adding orca.via_.moc_.noaa_.jpg
adding k7989-7x.jpg
adding m237a2f.gif
adding 1005107061.tif
adding 11-3250JohnsonvFolinoEtAl.wma
adding NEWSLINE_802AF71F439D401585C6FCB02F358307.mp3
adding mj_telework_exchange_final_100710.mp3
adding 000727.ram
adding BudgetandGrants012710.ppt
adding ADAEMPLOYMENTTaxIncentives.ppt
adding Non-FTE-Trainee-Activities-060109.ppt
Wrote manifest
