# install pyyaml and sqlalchemy

In [None]:
%pip install pyyaml

# common imports

In [2]:
import os
import re
import glob
from datetime import datetime, timezone

import yaml
import uuid as uuid_lib

# 
# Domain Models
# 
from scanner import ScanningProcess, FileInfo, Storage, ScanningRecord, StorageStatus, StorageType

# 
# Common Helper functions
# 
from scanner import now_time_str, now_time_str_ymd_hms, random_uuid_str

from scanner import FakeDatabase

# Read global configuration and create global DB

In [None]:

# Load the global scanner settings from the YAML file in the user's home directory.
config_path = os.path.expanduser('~/scanner-config.yaml')
with open(config_path, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
print("config:", config)

# One shared database handle, used by the cells below.
DB = FakeDatabase()

# initialization of storage information

In [None]:
# Initialization of storage information.
#
# Every entry of a configured root directory whose name ends with
# '.storage-location-id' is treated as a marker file; the part of the name
# before that suffix becomes the storage uuid used as the dictionary key.
uuid_to_abs_path = {}
for rootdir, dirtype in config['storage-locations'].items():
    try:
        marker_names = [entry for entry in os.listdir(rootdir)
                        if entry.endswith('.storage-location-id')]
        for marker_name in marker_names:
            uuid = re.sub(r'(.+)\.storage-location-id', r'\1', marker_name)
            uuid_to_abs_path[uuid] = dict(uuid=uuid, path=rootdir, type=dirtype)
    except Exception as e:
        # Best effort: an unreadable root is reported and skipped.
        # str(e) gives the short OS message, e.g.
        # "[Errno 2] No such file or directory: '/Volumes/VOLUME1'".
        print('for scan_proc: ctx: path={path}, error:{errMessage}'.format(path=rootdir, errMessage=str(e)))
print('uuid_to_abs_path: ', uuid_to_abs_path)

print('uuid_to_abs_path.values(): ', uuid_to_abs_path.values())

## function for scanning folder

## extension function for accessing files on iCloud

In [5]:
# Folders for which a download has already been requested (a set, despite the name).
downloaded_folder_list = set()


def trigger_downloading_from_icloud(rootdir_info:Storage, abs_file_path:str):
    """Request an iCloud download for the folder that contains ``abs_file_path``.

    Does nothing unless ``rootdir_info.get_type()`` is ``StorageType.OSX``.
    Each folder is handled at most once per session: its path is remembered in
    the module-level ``downloaded_folder_list`` set, so later calls for other
    files in the same folder return immediately.

    The work is done by a shell pipeline (``find ... | xargs -0 brctl download``)
    run through ``os.system``; progress and failures are printed, never raised.

    :param rootdir_info: storage the file belongs to; only ``get_type()`` is used.
    :param abs_file_path: path of a file inside the folder to download.
    :return: None
    """
    # Local import keeps this cell self-contained; shlex is standard library.
    import shlex

    if rootdir_info.get_type() != StorageType.OSX:
        return

    dir_path = os.path.dirname(abs_file_path)
    if dir_path in downloaded_folder_list:
        return

    print('{time}: begin to download {path} from icloud'.format(time=now_time_str(),path=dir_path))
    # Remember the folder before running the command, so a folder whose
    # download fails is not retried for every single file it contains.
    downloaded_folder_list.add(dir_path)
    try:
        # shlex.quote() makes folder names containing quotes, '$', backticks or
        # spaces safe to embed in the shell command line.
        # The raw string keeps the '\;' terminator of -exec without relying on
        # an invalid Python escape sequence.
        # NOTE(review): "$1" is expanded by the shell (to an empty string under
        # `sh -c`), so the mdfind query never names the found file - confirm
        # whether the -exec step is needed at all.
        command = r'find {dir_path} -type f -exec mdfind "kMDItemFSLocalizedName=($1)" \; -print0 -maxdepth 1| xargs -0 brctl download'.format(dir_path=shlex.quote(dir_path))
        exit_status = os.system(command)
        if exit_status != 0:
            # os.system() reports failure through its return value, not an exception.
            print("download command for dir_path:{path} exited with status {status}".format(path=dir_path, status=exit_status))
    except Exception as e:
        print("exception for dir_path:{path} : ".format(path=dir_path), e)
    print('{time}: complete the operation of downloading {path} from icloud'.format(time=now_time_str(),path=dir_path))

# CRUCIAL STEP: initialize a storage and persist its info in database

In [None]:
# Disabled alternative: scan every storage location discovered from the configuration.
# for rootdir in uuid_to_abs_path.values():
#     scan_folder(rootdir, 20, trigger_downloading_from_icloud)

# Describe the storage to be scanned and persist it through the shared DB handle.
cur_storage = Storage()
# NOTE(review): '~' is passed unexpanded - confirm that Storage / the scanner expands it.
cur_storage.set_dir_path('~/Documents')
cur_storage.set_name('mac_documents')
# NOTE(review): datetime.now() is naive local time, while the FileInfo OS timestamps
# built later in this notebook are timezone-aware UTC - confirm which is intended.
cur_storage.set_regTime(datetime.now())
cur_storage.set_status(StorageStatus.VALID)
# StorageType.OSX is the type trigger_downloading_from_icloud checks for before downloading.
cur_storage.set_type(StorageType.OSX)
cur_storage.set_uuid(random_uuid_str())
# The DB handle is attached before save_self() is called.
cur_storage.set_db_instance(DB)
cur_storage.save_self()

# CRUCIAL STEP: create a ScanningProcess for scanning the storage and continually create a lot of ScanningRecord

In [None]:
# Create a scanning process from the current storage and the shared DB, then persist it.
one_scan_proc = ScanningProcess.create_from(cur_storage, DB)
one_scan_proc.save_self()

# Run the scan, passing trigger_downloading_from_icloud as the callback argument.
# NOTE(review): the meaning of 20 is defined by ScanningProcess.scan_folder (not visible here) - confirm.
# Presumably this is what produces the ScanningRecord entries loaded later via DB.load_all('ScanningRecord').
one_scan_proc.scan_folder(20, trigger_downloading_from_icloud)

In [8]:
# print("scan_rec_list: ",scan_rec_list, " len(scan_rec_list): ",len(scan_rec_list))

# hash function with blake2b or md5

In [9]:
import magic
import hashlib
import timeit

# Number of bytes read from the file per hashing step (1 MiB).
BLOCKSIZE_1MB=1*1024*1024

def blake2b_file(file_path:str):
    '''
    Return the hex-encoded BLAKE2b digest of the file at ``file_path``.

    The file is opened in binary mode and read in chunks of ``BLOCKSIZE_1MB``
    bytes, so it is never loaded into memory as a whole. As a side effect one
    timing line is printed, with the elapsed time in nanoseconds.

    Adapted from https://gist.github.com/aunyks/042c2798383f016939c40aa1be4f4aaf

    :param file_path: path of the file to hash; passed to ``open()`` unchanged.
    :return: the digest as a hexadecimal string.
    :raises OSError: if the file cannot be opened or read.
    '''
    start = timeit.default_timer()
    hasher = hashlib.blake2b()
    with open(file_path, 'rb') as file_object:
        # read() returns b'' at end of file, which ends the loop.
        while chunk := file_object.read(BLOCKSIZE_1MB):
            hasher.update(chunk)
    end = timeit.default_timer()
    # timeit.default_timer() measures fractional seconds; convert so the value
    # really is in the nanoseconds the log label announces.
    elapsed_ns = round((end - start) * 1_000_000_000)
    print('time_perf(nano_sec):blake2b_file: time_elapse={time_elapse},file_path={file_path}'\
          .format(
            time_elapse=elapsed_ns,file_path=file_path)
        )
    return hasher.hexdigest()

## some testing code for hash function

In [None]:
# open() does not expand '~', so resolve the home directory before hashing.
blake2b_file(os.path.expanduser('~/Documents/TalkPython_2021_11_17_345__10 Tips and Tools for Developer Productivity Transcript.docx'))
# blake2b_file(os.path.expanduser('~/Documents/talkpython_345-10-tips-and-tools.aup3'))

In [None]:
# Digest of empty input: passing b'' to the constructor is equivalent to a
# single update(b'') call on a fresh hash object.
tempHash = hashlib.blake2b(b'')
tempHash.hexdigest()

# CRUCIAL STEP: create the FileInfo list from the files found while scanning the specified directory

In [None]:
# Build one FileInfo per ScanningRecord stored in the DB.
#
# The creation time needs care because os.stat() means different things on
# different operating systems: st_birthtime only exists on some platforms, and
# st_ctime is not a creation time everywhere. See
# https://stackoverflow.com/questions/36984697/file-creation-time-do-not-match-when-using-os-stat
# and
# https://stackoverflow.com/questions/73534051/get-files-epoch-timestamp-convert-to-datetime-to-string-and-then-to-the-iden


def _creation_timestamp(file_stat):
    """Return st_birthtime when it exists, is non-zero and is earlier than st_ctime; otherwise st_ctime."""
    # getattr() instead of attribute access: st_birthtime is missing on some
    # platforms, and an AttributeError would mark every record as failed.
    birth_time = getattr(file_stat, 'st_birthtime', None)
    if birth_time and birth_time < file_stat.st_ctime:
        return birth_time
    return file_stat.st_ctime


def _storage_relative_path(abs_path, storage_dir, storage_uuid):
    """Replace the storage directory inside ``abs_path`` with ``<storage_uuid><os.path.sep>``."""
    # re.escape() keeps regex metacharacters in the directory name ('.', '+',
    # '(' ...) from being interpreted as a pattern.
    pattern = re.escape(str(storage_dir)) + r'(.*)'
    # A function replacement avoids template escapes: with a backslash path
    # separator, a '<sep>\1' template would no longer refer to group 1.
    return re.sub(pattern,
                  lambda match: str(storage_uuid) + os.path.sep + match.group(1),
                  abs_path)


file_info_list = []
scan_rec_list:list[ScanningRecord] = DB.load_all('ScanningRecord')
for scan_rec in scan_rec_list:
    try:
        file = scan_rec.get_scan_abs_path()
        mime_res_brief = magic.from_file(file, mime=True)
        file_hash = blake2b_file(file)

        file_stat = os.stat(file)

        fileInfo = FileInfo()
        fileInfo.set_file_size(file_stat.st_size)
        fileInfo.set_hash(file_hash)
        fileInfo.set_hash_algo('blake2b')
        fileInfo.set_mime_type(mime_res_brief)

        # OS timestamps are stored as timezone-aware UTC datetimes.
        fileInfo.set_os_access_time(datetime.fromtimestamp(file_stat.st_atime, tz=timezone.utc))
        fileInfo.set_os_create_time(datetime.fromtimestamp(_creation_timestamp(file_stat), tz=timezone.utc))
        fileInfo.set_os_modify_time(datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc))
        # The stored path is storage-relative: the storage directory prefix is
        # swapped for the storage uuid.
        fileInfo.set_path(_storage_relative_path(file, cur_storage.get_dir_path(), cur_storage.get_uuid()))
        fileInfo.set_scan_proc_uuid(one_scan_proc.get_uuid())
        # NOTE(review): datetime.now() is naive local time, unlike the aware UTC
        # OS timestamps above - confirm which convention FileInfo expects.
        fileInfo.set_scanned_time(datetime.now())
        print('fileInfo: ',fileInfo)

        file_info_list.append(fileInfo)
        scan_rec.set_exec_status('ok')
    except Exception as e:
        # Best effort: a file that cannot be processed is recorded as failed
        # on its scanning record and the loop moves on to the next one.
        scan_rec.set_exec_status('error')
        scan_rec.set_exec_msg(e)

In [None]:
# Show the outcome of every scanning record: uuid, execution status and message.
for record in scan_rec_list:
    record_summary = (record.get_uuid(), record.get_exec_status(), record.get_exec_msg())
    print("scan_record: ", *record_summary, "\n")

# Show every FileInfo that was built successfully.
for info in file_info_list:
    print("file: ", info, "\n")