# Filter
I need to create a whitelist of config files
and a blacklist.

In [148]:
from datetime import datetime, timezone

def sizeof_fmt(num, suffix="B"):
    """Render a byte count as a human-readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then printed with one decimal and the matching prefix (``Ki``, ``Mi``,
    ...). Anything still too large after ``Zi`` is reported in ``Yi``.

    Args:
        num: The quantity to format (negative values are supported).
        suffix: Unit appended after the prefix.

    Returns:
        The formatted string, e.g. ``"1.5KiB"``.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    index = 0
    while index < len(prefixes):
        if abs(value) < 1024.0:
            return f"{value:3.1f}{prefixes[index]}{suffix}"
        value /= 1024.0
        index += 1
    # Ran out of prefixes: everything larger is expressed in yobi-units.
    return f"{value:.1f}Yi{suffix}"

SyntaxError: invalid syntax (<ipython-input-148-249f90e7680d>, line 1)

In [126]:
import pathlib
# Byte values treated as "text": the control characters BEL, BS, TAB, LF, FF,
# CR and ESC, plus every value from 0x20 to 0xFF except DEL (0x7F).
textchars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})


def is_binary_string(bytes):
    """Return True when *bytes* holds at least one byte outside ``textchars``.

    Every byte listed in ``textchars`` is deleted from the input; whatever is
    left over is non-text, so a non-empty remainder means "binary".

    Args:
        bytes: A ``bytes``/``bytearray`` sample. The parameter name shadows
            the builtin and is kept only so existing callers keep working.

    Returns:
        ``True`` if non-text bytes are present, ``False`` otherwise (including
        for empty input).
    """
    return bool(bytes.translate(None, textchars))


def forbid(filename, *words):
    """Return True when any of *words* occurs as a substring of *filename*.

    Args:
        filename: Anything convertible with ``str()`` (typically a path).
        *words: Substrings to look for.

    Returns:
        ``True`` on the first match, ``False`` when nothing matches or no
        words were given.
    """
    name = str(filename)  # convert once instead of once per word
    return any(word in name for word in words)


In [127]:
# Root directory handed to the Filter instance in the cells below.
path = pathlib.Path("/home/mavignau")
# Filter.progress prints one "." every PROGRESS processed files ...
PROGRESS = 100
# ... and the running "processed accepted" totals every LINE_LENGTH files.
LINE_LENGTH = PROGRESS * 90
# Substrings that make Filter.counter log a path as accepted whatever its flag.
WANTED = ["shp.yaml"]

In [140]:
class Filter:
    """Scan a directory and classify each file as a config-file candidate.

    Calling the instance with a directory walks it, tags every visited path
    with a flag, accumulates per-flag counts and sizes, and writes accepted
    entries to ``control.csv`` in the current working directory.

    Flags produced by ``_eval_file`` (checked in this order):

    ====  =====================================================
    #1    path is a directory
    2     path does not exist (e.g. a dangling link)
    3     path has more than 6 components
    4     some path component is longer than 26 characters
    #5    name ends in ``log``, ``bak``, ``old`` or ``.md``
    #6    path contains one of the forbidden words
    #7    file is empty
    #8    file is larger than 8192 bytes
    #0    the first 1024 bytes look binary
    #+    none of the above: the file is accepted
    ====  =====================================================

    Iterating over the instance yields ``(flag, "count, size")`` pairs, so
    ``dict(instance)`` gives a summary of the last scan(s).
    """

    # Directory names whose whole subtree is scanned by ``__call__``.
    _SCANNED_DIRS = (".config", ".tmux", ".tmuxp")

    def __init__(self):
        self.stats = {}  # flag -> number of paths tagged with it
        self.lines = 0  # paths whose flag does not start with "#"
        self.sizes = {}  # flag -> accumulated size in bytes
        # Created here so that an instance that never ran a scan is still in
        # a consistent state (``__del__`` and ``progress(True)`` rely on it).
        self.control = None
        self.processed = 0
        self.accepted = 0

    def __del__(self):
        """Close the control file if a scan ever opened one."""
        control = getattr(self, "control", None)
        if control is not None:
            control.close()

    def counter(self, flag, filename, check_size=True):
        """Record one classified path.

        Args:
            flag: Classification flag returned by ``_eval_file``.
            filename: The path that was classified.
            check_size: When true the file size is read with ``stat()``;
                otherwise the size counts as 0.
        """
        size = filename.stat().st_size if check_size else 0
        self.sizes[flag] = self.sizes.get(flag, 0) + size
        self.stats[flag] = self.stats.get(flag, 0) + 1
        # Accepted: flagged with "+" or explicitly wanted by name.
        if "+" in flag or any(w in str(filename) for w in WANTED):
            self.accepted += 1
            self.control.write(f"{flag},{filename},{sizeof_fmt(size)}\n")
        if not flag.startswith("#"):
            self.lines += 1

    def progress(self, end=False):
        """Advance the progress display.

        Args:
            end: When true, print the final totals, close the control file
                and return ``False``.

        Returns:
            ``True`` after counting one more processed path, ``False`` when
            called with ``end=True``.
        """
        if end:
            print(">", self.processed, self.accepted)
            if self.control is not None:
                self.control.close()
            return False
        self.processed += 1

        if self.processed % LINE_LENGTH == 0:
            print(self.processed, self.accepted)
        elif self.processed % PROGRESS == 0:
            print(end=".")
        return True

    def __call__(self, path):
        """Scan *path*: its direct files plus the selected dot-directories.

        Args:
            path: ``pathlib.Path`` of the directory to scan.
        """
        self.control = open("control.csv", "w")
        self.processed = self.accepted = 0
        try:
            for subpath in path.iterdir():
                if subpath.is_file():
                    if not self._process(subpath):
                        break
                # NOTE(review): ``parts[3]`` hard-codes the depth of a
                # ``/home/<user>`` root; other roots will not match as
                # intended (or raise IndexError) -- confirm before reuse.
                elif subpath.is_dir() and subpath.parts[3] in self._SCANNED_DIRS:
                    for filename in subpath.rglob("*"):
                        if not self._process(filename):
                            break
            self.progress(True)
        finally:
            # Make sure the log is released even if the scan raised;
            # closing an already closed file is a no-op.
            self.control.close()

    def _process(self, filename):
        """Count, classify and record one path; return False to stop."""
        if not self.progress():
            return False
        self.counter(*self._eval_file(filename))
        return True

    def _eval_file(self, filename):
        """Classify *filename*.

        Returns:
            ``(flag, filename)`` or, for directories and missing paths,
            ``(flag, filename, False)`` so that ``counter`` skips the
            ``stat()`` call.
        """
        if filename.is_dir():
            return "#1", filename, False
        if not filename.exists():
            return "2", filename, False
        if len(filename.parts) > 6:
            return "3", filename
        if max(len(part) for part in filename.parts) > 26:
            return "4", filename
        if str(filename)[-3:] in ("log", "bak", "old", ".md"):
            return "#5", filename
        if forbid(filename, "history", "error", ".gitrc", ".ipynb", "patch", "dump", ".log"):
            return "#6", filename

        size = filename.stat().st_size  # one stat() for both size checks
        if size < 1:
            return "#7", filename
        if size > 8192:
            return "#8", filename

        # Only sniff the head of the file, and close the handle right away.
        with filename.open("rb") as handle:
            head = handle.read(1024)
        if is_binary_string(head):
            return "#0", filename
        return "#+", filename

    def __iter__(self):
        """Yield ``(flag, "count, human-readable size")`` for every flag seen."""
        for tag, value in self.sizes.items():
            yield tag, f"{self.stats[tag]}, {sizeof_fmt(value)}"

    def subdirs(self, path):
        """Return the sorted names of the sub-directories directly under *path*.

        NOTE(review): the filter tests ``p.parts[3]`` (a fixed depth), not the
        directory's own name, so which entries are kept depends on how deep
        *path* is -- confirm this is intended.
        """
        pn = len(path.parts)
        sub_d = [p.parts[pn] for p in path.iterdir() if p.is_dir() and p.parts[3].startswith(".")]
        return sorted(sub_d)
    
# Single Filter instance reused by the cells below.
myfilter=Filter()
#myfilter.subdirs(path)

In [142]:
# Run the scan on `path`; progress dots and running totals are printed below.
myfilter(path)

.........................................................................................9000 109
.........................................................................................18000 151
.........................................................................................27000 157
..................> 28879 165


In [143]:
# Filter.__iter__ yields (flag, "count, size") pairs, so dict() builds a per-flag summary.
dict(myfilter)

{'#+': '330, 351.1KiB',
 '#8': '92, 9.3MiB',
 '#6': '34, 791.7KiB',
 '#0': '4, 1.1KiB',
 '4': '360, 281.6MiB',
 '#5': '10, 1.5MiB',
 '#7': '54, 0.0B',
 '#1': '8508, 0.0B',
 '3': '48321, 3.4GiB',
 '2': '46, 0.0B'}

In [117]:
# Sorted names of the directories directly under ~/.config (a throwaway Filter is enough here).
Filter().subdirs(pathlib.Path("/home/mavignau/.config/"))

['AWSVPNClient',
 'GIMP',
 'Insomnia',
 'JetBrains',
 'KDE',
 'Mousepad',
 'Postman',
 'Slack',
 'Thunar',
 'VirtualBox',
 'ardour6',
 'autostart',
 'autostart-scripts',
 'catfish',
 'cef_user_data',
 'dconf',
 'drata-agent',
 'enchant',
 'evolution',
 'fades',
 'freeplane',
 'goa-1.0',
 'google-chrome',
 'gtk-2.0',
 'gtk-3.0',
 'gtk-4.0',
 'guvcview2',
 'htop',
 'ibus',
 'inkscape',
 'kazam',
 'kde.org',
 'kdeconnect',
 'keepassxc',
 'lazydocker',
 'lazygit',
 'libaccounts-glib',
 'libreoffice',
 'menus',
 'nvim',
 'obs-studio',
 'pudb',
 'pulse',
 'qastools',
 'ristretto',
 'session',
 'sourcetrail',
 'sway',
 'update-notifier',
 'vifm',
 'vlc',
 'xfce4',
 'xsettingsd',
 'xubuntu',
 'zim']

In [144]:
# Sample file whose stat() metadata is inspected in the next cells.
pf = pathlib.Path("/home/mavignau/.tmuxp/shp2.yaml")

In [145]:
# Show the raw os.stat_result for the sample file.
pf.stat()

os.stat_result(st_mode=33204, st_ino=27791308, st_dev=64771, st_nlink=1, st_uid=1000, st_gid=1000, st_size=842, st_atime=1641720288, st_mtime=1632949709, st_ctime=1641306351)

In [146]:
from datetime import datetime, timezone

In [147]:
# Convert the file's modification time (st_mtime, a POSIX timestamp) to a
# timezone-aware UTC datetime and print it.
stat_result = pf.stat()
modified = datetime.fromtimestamp(stat_result.st_mtime, tz=timezone.utc)
print('modified', modified)

modified 2021-09-29 21:08:29.604294+00:00
