# Let's get started

First, good habits. Running `%load_ext lab_black` makes all your code auto-format to be uncompromisingly compliant with some such-and-such style. But it's pretty and works, so I use it. Most comments from here forward are Python comments in the code-blocks.

In [None]:
%load_ext lab_black

In [None]:
# We don't need much right away.

from time import time
from pathlib import Path
from pickle import loads, dumps
from sqlitedict import SqliteDict as sqldict

In [None]:
# Watch for the folder being created (if not already there).
data = "cats"
Path(data).mkdir(exist_ok=True)
print("Done")

In [None]:
# Let's count from 0 to 9 in Python
for i in range(10):
    print(i, end=" ")
print("Done")

In [None]:
# Let's count from 1 to 10 in Python
for i in range(1, 11):
    print(i, end=" ")
print("Done")

In [None]:
# Don't make things more difficult
for i in range(10):
    print(i + 1, end=" ")
print("Done")

In [None]:
# Count to 100,000 by 1,000.
for i in range(100000):
    if not i % 1000:
        print(i, end=" ")
print("Done")

In [None]:
# How many zeros in ten-thousand? In a thousand?
for i in range(10**5):
    if not i % 10**3:
        print(i, end=" ")
print("Done")

In [None]:
# Can you format that with commas?
for i in range(10**5):
    if not i % 10**3:
        print(f"{i:,}", end=" ")
print("Done")

In [None]:
# What's it like to write a million lines into a text file?
filename = f"{data}/text.txt"
with open(filename, "wt") as fh:
    for i in range(1000000):
        fh.write(f"{i}\n")
print("Done")  # Fast!!!

In [None]:
# That was fast. How big is that file?
bytesize = Path(filename).stat().st_size
kilo = 1000
print(f"The file {filename} is {bytesize:,} Bytes.")
print(f"Abbreviated to {bytesize / kilo:,.0f} Kilobytes.")  # The :,0f formats
print(f"Or just {bytesize / kilo / kilo:.0f} Megs.")
print("Done")
# A million short lines is still several megs.

In [None]:
# What's it like to write 100,000 keys into a SQlite database?
# Let's count down instead of up. Keys go up but count is down.
filename = f"{data}/database.db"
now = time()
upto = 100000
with sqldict(filename) as db:
    for i in range(upto):
        db[i] = None
        if not i % 10000:
            # Commit in batches of 10,000 and report progress at each one.
            db.commit()
            print(f"{upto - i:,}...", end=" ")
    # The last batch ends at 99,999, which is not a multiple of 10,000.
    # This SqliteDict is opened without autocommit, so writes that are still
    # pending at close are not saved unless we commit them here.
    db.commit()
seconds = int(time() - now)
print(f"\nDone ({seconds} seconds)")

In [None]:
# That was fast. How big is that file?
bytesize = Path(filename).stat().st_size
kilo = 1000
print("While you get the power of SQL ths way, you lose speed.")
print(f"The file {filename} is {bytesize / kilo:,.0f} Kilobytes")
print(f"Or ~{bytesize / kilo / kilo:.0f} Megs, but only has {upto:,} records.")
print("We will not go this route.")
print("Done")

In [None]:
# How many zeros in a billion?
f"{10**9:,} The Goose Drank Done"

In [None]:
# How many zeros in a hundred-million?
f"{10**8:,} Done"

In [None]:
# Let's count down from a billion by hundred-millions
hundredmillion = 10**8
billion = 10**9
now = time()
modulorow = 1
print("Count down with me from a billion in Python:")
for i in range(billion):
    if not i % hundredmillion:
        glimpse = int(time() - now)
        print(f"{modulorow}: {billion - i:,} ({glimpse} sec)...")
        modulorow += 1
seconds = int(time() - now)
print(f"Done ({seconds} seconds)")  # Computers are fast but not that fast

In [None]:
# Make a million keys and save them to disk.
million = 10**6
seen = set(range(million))  # Same keys as adding 0..999,999 one at a time.
print(f"Made {len(seen):,} keys.")

# Dump pickled set to file
filename = f"{data}/dumps.pkl"
kilo = 1000
with open(filename, "wb") as fh:
    fh.write(dumps(seen))
print(f"Saved {filename} to drive.")

# Report size of file
bytesize = Path(filename).stat().st_size
print(f"{filename} is {bytesize / kilo:,.0f} Kilobytes")

# Load pickled set out of file
with open(filename, "rb") as fh:
    seen = loads(fh.read())
print(f"Read native Python {type(seen)} back off of drive.")
print("Go this route because of how fast and awesome this is?")
print("Done")

In [None]:
# So far I've used sequential numbers from 0 to a billion as keys.
# Let's use cats as our keys instead. Load our packages and configure.

from json import loads as js
from PIL import Image
from httpx import get
from io import BytesIO
from time import sleep
from pathlib import Path
from pickle import loads, dumps
from imagehash import phash, whash
from IPython.display import display
from PIL.PngImagePlugin import PngInfo

# Where we save cats and generate thumbs.
data = "cats"
save_to = f"{data}/source"
thumbs = f"{data}/thumbs"

# Make those locations if they don't exist.
Path(data).mkdir(exist_ok=True)
Path(save_to).mkdir(exist_ok=True)
Path(thumbs).mkdir(exist_ok=True)

In [None]:
# Download 30 cats that don't exist.

# If you actually want to fetch 30 cats that don't exist again
# then delete the contents of cats/source folder and re-run.
# You can delete just a single cat from source and watch it re-fill,
# except that doing so removes referenced data. Fetch more. Whatever.

url = "https://thiscatdoesnotexist.com/"
cats = 30
for i in range(cats):
    filename = f"{save_to}/cat-{str(i).zfill(3)}.jpg"
    if Path(filename).exists():
        continue  # Already on disk; only the gaps get re-filled.
    print(f"{cats - i} Downloading: {filename}")
    response = get(url)
    # Stop here with the HTTP status rather than letting PIL fail later on
    # a response body that is not an image.
    response.raise_for_status()
    with Image.open(BytesIO(response.content)) as img:
        img.save(filename)
    sleep(1)  # Pause between requests.
print("Done")

# Noteworthy Stuff

## Background

We are gathering 30 cats that don't exist to help explore an image classification system that leverages the strength of native operating systems that let you drag thumbnail icons around.

## Perceptual Image Hash Filenames

Using perceptual image hashes as filenames naturally dedupes. Thumbnails are optionally sorted into ***minimum hamming distance*** groups to reduce the amount of work the OS has to do when you're surfing and sorting hundreds of thousands of thumbnails. 

## Folders Classify

Within the thumbs folder, it matters not where you put things. It just accumulates classifications on a directory-name basis while you move stuff around or return everything to home-folders.

- 64-pixel thumbnails seem just about right.
- Meta-data is retrievably tucked into the PNG thumbnails.
- You can add your own meta-data, preserving dates and such
- Filenames are made from image size & perceptual hashes
- This file-naming helps remove-duplicates as you organize.

In [None]:
# Generate thumbnails for the source folder of cat images.
# If you wish to see the thumbnails generate again, you have to
# delete seencats.pkl and the contents of thumbs folder.

size = 64

# Load set of seen cats from pickle if exists.
pickled_cats = f"{data}/seencats.pkl"
if Path(pickled_cats).exists():
    with open(pickled_cats, "rb") as fh:
        seen = loads(fh.read())
else:
    seen = set()

# Make thumbnails of cat pics.
for cat in Path(save_to).glob("*.jpg"):
    # Opening inside `with` closes each source file once we are done with it.
    with Image.open(cat) as img:
        # The thumbnail's filename is built from the source image's size and
        # hash, so these must be computed before we can tell if it is new.
        awhash = whash(img, hash_size=8)
        width, height = img.width, img.height
        filename = f"{width}x{height}_{awhash}_.png"
        if filename in seen:
            continue  # Already thumbnailed: skip the copy and metadata work.
        thumb = img.copy()
        thumb.thumbnail((size, size))
        meta_data = {
            "filename": cat.name,
            "width": width,
            "height": height,
            "format": img.format,
            "format_description": img.format_description,
            "bands": img.getbands(),
            "extremes": img.getextrema(),
            "xmp": img.getxmp(),
        }
    # Store every metadata value as text inside the PNG thumbnail.
    pi = PngInfo()
    for key, value in meta_data.items():
        pi.add_text(key, f"{value}")
    print(cat)
    display(thumb)
    seen.add(filename)
    print(filename)
    thumb.save(
        f"{thumbs}/{filename}",
        "PNG",
        pnginfo=pi,
        save_all=True,
    )
    print()
with open(pickled_cats, "wb") as fh:
    fh.write(dumps(seen))

# Report size of file
bytesize = Path(pickled_cats).stat().st_size
print(f"{pickled_cats} is {bytesize:,} Bytes")

print("Done")

In [None]:
def size_name(n):
    """Return an English name for the order of magnitude of the integer ``n``.

    The exact power of ten is tried first, so 10,000 is "Ten Thousand" and
    100,000 is "Hundred Thousand". Otherwise the power is rounded down to a
    multiple of three ("Thousand", "Million", "Billion", ...).

    Values with fewer than four digits return "under a thousand"; values
    beyond the table return "extremely large". The sign of ``n`` is ignored.
    """
    sizes = {
        3: "Thousand",
        4: "Ten Thousand",
        5: "Hundred Thousand",
        6: "Million",
        9: "Billion",
        12: "Trillion",
        15: "Quadrillion",
        18: "Quintillion",
        21: "Sextillion",
        24: "Septillion",
        27: "Octillion",
        30: "Nonillion",
        33: "Decillion",
        36: "Undecillion",
        39: "Duodecillion",
        42: "Tredecillion",
        45: "Quattuordecillion",
        48: "Quindecillion",
        51: "Sexdecillion",
        54: "Septendecillion",
        57: "Octodecillion",
        60: "Novemdecillion",
        63: "Vigintillion",
    }
    # Number of digits minus one is the power of ten; abs() keeps a leading
    # minus sign from being counted as a digit.
    exponent = len(str(abs(n))) - 1
    if exponent < 3:
        return "under a thousand"
    # Look up the exact power before rounding, otherwise the 4 and 5 entries
    # can never be reached.
    if exponent in sizes:
        return sizes[exponent]
    return sizes.get(exponent - exponent % 3, "extremely large")


# Notice how some cats are more hexed than others.
print("How unique can a 16-digit hexidecimal number really be?")
print()
print("Filename_extract converted_2hex decimal big_number_name...")
for cat in Path(thumbs).glob("*.png"):
    # Thumbnail names look like "<width>x<height>_<hash>_.png"; field 1 is
    # the hash. It is deliberately not stored in a variable called `whash`:
    # that would overwrite the imagehash function imported under that name.
    hash_text = cat.name.split("_")[1]
    adec = int(hash_text, 16)
    ahex = hex(adec)
    word = size_name(adec)
    print(hash_text, ahex, f"{adec:,}", word)

In [None]:
# Yes, the meta data is still in the PNG thumbnails.
for i, cat in enumerate(Path(thumbs).glob("*.png")):
    print(cat)
    img = Image.open(cat)
    meta = img.text
    for key in meta:
        print(f"{key}: {meta[key]}")
    print()
    if i >= 2:
        break  # Seen enough proof?
print("Done")

# Organize by Folder to Classify

In [None]:
# It is now possible to drag-copy tag, organize, classify, yadda yadda
# and spin through it to see how cats were classified, per new folders.
from os import scandir

cdict = {}


def find_thumbs(path):
    """Record the containing folder of every file found under ``path``.

    Walks ``path`` recursively and stores ``{filename: "folder/"}`` in the
    module-level ``cdict``, which is also returned. Entries from earlier
    calls are kept, and a filename that occurs in more than one folder keeps
    whichever folder was visited last.

    Sub-folders and files that raise ``OSError`` while being read are
    skipped. Symlinks are not followed.
    """
    with scandir(path) as entries:  # Closes the directory handle when done.
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                try:
                    find_thumbs(entry.path)
                except OSError:
                    pass  # Unreadable folder: skip it and keep walking.
                continue
            try:
                # Only a probe: entries that cannot be stat'ed are skipped.
                entry.stat(follow_symlinks=False)
            except OSError:
                continue
            # Everything before the final "/" is the folder the file sits in.
            folder = entry.path.rpartition("/")[0] + "/"
            cdict[entry.name] = folder
    return cdict


print("Done")

In [None]:
data = "cats"
cdict = find_thumbs(f"{data}/thumbs")
print(len(cdict))
cdict

# File Locations Don't Matter

The first tag we apply to our images is the image resolution as it is easily extracted from filenames — no matter where under the thumbnails hierarchy they reside. This code will walk the thumbs directory very fast because they're only 64x64 thumbnails. There are no hard ties to the original image. Joining data here against the original images is based on the fact that the same process on the original image will always result in the same filename. This frees us to organize using thumbnails and defer where to keep the originals or what their "default" organization path should be until later. We re-acquire actual hardwired file-paths with occasional scans like this.

In [None]:
# Sort into folders by size.


def classify_by_folder(path):
    """Walk ``path`` and turn each file's folder names and size into tags.

    For every file found (recursively) this updates module-level state:

    - ``seen`` gets the filename.
    - ``cdict`` maps the filename to its containing folder ("folder/").
    - ``tags`` gets a ``(filename, tag)`` tuple for each folder name below
      the first two path components, plus the size prefix of the filename
      (the text before the first "_"). Purely numeric names are not tagged.
    - The first time a size is met it is added to ``seensizes``, printed,
      and a ``{data}/thumbs/by_size/<size>`` folder is created.

    NOTE(review): skipping the first two path components assumes ``path``
    is two levels deep, like "cats/thumbs" -- confirm against callers.

    Sub-folders and files that raise ``OSError`` are skipped. Symlinks are
    not followed. Returns ``cdict``.
    """
    with scandir(path) as entries:  # Closes the directory handle when done.
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                try:
                    classify_by_folder(entry.path)
                except OSError:
                    pass  # Unreadable folder: skip it and keep walking.
                continue
            try:
                # Only a probe: entries that cannot be stat'ed are skipped.
                entry.stat(follow_symlinks=False)
            except OSError:
                continue
            name = entry.name
            seen.add(name)
            # Everything before the final "/" is the folder the file sits in.
            folder = entry.path.rpartition("/")[0] + "/"
            size = name.split("_")[0]
            if size not in seensizes:
                by_size = Path(f"{data}/thumbs/by_size/{size}")
                by_size.mkdir(exist_ok=True, parents=True)
                seensizes.add(size)
                print(size)
            cdict[name] = folder
            classifications = folder.split("/")[2:-1]
            classifications.append(size)
            tags.update(
                (name, tag) for tag in classifications if not tag.isnumeric()
            )
    return cdict


print("Done")

In [None]:
seen = set()
seensizes = set()
tags = set()

cdict = {}
data = "cats"
print(seen ^ cdict.keys())
cdict = classify_by_folder(f"{data}/thumbs")
print(seen ^ cdict.keys())

# cdict

In [None]:
cdict

In [None]:
tags

In [None]:
len(tags)

In [None]:
len(seen)

In [None]:
# Load set of classifications if it exists.
tagtable = f"{data}/tagtable.pkl"

if Path(tagtable).exists():
    # Add new tags to existing, then write the combined tags out.
    existing_tags = loads(Path(tagtable).read_bytes())
    existing_tags.update(tags)
    Path(tagtable).write_bytes(dumps(existing_tags))
else:
    # Create new record of tags.
    Path(tagtable).write_bytes(dumps(tags))

In [None]:
with open(tagtable, "rb") as fh:
    existing_tags = loads(fh.read())
existing_tags

In [None]:
# Calculate the hamming distance between adjacent cats.
# `seen` is a set with no defined order, so sort the filenames first;
# otherwise "adjacent" means something different on every run.
prior = 0  # The first cat is compared against an all-zero hash.
for file in sorted(seen):
    awhash = file.split("_")[1]
    current = int(awhash, 16)
    # XOR leaves a 1 bit wherever the two hashes differ; count those bits.
    diff = bin(current ^ prior).count("1")
    print(diff, end=" ")
    prior = current

In [None]:
from collections import Counter

In [None]:
from itertools import combinations

hamdiffs = Counter()
catpairs = set()
# Filenames look like "<width>x<height>_<hash>_.png"; pull out each hash once.
hashes = sorted({name.split("_")[1] for name in seen})
# combinations() yields each unordered pair exactly once, already in sorted
# order, so the histogram and catpairs agree on how many pairs there are.
for washcat1, washcat2 in combinations(hashes, 2):
    int1, int2 = int(washcat1, 16), int(washcat2, 16)
    if int1 != int2:
        # XOR leaves a 1 bit wherever the two hashes differ; count those bits.
        diff = bin(int1 ^ int2).count("1")
        hamdiffs[diff] += 1
        catpairs.add((washcat1, washcat2, diff))

In [None]:
len(catpairs)

In [None]:
catpairs

In [None]:
hamdiffs

In [None]:
sorted_dict = dict(sorted(hamdiffs.items(), key=lambda item: item[0], reverse=False))
sorted_dict

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.bar(hamdiffs.keys(), hamdiffs.values())
plt.xticks(rotation=90)
plt.show()

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(catpairs, columns=["cat1", "cat2", "ham"])
df

In [None]:
# Each pair is stored once with its two hashes sorted, so a cat's closest
# neighbour can be in either column. Stack both columns into one "cat"
# column before taking the minimum; grouping on cat1 alone would ignore
# half of each cat's pairs and drop the largest hash completely.
per_cat = pd.concat(
    [
        df[["cat1", "ham"]].rename(columns={"cat1": "cat"}),
        df[["cat2", "ham"]].rename(columns={"cat2": "cat"}),
    ]
)
nearest = per_cat.groupby("cat")["ham"].min()

# Zero-pad to two characters so the values sort correctly as folder names.
ham_goes = {name: str(min_ham).zfill(2) for name, min_ham in nearest.items()}
min_hams = set(ham_goes.values())
print("minimum hams:", min_hams)
print("total minimums:", len(min_hams))
ham_goes

In [None]:
print(len(ham_goes))
ham_goes

In [None]:
min_sizes = set()
size_goes = {}
# NOTE(review): the DataFrame built in this notebook only has the columns
# cat1, cat2 and ham. Nothing visible here creates "sizediff", so say so
# clearly instead of failing with an AttributeError inside the loop.
if "sizediff" not in df.columns:
    raise KeyError("df has no 'sizediff' column; add it before running this cell")
for name, dfg in df.groupby("cat1"):
    min_size = str(dfg["sizediff"].min()).zfill(2)
    min_sizes.add(min_size)
    size_goes[name] = min_size
print("minimum sizes:", min_sizes)
# Count the distinct minimums, not the characters of the last one.
print("total minimums:", len(min_sizes))

In [None]:
# size_goes holds zero-padded strings such as "00". Any non-empty string is
# truthy, so all() over the raw values would always report True.
# NOTE(review): presumably "same size" means every minimum size difference
# is zero -- confirm that is the intended question.
all_same_size = all(float(min_size) == 0 for min_size in size_goes.values())
print(f"Everything same size? {all_same_size}")

In [None]:
cdict

In [None]:
import shutil

In [None]:
def build_cdict(path):
    """Walk ``path`` and rebuild the filename-to-folder map and the tags.

    For every file found (recursively) this updates module-level state:

    - ``seen`` gets the filename.
    - ``cdict`` maps the filename to its containing folder ("folder/").
    - ``tags`` gets a ``(filename, tag)`` tuple for each folder name below
      the first two path components, plus the size prefix of the filename
      (the text before the first "_"). Purely numeric names are not tagged.

    NOTE(review): skipping the first two path components assumes ``path``
    is two levels deep, like "cats/thumbs" -- confirm against callers.

    Sub-folders and files that raise ``OSError`` are skipped. Symlinks are
    not followed. Returns ``cdict``.
    """
    with scandir(path) as entries:  # Closes the directory handle when done.
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                try:
                    build_cdict(entry.path)
                except OSError:
                    pass  # Unreadable folder: skip it and keep walking.
                continue
            try:
                # Only a probe: entries that cannot be stat'ed are skipped.
                entry.stat(follow_symlinks=False)
            except OSError:
                continue
            name = entry.name
            seen.add(name)
            # Everything before the final "/" is the folder the file sits in.
            folder = entry.path.rpartition("/")[0] + "/"
            size = name.split("_")[0]
            cdict[name] = folder
            classifications = folder.split("/")[2:-1]
            classifications.append(size)
            tags.update(
                (name, tag) for tag in classifications if not tag.isnumeric()
            )
    return cdict

In [None]:
# Move everything into sorted folders. `sort_choice` picks the layout:
# by size, by minimum hamming distance, or a single by_hist folder.

# Update cdict with latest file locations
seen = set()
cdict = {}
cdict = build_cdict(f"{data}/thumbs")
sort_choice = "by_size"

# Sort cats into the folders of the chosen layout
for file, from_folder in cdict.items():
    parts = file.split("_")
    size = parts[0]
    # Not named `whash`: that would overwrite the imagehash function
    # imported under that name.
    hash_text = parts[1]
    if hash_text not in ham_goes:
        continue  # Only files whose hash has a hamming entry get moved.
    sort_dict = {
        "by_ham": f"{data}/thumbs/by_ham/{ham_goes[hash_text]}",
        "by_size": f"{data}/thumbs/by_size/{size}",
        "by_hist": f"{data}/thumbs/by_hist/",
    }
    to_path = Path(sort_dict[sort_choice])
    to_path.mkdir(parents=True, exist_ok=True)
    # Leave the file alone if one of that name is already at the destination.
    if not Path(f"{to_path}/{file}").exists():
        shutil.move(f"{from_folder}{file}", to_path)
print("Done")

In [None]:
seen