# Python Fingerprint Example

Python is a flexible and popular language for running data analysis.

In [1]:
from __future__ import print_function

In [2]:
import urllib
import zipfile
import hashlib

We'll be interacting with the operating system and manipulating files and their pathnames.

In [3]:
import os.path
import os
import sys
import shutil
import tempfile

Some generally useful utilities

In [4]:
import itertools
import functools
import types

Using the `attrs` library provides some nice shortcuts to defining objects

In [5]:
import attr

In [6]:
import sys


We'll be randomly dividing the entire dataset, based on user input, into the probe and gallery sets

In [7]:
import random

We'll need to call out to the NBIS software. We'll also be using multiple processes to take advantage of all the cores on our machine

In [8]:
import subprocess
import multiprocessing

As for plotting, we'll use `matplotlib`, though there are many alternatives.

In [9]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Finally, we'll write the results to a database.

In [10]:
import sqlite3

## Utility functions

Next, we'll define some utility functions:

In [11]:
def take(n, iterable):
    """Return a lazy iterator over at most the first **n** elements of **iterable**.

    Nothing is consumed from **iterable** until the result is iterated.
    """
    head = itertools.islice(iterable, n)
    return head


def zipWith(function, *iterables):
    """Zip a set of **iterables** together and apply **function** to each tuple.

    This is a generator: results are produced lazily and iteration stops as
    soon as the shortest of **iterables** is exhausted.
    """
    # `itertools.izip` only exists on Python 2; on Python 3 the builtin `zip`
    # is already lazy. Picking whichever is available keeps this lazy on both.
    lazy_zip = getattr(itertools, 'izip', zip)
    for group in lazy_zip(*iterables):
        yield function(*group)


def uncurry(function):
    """Adapt an N-ary **function** so it takes one argument: an N-tuple of its parameters.

    The returned callable unpacks that single tuple into positional arguments
    and carries over the metadata of **function** via `functools.wraps`.
    """
    def apply_packed(packed):
        return function(*packed)

    return functools.wraps(function)(apply_packed)


def fetch_url(url, sha256, prefix='.', checksum_blocksize=2**20, dryRun=False):
    """Download a url, reusing an earlier download when its checksum matches.

    :param url: the url to the file on the web
    :param sha256: the expected SHA-256 checksum as a hex digest. Used to
        determine if the file was previously downloaded.
    :param prefix: directory to save the file; created if it does not exist
    :param checksum_blocksize: blocksize to use when computing the checksum
    :param dryRun: boolean indicating that nothing should be verified or
        downloaded; only the local path is computed and returned
        (``prefix`` is still created)
    :returns: the local path to the downloaded file
    :rtype: str

    """

    if not os.path.exists(prefix):
        os.makedirs(prefix)

    local = os.path.join(prefix, os.path.basename(url))

    if dryRun:
        return local

    if os.path.exists(local):
        print ('Verifying checksum')
        chk = hashlib.sha256()
        with open(local, 'rb') as fd:
            # Hash in fixed-size chunks so the whole file is never held in
            # memory at once; read() returns b'' at end of file.
            for bits in iter(lambda: fd.read(checksum_blocksize), b''):
                chk.update(bits)
        # A matching digest means the existing file is intact: skip the
        # download. Otherwise fall through and fetch it again.
        if sha256 == chk.hexdigest():
            return local

    print ('Downloading', url)

    def report(sofar, blocksize, totalsize):
        # urlretrieve calls this hook with (blocks so far, block size, total
        # size); the total is not positive when the server does not report it,
        # in which case no percentage can be shown.
        if totalsize <= 0:
            return
        # The final block is usually partial, so cap the figure at 100.
        percent = min(100, 100 * sofar * blocksize // totalsize)
        sys.stderr.write('{}%\r'.format(percent))

    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    urlretrieve(url, local, report)

    return local


## Dataset

We'll now define some global parameters

First, the fingerprint dataset

In [12]:
# Default location of the zipped fingerprint dataset to download.
DATASET_URL = 'https://s3.amazonaws.com/nist-srd/SD4/NISTSpecialDatabase4GrayScaleImagesofFIGS.zip'
# Expected SHA-256 hex digest of that archive.
DATASET_SHA256 = '4db6a8f3f9dc14c504180cbf67cdf35167a109280f121c901be37a80ac13c449'

We’ll define how to download the dataset. This function is general enough that it could be used to retrieve most files, but we’ll default it to use the values from above.

In [13]:
def prepare_dataset(url=None, sha256=None, prefix='.', skip=False):
    """Fetch the dataset archive and unpack it under **prefix**.

    :param url: archive location; falls back to ``DATASET_URL`` when not given
    :param sha256: expected SHA-256 of the archive; falls back to
        ``DATASET_SHA256`` when not given
    :param prefix: directory that receives both the archive and its contents
    :param skip: when true, neither download nor extract; only compute the path
    :returns: the local archive path with its file extension removed
    """
    if not url:
        url = DATASET_URL
    if not sha256:
        sha256 = DATASET_SHA256

    archive_path = fetch_url(url, sha256=sha256, prefix=prefix, dryRun=skip)

    if skip:
        return os.path.splitext(archive_path)[0]

    print ('Extracting', archive_path, 'to', prefix)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(prefix)

    return os.path.splitext(archive_path)[0]


def locate_paths(path_md5list, prefix):
    """Yield a `Path` for every ``<md5sum> <path>`` line of a checksum listing.

    :param path_md5list: text file with one whitespace-separated
        ``<md5sum> <path>`` pair per line
    :param prefix: directory that each listed path is joined onto
    :returns: generator of `Path` objects, each carrying an md5 `Checksum`

    Lines that do not consist of exactly two fields (blank lines, for
    example) are skipped.
    """
    with open(path_md5list) as fd:
        for line in fd:
            # split() with no argument already discards leading/trailing
            # whitespace, so no separate strip() is needed.
            parts = line.split()
            if len(parts) != 2:
                continue
            md5sum, path = parts
            chksum = Checksum(value=md5sum, kind='md5')
            filepath = os.path.join(prefix, path)
            yield Path(checksum=chksum, filepath=filepath)


def locate_images(paths):
    """Yield an `image` for each entry of **paths** whose file extension is ``.png``.

    :param paths: iterable of objects with ``filepath`` and ``checksum`` attributes
    :returns: generator of `image` objects whose ``id`` is the checksum value

    The extension comparison is case-sensitive.
    """
    # A plain filtered loop replaces itertools.ifilter, which only exists on
    # Python 2; the result is still produced lazily.
    for path in paths:
        _, ext = os.path.splitext(path.filepath)
        if ext == '.png':
            yield image(id=path.checksum.value, path=path)

## Data Model

We'll define some classes so we have a nice API for working with the dataflow. We set `slots=True` so that the resulting objects will be more space-efficient.

### Utilities

#### Checksum

The checksum consists of the actual hash value (`value`) as well as a string representing the hashing algorithm.
The validator enforces that the algorithm can only be one of the listed acceptable methods

In [14]:
@attr.s(slots=True)
class Checksum(object):
    """A hash value paired with the name of the algorithm that produced it.

    Constructing a `Checksum` with a ``kind`` outside the accepted list
    raises ``ValueError``.
    """
    # the hash value itself; stored as given
    value = attr.ib()
    # name of the hashing algorithm. attrs validators reject a value by
    # raising and their return value is ignored, so a lambda returning a
    # bool would never reject anything; `in_` raises for unknown names.
    kind = attr.ib(
        validator=attr.validators.in_(
            'md5 sha1 sha224 sha256 sha384 sha512'.split()))

#### Path

`Path`s refer to an image's filepath and associated `Checksum`. We get the checksum "for free" since the MD5 hash is provided for each image in the dataset.

In [16]:
@attr.s(slots=True)
class Path(object):
    """A file location together with the checksum recorded for that file."""
    # `Checksum` associated with the file
    checksum = attr.ib()
    # location of the file on disk
    filepath = attr.ib()


#### Image

The start of the data pipeline is the image. An `image` has an `id` (the md5 hash) and the path to the image.

In [17]:
@attr.s(slots=True)
class image(object):
    id = attr.ib()
    path = attr.ib()

### Mindtct

The next step in the pipeline is to apply the `mindtct` program from NBIS. A `mindtct` object therefore represents the results of applying `mindtct` on an `image`. The `xyt` output is needed for the next step, and the `image` attribute represents the image id.

In [18]:
@attr.s(slots=True)
class mindtct(object):
    image = attr.ib()
    xyt = attr.ib()

We need a way to construct a `mindtct` object from an `image` object. A straightforward way of doing this would be to have a `from_image` `@staticmethod` or `@classmethod`, but that doesn't work well with `multiprocessing`: functions handed to worker processes need to be serialized, and top-level functions work best for that.

In [19]:
def mindtct_from_image(image):
    """Run the external `mindtct` program on **image** and wrap its `.xyt` output.

    :param image: object exposing ``id`` and ``path.filepath``
    :returns: a `mindtct` holding the image id and the text of the `.xyt` file
    :raises subprocess.CalledProcessError: if `mindtct` exits with a non-zero status

    The program writes into a temporary directory, which is removed before
    returning whether or not the run succeeded.
    """
    source = os.path.abspath(image.path.filepath)
    workdir = tempfile.mkdtemp()

    try:
        # `mindtct` receives an output root; the file read back is <root>.xyt.
        output_root = os.path.join(workdir, 'result')
        subprocess.check_call(['mindtct', source, output_root])

        with open(output_root + '.xyt') as handle:
            minutiae = handle.read()

        return mindtct(image=image.id, xyt=minutiae)
    finally:
        shutil.rmtree(workdir)


### Bozorth3

The final step in the pipeline is running the `bozorth3`