In [1]:
from __future__ import print_function

import os
import sys
import tarfile

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, Image
from scipy import ndimage
from six.moves import cPickle as pickle
from six.moves.urllib.request import urlretrieve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# NOTE: this rebinds `Image` (imported from IPython.display above) to the
# PIL.Image module. The image-loading code calls Image.open(), so this import
# must stay after the IPython one.
import PIL.Image as Image

# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline

In [2]:
# downloading the data

url = 'http://ufldl.stanford.edu/housenumbers/'
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download. This is mostly intended for users with
  slow internet connections. Reports every 1% change in download progress.
  """
  global last_percent_reported
  percent = int(count * blockSize * 100 / totalSize)

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
      sys.stdout.flush()
    else:
      sys.stdout.write(".")
      sys.stdout.flush()
      
    last_percent_reported = percent
        
def maybe_download(filename, force=False):
  """Download a file if not present, and make sure it's the right size."""
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename) 
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  statinfo = os.stat(filename)
  return filename

train_filename = maybe_download('train.tar.gz')
test_filename = maybe_download('test.tar.gz')
#extra_filename = maybe_download('extra.tar.gz')

In [3]:
# extracting tar files

np.random.seed(133)

def maybe_extract(filename, force=False):
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    tar = tarfile.open(filename)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
  data_folders = root
  print(data_folders)
  return data_folders
  
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
#extra_folders = maybe_extract(extra_filename)

train already present - Skipping extraction of train.tar.gz.
train
test already present - Skipping extraction of test.tar.gz.
test
Extracting data for extra. This may take a while. Please wait.


IOError: CRC check failed 0x7f72c4d1 != 0xdeb0f02dL

In [4]:
# Source http://www.a2ialab.com/

# SVHN extracts data from the digitStruct.mat full numbers files.   
#
# This is an A2iA tweak (YG -9 Jan 2014) of the script found here :
# http://blog.grimwisdom.com/python/street-view-house-numbers-svhn-and-octave
#
# The digitStruct.mat files in the full numbers tars (train.tar.gz, test.tar.gz, and extra.tar.gz) 
# are only compatible with matlab.

import h5py

# The DigitStructFile is just a wrapper around the h5py data.  It basically references 
#    inf:              The input h5 matlab file
#    digitStructName   The h5 ref to all the file names
#    digitStructBbox   The h5 ref to all struc data
class DigitStructFile:
    def __init__(self, inf):
        self.inf = h5py.File(inf, 'r')
        self.digitStructName = self.inf['digitStruct']['name']
        self.digitStructBbox = self.inf['digitStruct']['bbox']

# getName returns the 'name' string for for the n(th) digitStruct. 
    def getName(self,n):
        return ''.join([chr(c[0]) for c in self.inf[self.digitStructName[n][0]].value])

# bboxHelper handles the coding difference when there is exactly one bbox or an array of bbox. 
    def bboxHelper(self,attr):
        if (len(attr) > 1):
            attr = [self.inf[attr.value[j].item()].value[0][0] for j in range(len(attr))]
        else:
            attr = [attr.value[0][0]]
        return attr

# getBbox returns a dict of data for the n(th) bbox. 
    def getBbox(self,n):
        bbox = {}
        bb = self.digitStructBbox[n].item()
        bbox['height'] = self.bboxHelper(self.inf[bb]["height"])
        bbox['label'] = self.bboxHelper(self.inf[bb]["label"])
        bbox['left'] = self.bboxHelper(self.inf[bb]["left"])
        bbox['top'] = self.bboxHelper(self.inf[bb]["top"])
        bbox['width'] = self.bboxHelper(self.inf[bb]["width"])
        return bbox

    def getDigitStructure(self,n):
        s = self.getBbox(n)
        s['name']=self.getName(n)
        return s

# getAllDigitStructure returns all the digitStruct from the input file.     
    def getAllDigitStructure(self):
        return [self.getDigitStructure(i) for i in range(len(self.digitStructName))]

# Return a restructured version of the dataset (one structure by boxed digit).
#
#   Return a list of such dicts :
#      'filename' : filename of the samples
#      'boxes' : list of such dicts (one by digit) :
#          'label' : 1 to 9 corresponding digits. 10 for digit '0' in image.
#          'left', 'top' : position of bounding box
#          'width', 'height' : dimension of bounding box
#
# Note: We may turn this to a generator, if memory issues arise.
    def getAllDigitStructure_ByDigit(self):
        pictDat = self.getAllDigitStructure()
        result = []
        structCnt = 1
        for i in range(len(pictDat)):
            item = { 'filename' : pictDat[i]["name"] }
            figures = []
            for j in range(len(pictDat[i]['height'])):
               figure = {}
               figure['height'] = pictDat[i]['height'][j]
               figure['label']  = pictDat[i]['label'][j]
               figure['left']   = pictDat[i]['left'][j]
               figure['top']    = pictDat[i]['top'][j]
               figure['width']  = pictDat[i]['width'][j]
               figures.append(figure)
            structCnt = structCnt + 1
            item['boxes'] = figures
            result.append(item)
        return result

In [5]:
# process bbox and label information
train_folders = 'train'
test_folders = 'test'
extra_folders = 'extra'

fin = os.path.join(train_folders, 'digitStruct.mat')
dsf = DigitStructFile(fin)
train_data = dsf.getAllDigitStructure_ByDigit()

fin = os.path.join(test_folders, 'digitStruct.mat')
dsf = DigitStructFile(fin)
test_data = dsf.getAllDigitStructure_ByDigit()

#fin = os.path.join(test_folders, 'digitStruct.mat')
#dsf = DigitStructFile(fin)
#extra_data = dsf.getAllDigitStructure_ByDigit()

In [7]:
# Adapted from https://github.com/hangyao/

def img2dataset(data, folder):
    #define arrays to store image data
    dataset = np.ndarray([len(data),32,32,1], dtype='float32')
    # create array to store length and 5 digits in vector of length 6
    label = np.ones([len(data),6], dtype=int) * 10
    #loop over all images and store 
    for i in np.arange(len(data)):
        filename = data[i]['filename']
        path = os.path.join(folder, filename)
        img = Image.open(path)
        boxes = data[i]['boxes']
        digitcount = len(boxes)
        label[i,0] = digitcount
        top = np.ndarray([digitcount], dtype='float32')
        left = np.ndarray([digitcount], dtype='float32')
        height = np.ndarray([digitcount], dtype='float32')
        width = np.ndarray([digitcount], dtype='float32')
        for j in np.arange(digitcount):
            if j < 5: 
                label[i,j+1] = boxes[j]['label']
                if boxes[j]['label'] == 10: label[i,j+1] = 0
            else: print('image',i,'ignored as too long (>5 digits).')
            top[j] = boxes[j]['top']
            left[j] = boxes[j]['left']
            height[j] = boxes[j]['height']
            width[j] = boxes[j]['width']
        
        # measuring out which box is the highest/most left plus height and width
        img_top = np.amin(top)
        img_left = np.amin(left)
        img_height = np.amax(top) + height[np.argmax(top)] - img_top
        img_width = np.amax(left) + width[np.argmax(left)] - img_left
        
        #calculating cut off region
        img_top = np.floor(img_top - 0.1 * img_height)
        img_left = np.floor(img_left - 0.1 * img_width)
        img_bottom = np.amin([np.ceil(img_top + 1.2 * img_height), img.size[1]])
        img_right = np.amin([np.ceil(img_left + 1.2 * img_width), img.size[0]])
        
        
        # crop and resize to create image of size 32x32 with all digits in it
        img = img.crop((img_left, img_top, img_right, img_bottom)).resize([32,32], Image.ANTIALIAS)
        
        #converting to grayscale
        img = np.dot(np.array(img, dtype='float32'), [[0.2989],[0.5870],[0.1140]])
        
        # normalizing
        mean = np.mean(img, dtype='float32')
        std = np.std(img, dtype='float32', ddof=1)
        if std < 1e-4: std = 1.
        img = (img - mean) / std
        
        dataset[i,:,:,:] = img[:,:,:]
        
    return dataset, label

train_dataset, train_labels = img2dataset(train_data, train_folders)
print(train_dataset.shape, train_labels.shape)

test_dataset, test_labels = img2dataset(test_data, test_folders)
print(test_dataset.shape, test_labels.shape)

#extra_dataset, extra_labels = generate_dataset(extra_data, extra_folders)
#print(extra_dataset.shape, extra_labels.shape)

image 29929 ignored as too long (>5 digits).
(33402L, 32L, 32L, 1L) (33402L, 6L)
(13068L, 32L, 32L, 1L) (13068L, 6L)


NameError: name 'generate_dataset' is not defined

In [14]:
#test a label to make sure you understand the structure
print(test_labels[200,:])

[ 2  3  0 10 10 10]


In [20]:
# combine extra set with training set
total_data = np.concatenate((train_data,extra_data), axis=0)
total_labels = np.concatenate((train_labels,extra_labels), axis=0)

# split in training and validation set
train_data, valid_data, train_labels, valid_labels = train_test_split(total_data, total_labels, train_size=0.9, random_state=106)
del total_data, total_labels, extra_data, extra_labels

# check shape of data after transformation
print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)
print(valid_data.shape, valid_labels.shape)

(13068L, 6L)


In [41]:
# create pickle file for later use
pickle_file = 'SVHN_multi.pickle'

try:
  f = open(pickle_file, 'wb')
  save = {
    'train_dataset': train_data,
    'train_labels': train_labels,
    'test_dataset': test_data,
    'test_labels': test_labels,
    'valid_dataset': valid_data,
    'valid_labels': valid_labels,    
    }
  pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
  f.close()
except Exception as e:
  print('Unable to save data to', pickle_file, ':', e)
  raise
    
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)