----
## Step 2: Train a Model on a Realistic Dataset
Once you have settled on a good architecture, you can train your model on real data. In particular, the [Street View House Numbers (SVHN)](http://ufldl.stanford.edu/housenumbers/) dataset is a good large-scale dataset collected from house numbers in Google Street View. Training on this more challenging dataset, where the digits are not neatly lined up and have various skews, fonts, and colors, likely means you have to do some hyperparameter exploration to perform well.

In [4]:
from sklearn.utils import shuffle
import numpy as np
import h5py
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import misc
from keras.utils import np_utils
from keras.models import Model
from keras.layers import Dense, Input, merge, Convolution2D, MaxPooling2D
from keras.layers import Dropout, Flatten
#Code structure to import .mat file
def mat_open(f, max_digits=5):
    """Read the ``digitStruct`` annotations from an open HDF5 ``.mat`` file.

    Parameters
    ----------
    f : h5py.File-like
        Open handle exposing ``digitStruct/bbox`` and ``digitStruct/name``
        and resolving the object references stored in them via ``f[ref]``.
    max_digits : int, optional
        Maximum number of digits (and bounding boxes) kept per image.
        Longer sequences are truncated. Defaults to 5.

    Returns
    -------
    dict
        NumPy arrays keyed by:

        * ``'names'`` -- one file name per image.
        * ``'left'``, ``'top'``, ``'width'``, ``'height'`` -- per-image
          box values, at most ``max_digits`` entries each.
        * ``'right'``, ``'bottom'`` -- ``left + width`` and ``top + height``.
        * ``'label'`` and ``'unified_label'`` -- shape
          ``(n_images, max_digits + 1)``: the digit labels right-padded with
          0, followed by the number of digits kept.

    Notes
    -----
    The digit-count slot is capped at ``max_digits`` so that it always agrees
    with the truncated digits and boxes; previously an over-long label
    pushed the count out of the fixed-width row.
    """
    data = {key: [] for key in
            ('height', 'label', 'left', 'top', 'width', 'names')}

    def get_bbox(name, obj):
        # A dataset with a single row holds its value inline; otherwise each
        # row holds a reference that has to be resolved through ``f``.
        if obj.shape[0] == 1:
            vals = [obj[0][0]]
        else:
            vals = [f[obj[k][0]][0][0] for k in range(obj.shape[0])]
        data[name].append(vals)

    for item in f['digitStruct/bbox']:
        f[item[0]].visititems(get_bbox)

    name_refs = f['/digitStruct/name']
    for index in range(name_refs.shape[0]):
        # ``[()]`` reads the whole dataset and works across h5py versions,
        # unlike the ``Dataset.value`` attribute that h5py 3 removed.
        char_codes = f[name_refs[index][0]][()]
        data['names'].append(''.join(chr(v[0]) for v in char_codes))

    # Fixed-width label rows: ``max_digits`` digit slots plus the digit count.
    fixed_labels = []
    for label in data['label']:
        digits = list(label[:max_digits])
        count = len(digits)
        fixed_labels.append(digits + [0] * (max_digits - count) + [count])
    data['label'] = fixed_labels

    # Keep the boxes consistent with the truncated labels.
    for key in ('left', 'top', 'width', 'height'):
        data[key] = [vals[:max_digits] for vals in data[key]]

    for key in data:
        data[key] = _stack_rows(data[key])

    # Right and bottom edges of every bounding box.
    data['right'] = data['left'] + data['width']
    data['bottom'] = data['top'] + data['height']

    # Width is derived from ``max_digits`` rather than from the first label,
    # so an empty dataset yields a (0, max_digits + 1) array instead of
    # raising IndexError.
    data['unified_label'] = np.zeros((len(data['label']), max_digits + 1))
    for i, row in enumerate(data['label']):
        data['unified_label'][i, :] = row
    return data


def _stack_rows(rows):
    """Convert a list of per-image sequences into one NumPy array.

    Rows of equal shape are stacked into a regular array. Rows of differing
    shape become a 1-D object array holding one array per row, built
    explicitly because recent NumPy versions refuse to infer ragged arrays.
    """
    arrays = [np.asarray(row) for row in rows]
    if len({arr.shape for arr in arrays}) <= 1:
        return np.asarray(arrays)
    ragged = np.empty(len(arrays), dtype=object)
    for i, arr in enumerate(arrays):
        ragged[i] = arr
    return ragged

In [None]:
# Parse the annotation files of the three SVHN splits.
# The files are opened explicitly read-only ('r'): h5py releases before 3.0
# did not default to read-only, so a wrong path could silently create or
# modify a file instead of raising.
# The handles are left open on purpose in case later cells still use them.
train_file = h5py.File('train/train/digitStruct.mat', 'r')
data = mat_open(train_file)
test_file = h5py.File('test/test/digitStruct.mat', 'r')
test_data = mat_open(test_file)
extra_file = h5py.File('extra/extra/digitStruct.mat', 'r')
extra = mat_open(extra_file)

In [8]:
np.random.seed(0)
from sklearn.utils import shuffle
# Shuffle every array of every split with the same fixed random_state, so
# arrays of equal length within a split receive the same permutation and
# stay aligned with each other.
for dataset in (data, test_data, extra):
    for key in dataset:
        dataset[key] = shuffle(dataset[key], random_state=10)




In [9]:
try:
    import cPickle as pickle  # Python 2: C implementation of pickle
except ImportError:
    import pickle  # Python 3: cPickle was merged into pickle

# Persist each parsed split. The ``with`` block closes (and therefore
# flushes) every output file as soon as its dump has finished.
for split, path in ((data, "data.p"),
                    (test_data, "test_data.p"),
                    (extra, "extra.p")):
    with open(path, "wb") as handle:
        pickle.dump(split, handle)