# MNIST Handwritten Digit Recognition
Depends on MNIST in CSV format: http://yann.lecun.com/exdb/mnist/
    


In [14]:
#imports
import numpy as np 
import pandas as pd 
import matplotlib.pylab as plt

import keras
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from keras.models import Sequential
from keras.utils import to_categorical, normalize
from keras.layers.normalization import BatchNormalization


In [4]:
#!ls
#!pwd
#!ls ..
#!ls ../input
#example_file = "../input/sample_submission.csv"
#df = pd.read_csv(example_file)
#print(df.head())

In [5]:
#constants
#base_path = '../input/'
base_path = ''
train_file = base_path + 'train.csv'
test_file = base_path + 'test.csv'

num_classes = 10
dev_set_fraction = 0.05

w = 28
h = 28
c = 1

keeprate = 0.8

In [6]:
#explore
df = pd.read_csv(train_file)
print(df.head())
print("shape: " + str(df.shape))
df.describe()

dft = pd.read_csv(test_file)
print(dft.head())
print("shape: " + str(dft.shape))
dft.describe()


   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0   
1       0    ...            0         0         0         0         0   
2       0    ...            0         0         0         0         0   
3       0    ...            0         0         0         0         0   
4       0    ...            0         0         0         0         0   

   pixel779  pixel780  pixel781  pixel782  pixel783  
0         0         0         0         0         

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,...,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0,28000.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.164607,0.073214,0.028036,0.01125,0.006536,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.473293,3.616811,1.813602,1.205211,0.807475,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,253.0,254.0,193.0,187.0,119.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# extract
df = pd.read_csv(train_file)

m = df.shape[0]
pixelnum = df.shape[1]-1

# randomize so we can split again to get a dev set
df = df.sample(frac=1).reset_index(drop=True)

x_train = df.iloc[:, 1:pixelnum+1].values
y_train = to_categorical(df['label'].values, num_classes=10)

x_train = normalize(x_train)

split = int(m - m*dev_set_fraction)
x = x_train[:split, :]
y = y_train[:split, :]
x_dev = x_train[split:, :]
y_dev = y_train[split:, :]

dftest = pd.read_csv(test_file)
x_test = normalize(dftest.values)

print("x: {}, y: {}".format(x.shape, y.shape))
print("x_dev: {}, y_dev: {}".format(x_dev.shape, y_dev.shape))
print("x_test: {}".format(x_test.shape))

x: (39900, 784), y: (39900, 10)
x_dev: (2100, 784), y_dev: (2100, 10)
x_test: (28000, 784)


In [12]:
# reshape 2+1d
def reshape(x):
    return x.reshape(x.shape[0],w,h,c)
x = reshape(x)
x_dev = reshape(x_dev)
x_test = reshape(x_test)

print("x: {}, y: {}".format(x.shape, y.shape))
print("x_dev: {}, y_dev: {}".format(x_dev.shape, y_dev.shape))
print("x_test: {}".format(x_test.shape))

x: (39900, 28, 28, 1), y: (39900, 10)
x_dev: (2100, 28, 28, 1), y_dev: (2100, 10)
x_test: (28000, 28, 28, 1)


In [19]:
# model structure
model = Sequential()

model.add(Conv2D(32, (3,3), activation='relu', name="c1",  input_shape=(w,h,c)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

model.add(Conv2D(64, (3,3), activation='relu', name="c2"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(BatchNormalization())

model.add(Flatten())

model.add(Dense(256, activation='relu', name="d1"))
model.add(Dense(num_classes, activation='softmax', name="d_fin"))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [20]:
#fit
model.fit(x, y,
          batch_size=128,
          epochs=3,
          verbose=1
          ,validation_data=(x_dev, y_dev)
         )
         
score = model.evaluate(x_dev, y_dev, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 39900 samples, validate on 2100 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test loss: 0.07273077197727702
Test accuracy: 0.9795238095238096


In [9]:
y_test = model.predict(x_test)
y_test = np.argmax(y_test, axis=1).astype(int)
y_test = pd.DataFrame({"ImageId": np.arange(y_test.shape[0])+1,"Label": y_test})
print(y_test.head())
y_test.to_csv("kaggle_submission.csv", sep=",", index=False)

   ImageId  Label
0        1      2
1        2      0
2        3      9
3        4      2
4        5      3
