# https://towardsdatascience.com/image-classification-in-10-minutes-with-mnist-dataset-54c35b77a38d

In [24]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [8]:
# Load the labelled training images from the CSV that sits next to the notebook.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

In [9]:
# Hold out the last 10% of the rows as a dev set; the first 90% are used for training.
# The cut is positional: rows keep their original order, nothing is shuffled.
split_point = int(0.9 * len(train_data))
training_set, dev_set = train_data.iloc[:split_point], train_data.iloc[split_point:]
print(
    "The training set has", len(training_set),
    "images and the dev set has", len(dev_set), "images",
)

The training set has 37800 images and the dev set has 4200 images


In [11]:
# Separate the "label" column (targets) from every other column (inputs),
# converting both to plain NumPy arrays for the training and the dev split.
x_train, y_train = training_set.drop(columns="label").values, training_set["label"].values
x_dev, y_dev = dev_set.drop(columns="label").values, dev_set["label"].values

In [12]:
# Turn each row into a 28x28x1 array, cast to float32 and divide by 255.
# The right-hand side is fully evaluated before the name is rebound, so
# len(...) still refers to the array coming from the previous cell.
x_train = x_train.reshape(len(x_train), 28, 28, 1).astype("float32") / 255
x_dev = x_dev.reshape(len(x_dev), 28, 28, 1).astype("float32") / 255

In [14]:
# Per-sample shape passed to the first Conv2D layer: 28 x 28 with a trailing
# axis of size 1, i.e. the same trailing dimensions used when the image arrays
# are reshaped to (n, 28, 28, 1).
input_shape = (28, 28, 1)

In [15]:
# Importing the required Keras modules containing model and layers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D

# Build the network as one ordered list of layers.
# Activations are given as Keras string identifiers rather than tf.nn.* callables,
# so this cell only needs the `keras` imports above and does not rely on a
# TensorFlow import made in another cell.
model = Sequential([
    # 28 filters of size 3x3 over the input image.
    # NOTE(review): no activation is passed here, so the convolution output is
    # linear; consider activation="relu" if that was not intentional.
    Conv2D(28, kernel_size=(3, 3), input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),  # Flattening the 2D arrays for fully connected layers
    Dense(128, activation="relu"),
    Dropout(0.2),
    # One output per class; softmax turns the 10 outputs into probabilities.
    Dense(10, activation="softmax"),
])

In [16]:
# Configure training: Adam optimizer, sparse categorical cross-entropy
# (labels are passed as integer class ids), and accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for 10 passes over the training arrays; the returned History object
# is left as the cell's displayed value.
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ef0fc874a8>

In [17]:
# Score the trained model on the held-out dev arrays; the result is displayed
# as the cell output (loss followed by the compiled metrics).
model.evaluate(x=x_dev, y=y_dev)



[0.06729150502395073, 0.9826190476190476]

In [18]:
# Prediction on the test set (to upload to Kaggle):
# load the contest images from the CSV that sits next to the notebook.
contest_data = pd.read_csv(filepath_or_buffer='test.csv')

In [20]:
# Convert the contest DataFrame to an array of 28x28x1 float32 images divided by 255.
# len(contest_data) is evaluated on the DataFrame (its row count) before the
# name is rebound to the resulting array.
contest_data = (
    contest_data.values
    .reshape(len(contest_data), 28, 28, 1)
    .astype("float32")
    / 255
)

In [21]:
# Run the trained model on the prepared contest images.
predictions = model.predict(x=contest_data)

In [25]:
# For every row of `predictions`, take the position of its largest value and
# use it as the predicted label. np.asarray accepts either a raw array or a
# DataFrame wrapping one, so the cell can be re-run safely, and the vectorised
# argmax replaces a row-by-row Python loop.
predicted_labels = pd.DataFrame({"Label": np.asarray(predictions).argmax(axis=1)})
# ImageId is the 1-based row number, derived from the default 0-based index.
predicted_labels["ImageId"] = predicted_labels.index + 1

In [27]:
# Preview the first five rows of the submission table.
predicted_labels.head(n=5)

Unnamed: 0,Label,ImageId
0,2,1
1,0,2
2,9,3
3,9,4
4,3,5


In [28]:
# Write the submission file with a header row and without the DataFrame index.
predicted_labels.to_csv("predicted_data_cnn.csv", header=True, index=False)