In [None]:
# Loading required packages
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding

In [None]:
# Load the data
# Read the two Kaggle digit-recognizer CSVs in order: the training set
# (which carries the 'label' column) and the competition test set.
train_data, test_data = map(
    pd.read_csv,
    (
        '/kaggle/input/digit-recognizer/train.csv',
        '/kaggle/input/digit-recognizer/test.csv',
    ),
)

In [None]:
# Set some parameters:
# SEED is passed as random_state to train_test_split, so the train/hold-out
# partition is identical on every run.
# NOTE(review): nothing visible in this notebook seeds NumPy or Keras, so the
# model training itself is presumably not reproducible — confirm.
SEED = 13

## **Data Inspection**

In [None]:
# Preview the first rows of the training frame to see the column layout.
train_data.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train_data.shape

Our training data consists of 42,000 rows and 785 columns. Each row represents an individual image. The first column is the "label" for the digit in the image, while the remaining 784 columns hold the pixel values of that image.

### Checking for Missing Values

In [None]:
# isnull().any() gives one boolean per column (True if that column contains
# any missing value); describe() condenses those booleans into a small summary.
# A single unique value of False means no column has missing data.
train_data.isnull().any().describe()

In [None]:
# Same per-column missing-value summary, this time for the competition test set:
# one boolean per column from isnull().any(), condensed by describe().
test_data.isnull().any().describe()

There are no missing values based on the initial data inspection. 

### Examine the Distribution of the Labels

In [None]:
# Number of training rows per label value, most frequent first.
train_data['label'].value_counts()

In [None]:
# Bar chart of how many training rows carry each label.
# The column is named by keyword: seaborn 0.11 warns when a vector is passed
# positionally, and from 0.12 on the only positional argument of countplot is
# `data`, so a bare Series is no longer interpreted as the x variable.
# The trailing semicolon hides the Axes repr so only the figure is shown.
sns.countplot(data=train_data, x='label');

From the plot it appears that the 10 digits have fairly similar counts, so the classes are relatively balanced. We can therefore prepare the data for modelling.

## Preparing the Data for Prediction

### Separating the Predictors from the Labels

In [None]:
# Split the training frame column-wise: the 'label' column is the target (y),
# every remaining column is a predictor (X).
y = train_data['label']
X = train_data.drop(columns='label')

### Normalizing the Data to Increase Model Efficiency

In [None]:
# Scale every pixel column by 1/255.
# X is rebuilt from train_data instead of being rescaled in place, so running
# this cell more than once cannot divide already-scaled pixels by 255 again.
X = train_data.drop('label', axis=1) / 255.0

First we split the data into training and test sets so that we can get a sense of the model's performance on unseen data.

In [None]:
# Hold out 10% of the labelled rows for an out-of-sample check; SEED fixes the
# shuffle so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=SEED,
)

### Reshaping the data for CNN

In [None]:
# Turn each flat pixel row into a 28x28 single-channel float32 image, matching
# the input_shape=(28, 28, 1) declared on the first Conv2D layer.
# np.asarray accepts both the DataFrame coming out of train_test_split and an
# array that has already been converted, so re-running this cell is harmless
# instead of failing on a missing `.values` attribute.
X_train = np.asarray(X_train, dtype='float32').reshape(-1, 28, 28, 1)
X_test = np.asarray(X_test, dtype='float32').reshape(-1, 28, 28, 1)

In [None]:
# Show the image-array shapes, training split first and hold-out split second,
# one per line.
print(X_train.shape, X_test.shape, sep='\n')

### One-Hot Encoding of the Target Variable 

In [None]:
# One-hot encode both label vectors into 10-column indicator matrices, the
# target format used with the categorical_crossentropy loss.
y_train, y_test = (
    to_categorical(labels, num_classes=10)
    for labels in (y_train, y_test)
)

In [None]:
# Show the encoded-label shapes, training split first and hold-out split
# second, one per line.
print(y_train.shape, y_test.shape, sep='\n')

## Prediction Through CNN

In [None]:
# Setting up the neural network model
# Architecture, in order: one 3x3 convolution with 32 filters ('same' padding,
# ReLU) over the 28x28x1 input, a flatten step, a 256-unit ReLU dense layer and
# a 10-unit softmax output (one unit per digit class).
model = Sequential([
    Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(28, 28, 1)),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(10, activation='softmax'),
])

# Print the layer-by-layer summary table.
model.summary()

In [None]:
# Compiling and fitting the model

# Adam optimiser with categorical cross-entropy (the targets are one-hot
# encoded); accuracy is reported alongside the loss.
model.compile(optimizer = 'adam', 
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])

# Train for 10 epochs, holding back 20% of the training split for validation.
# The History object returned by fit() is kept so the per-epoch training and
# validation metrics can be plotted later without retraining.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=10)

In [None]:
# Evaluating the model for out of sample performance
# Score the fitted model on the hold-out split; the result is the loss
# followed by the accuracy metric requested in compile().
model.evaluate(x=X_test, y=y_test, batch_size=32)

## Predicting the Test Data for Submission

In [None]:
# Reshaping the data for CNN
# The competition pixels get the same 1/255 scaling that was applied to the
# training pixels before fitting; feeding raw values to a model trained on
# scaled inputs would distort its predictions. The division happens before the
# float32 cast, mirroring the order used for the training data.
test_data_OH = (test_data.values / 255.0).reshape(-1, 28, 28, 1).astype('float32')
print(test_data_OH.shape)

In [None]:
# Prediction Dataset
# model.predict returns one row of 10 softmax scores per image; the index of
# the largest score in each row is the predicted digit. argmax over axis 1
# does this in a single vectorised call instead of a Python-level loop.
probabilities = model.predict(test_data_OH, batch_size = 32)
predictions = np.argmax(probabilities, axis=1)

# Submission table: ImageId counts the test rows starting from 1, Label is the
# predicted digit for that row.
submission = pd.DataFrame({'ImageId' : np.arange(1,len(test_data)+1), 'Label' : predictions})

In [None]:
# Export to csv
# Write the ImageId/Label table to the working directory, leaving out the
# DataFrame index so the file contains only those two columns.
submission.to_csv(path_or_buf="cnn_mnist_predictions.csv", index=False)

to be continued

to add:

- Validation plots to assess overfitting
- Examination of the wrongly-classified images
- Actual prediction file (for submission)