In [1]:
import arrow
import pandas as pd

TRAIN = '/kaggle/input/digit-recognizer/train.csv'
TEST = '/kaggle/input/digit-recognizer/test.csv'

time_start = arrow.now()
train_df = pd.read_csv(filepath_or_buffer=TRAIN)
test_df = pd.read_csv(filepath_or_buffer=TEST)

class_count = train_df['label'].nunique()
print('{} data load done.'.format(arrow.now() - time_start))

0:00:06.943967 data load done.


Are our classes balanced? Let's take a look.

In [2]:
from plotly import express
express.histogram(data_frame=train_df, x='label', nbins=class_count)

In [3]:
from plotly import express
express.pie(data_frame=train_df, names='label', color='label')

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
from keras.models import Sequential

# we have some free parameters here; they have nominal values at the moment

model = Sequential([
    layers.Input(shape=(28, 28, 1)),
    
    layers.Conv2D(128, (5, 5), padding='same',activation='relu',),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(4/10),
    
    layers.Conv2D(64, (5, 5), padding='same',activation='relu'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(4/10),

    layers.Conv2D(32, (3, 3), padding='same',activation='relu'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(4/10),

    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(5/10),
    layers.Dense(class_count, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],)
model.summary()

2024-04-02 12:52:26.756267: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-02 12:52:26.756518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-02 12:52:26.917397: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
import arrow
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

time_start = arrow.now()
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns=['label']).values.reshape(-1, 28, 28, 1), 
                                                    to_categorical(train_df['label'].tolist()), test_size=0.2, random_state=2024,
                                                   stratify=train_df['label'])
print('{} built splits'.format(arrow.now() - time_start))

0:00:00.583230 built splits


In [6]:
import arrow
from tensorflow.keras.callbacks import EarlyStopping

EPOCHS = 25

time_start = arrow.now()
early_stopping = EarlyStopping(monitor='val_accuracy', restore_best_weights=True, mode='auto')
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=128, validation_split=0.1, callbacks=[early_stopping])
print('{}: trained model'.format(arrow.now() - time_start))

Epoch 1/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 529ms/step - accuracy: 0.2666 - loss: 5.0969 - val_accuracy: 0.8899 - val_loss: 0.4048
Epoch 2/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 541ms/step - accuracy: 0.7820 - loss: 0.6692 - val_accuracy: 0.9512 - val_loss: 0.1705
Epoch 3/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 536ms/step - accuracy: 0.8801 - loss: 0.3892 - val_accuracy: 0.9690 - val_loss: 0.1164
Epoch 4/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 537ms/step - accuracy: 0.9084 - loss: 0.2953 - val_accuracy: 0.9723 - val_loss: 0.0865
Epoch 5/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 535ms/step - accuracy: 0.9276 - loss: 0.2424 - val_accuracy: 0.9771 - val_loss: 0.0766
Epoch 6/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 538ms/step - accuracy: 0.9374 - loss: 0.2056 - val_accuracy: 0.9807 - val_loss: 0.0618
Epoc

Let's visualize our training history before we proceed.

In [7]:
import warnings

warnings.filterwarnings(action='ignore', category=FutureWarning)
history_df = pd.DataFrame(data={key: history.history[key] for key in history.history.keys()}).reset_index().rename(columns={'index': 'epoch'})
express.line(data_frame=history_df, x='epoch', y=history_df.columns, log_y=True)

What's our test accuracy? If our test accuracy isn't above 0.99 we don't want to submit the results we're building below.

In [8]:
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, batch_size=None, verbose='auto', sample_weight=None, steps=None, callbacks=None,
                                          return_dict=False,)
print('loss: {:5.4f} accuracy: {:5.4f}'.format(test_loss, test_accuracy))

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.9769 - loss: 0.0756
loss: 0.0735 accuracy: 0.9777


In [9]:
import arrow
import numpy as np

RESULT_FILE = '/kaggle/working/KerasCNN.csv.zip'

if test_accuracy > 0.99:
    time_start = arrow.now()
    result_df = pd.DataFrame(data=np.argmax(model.predict(x=test_df.values.reshape(len(test_df), 28, 28, 1), batch_size=None, verbose='auto',
                                                          steps=None, callbacks=None), axis=1), 
                             columns=['Label']).reset_index().rename(columns={'index': 'ImageId'})
    result_df['ImageId'] += 1
    print('{} : writing result to {}'.format(arrow.now() - time_start, RESULT_FILE))
    result_df.to_csv(path_or_buf=RESULT_FILE, index=False, compression='zip')
    print('{} : done.'.format(arrow.now() - time_start, ))
else:
    print('test accuracy too low; try again?')

test accuracy too low; try again?
