In [1]:
import arrow
import pandas as pd

TRAIN = '/kaggle/input/digit-recognizer/train.csv'
TEST = '/kaggle/input/digit-recognizer/test.csv'

time_start = arrow.now()
train_df = pd.read_csv(filepath_or_buffer=TRAIN)
test_df = pd.read_csv(filepath_or_buffer=TEST)

class_count = train_df['label'].nunique()
print('{} data load done.'.format(arrow.now() - time_start))

0:00:06.617369 data load done.


Are our classes balanced? Let's take a look.

In [2]:
from plotly import express
express.histogram(data_frame=train_df, x='label', nbins=class_count)

In [3]:
from plotly import express
express.pie(data_frame=train_df, names='label', color='label')

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
from keras.models import Sequential

# we have some free parameters here; they have nominal values at the moment

model = Sequential([
    layers.Input(shape=(28, 28, 1)),
    
    layers.Conv2D(128, (5, 5), padding='same',activation='relu',),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(3/10),
    
    layers.Conv2D(64, (5, 5), padding='same',activation='relu'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(3/10),

    layers.Conv2D(32, (3, 3), padding='same',activation='relu'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(3/10),

    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    
    layers.Dense(class_count, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],)
model.summary()

2024-04-01 21:03:57.768848: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 21:03:57.768987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 21:03:57.947206: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
import arrow
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

time_start = arrow.now()
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns=['label']).values.reshape(-1, 28, 28, 1), 
                                                    to_categorical(train_df['label'].tolist()), test_size=0.2, random_state=2024,
                                                   stratify=train_df['label'])
print('{} built splits'.format(arrow.now() - time_start))

0:00:00.595105 built splits


In [6]:
import arrow
from tensorflow.keras.callbacks import EarlyStopping

EPOCHS = 25

time_start = arrow.now()
early_stopping = EarlyStopping(monitor='val_accuracy', restore_best_weights=True, mode='auto')
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=128, validation_split=0.1, callbacks=[early_stopping])
print('{}: trained model'.format(arrow.now() - time_start))

Epoch 1/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 531ms/step - accuracy: 0.4229 - loss: 5.8055 - val_accuracy: 0.9634 - val_loss: 0.1272
Epoch 2/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 531ms/step - accuracy: 0.9221 - loss: 0.2405 - val_accuracy: 0.9789 - val_loss: 0.0695
Epoch 3/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 521ms/step - accuracy: 0.9491 - loss: 0.1618 - val_accuracy: 0.9818 - val_loss: 0.0529
Epoch 4/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 519ms/step - accuracy: 0.9573 - loss: 0.1310 - val_accuracy: 0.9857 - val_loss: 0.0429
Epoch 5/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 519ms/step - accuracy: 0.9673 - loss: 0.1066 - val_accuracy: 0.9890 - val_loss: 0.0363
Epoch 6/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 520ms/step - accuracy: 0.9667 - loss: 0.1033 - val_accuracy: 0.9926 - val_loss: 0.0317
Epoc

What's our test accuracy? If it's poor we probably shouldn't build a submission.

In [7]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('loss: {:5.4f} accuracy: {:5.4f}'.format(test_loss, test_accuracy))

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 34ms/step - accuracy: 0.9880 - loss: 0.0445
loss: 0.0444 accuracy: 0.9880


In [8]:
import arrow
import numpy as np

RESULT_FILE = '/kaggle/working/KerasCNN.csv.zip'

time_start = arrow.now()
result_df = pd.DataFrame(data=np.argmax(model.predict(test_df.values.reshape(len(test_df), 28, 28, 1)), axis=1), 
                         columns=['Label']).reset_index().rename(columns={'index': 'ImageId'})
result_df['ImageId'] += 1
print('{} : writing result to {}'.format(arrow.now() - time_start, RESULT_FILE))
result_df.to_csv(path_or_buf=RESULT_FILE, index=False, compression='zip')
print('{} : done.'.format(arrow.now() - time_start, ))

[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 32ms/step
0:00:41.269785 : writing result to /kaggle/working/KerasCNN.csv.zip
0:00:41.345399 : done.
