In [1]:
import arrow
import pandas as pd

TRAIN = '/kaggle/input/digit-recognizer/train.csv'
TEST = '/kaggle/input/digit-recognizer/test.csv'

time_start = arrow.now()
train_df = pd.read_csv(filepath_or_buffer=TRAIN)
test_df = pd.read_csv(filepath_or_buffer=TEST)

class_count = train_df['label'].nunique()
print('{} data load done.'.format(arrow.now() - time_start))

0:00:07.506596 data load done.


Are our classes balanced? Let's take a look.

In [2]:
from plotly import express
express.histogram(data_frame=train_df, x='label', nbins=class_count)

In [3]:
from plotly import express
express.pie(data_frame=train_df, names='label', color='label')

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
from keras.models import Sequential

# TODO fix this
# UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. 
# When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

# we have some free parameters here; they have nominal values at the moment

model = Sequential([
    layers.Conv2D(128, (5, 5), padding='same',activation='relu', input_shape=(28, 28, 1)),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(3/10),

    layers.Conv2D(64, (5, 5), padding='same',activation='relu'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(3/10),

    layers.Conv2D(32, (3, 3), padding='same',activation='relu'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(3/10),

    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    
    layers.Dense(class_count, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],)
model.summary()

2024-04-01 20:49:03.406272: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-01 20:49:03.406468: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-01 20:49:03.644860: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



In [5]:
import arrow
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

time_start = arrow.now()
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns=['label']).values.reshape(-1, 28, 28, 1), 
                                                    to_categorical(train_df['label'].tolist()), test_size=0.2, random_state=2024,
                                                   stratify=train_df['label'])
print('{} built splits'.format(arrow.now() - time_start))

0:00:00.629635 built splits


In [6]:
import arrow
from tensorflow.keras.callbacks import EarlyStopping

EPOCHS = 25

time_start = arrow.now()
early_stopping = EarlyStopping(monitor='val_accuracy', restore_best_weights=True, mode='auto')
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=128, validation_split=0.1, callbacks=[early_stopping])
print('{}: trained model'.format(arrow.now() - time_start))

Epoch 1/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 640ms/step - accuracy: 0.4475 - loss: 4.8745 - val_accuracy: 0.9643 - val_loss: 0.1185
Epoch 2/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 601ms/step - accuracy: 0.9221 - loss: 0.2499 - val_accuracy: 0.9768 - val_loss: 0.0700
Epoch 3/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 603ms/step - accuracy: 0.9505 - loss: 0.1572 - val_accuracy: 0.9854 - val_loss: 0.0430
Epoch 4/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 611ms/step - accuracy: 0.9632 - loss: 0.1178 - val_accuracy: 0.9860 - val_loss: 0.0355
Epoch 5/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 599ms/step - accuracy: 0.9658 - loss: 0.1102 - val_accuracy: 0.9896 - val_loss: 0.0314
Epoch 6/25
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 598ms/step - accuracy: 0.9696 - loss: 0.0949 - val_accuracy: 0.9857 - val_loss: 0.0408
0:19

What's our test accuracy? If it's poor we probably shouldn't build a submission.

In [7]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print('loss: {:5.4f} accuracy: {:5.4f}'.format(test_loss, test_accuracy))

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step - accuracy: 0.9832 - loss: 0.0498
loss: 0.0476 accuracy: 0.9854


In [8]:
import arrow
import numpy as np

RESULT_FILE = '/kaggle/working/KerasCNN.csv.zip'

time_start = arrow.now()
result_df = pd.DataFrame(data=np.argmax(model.predict(test_df.values.reshape(len(test_df), 28, 28, 1)), axis=1), 
                         columns=['Label']).reset_index().rename(columns={'index': 'ImageId'})
result_df['ImageId'] += 1
print('{} : writing result to {}'.format(arrow.now() - time_start, RESULT_FILE))
result_df.to_csv(path_or_buf=RESULT_FILE, index=False, compression='zip')
print('{} : done.'.format(arrow.now() - time_start, ))

[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 35ms/step
0:00:31.527529 : writing result to /kaggle/working/KerasCNN.csv.zip
0:00:31.603553 : done.
