## Imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
# Collect the full path of every file below the Kaggle input directory
# (walk order is preserved), then print one path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# Kaggle notes: up to 20GB written to the current directory (/kaggle/working/) is preserved as output
# when a version is created with "Save & Run All".
# Temporary files can also go to /kaggle/temp/, but they are not saved outside of the current session.

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


## Data set
The training data set (train.csv) has 785 columns. The first column, called "label", is the digit that was drawn by the user. The remaining 784 columns contain the pixel values of the associated 28x28 image, indexed row by row as shown below.

000 001 002 003 ... 026 027 \
028 029 030 031 ... 054 055 \
056 057 058 059 ... 082 083 \
 |   |   |   |  ...  |   | \
728 729 730 731 ... 754 755 \
756 757 758 759 ... 782 783 

In [3]:
# Read the training CSV into a DataFrame and show its (rows, columns) shape
train_file_path = '/kaggle/input/digit-recognizer/train.csv'
df = pd.read_csv(filepath_or_buffer=train_file_path)
print(df.shape)

(42000, 785)


## Separate the pixel features from the label column

In [4]:
from sklearn.model_selection import train_test_split

# `target` is the 'label' column; `features` is every remaining (pixel) column
target = df['label']
features = df.drop(columns='label')

## Convert the labels to one-hot encoded vectors

In [5]:
from sklearn.preprocessing import OneHotEncoder

# The encoder expects a 2-D input, so the label Series is turned into a single column
label_column = target.to_numpy().reshape(-1, 1)

# Dense (non-sparse) one-hot matrix: one row per sample, one column per distinct label
encoder = OneHotEncoder(sparse_output=False)
target_encoded = encoder.fit_transform(label_column)

# Report the shape of the encoded labels
print("Encoded labels shape:", target_encoded.shape)

Encoded labels shape: (42000, 10)


## Reshape the features and split the data into train and test sets (80%-20%)

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Image geometry used to give every flat pixel row an image-shaped layout
image_height, image_width = 28, 28
num_channels = 1  # a single channel, i.e. grayscale images are assumed

# Reshape to (samples, height, width, channels) and divide every pixel value by 255.0
pixel_matrix = features.to_numpy()
features_reshaped = pixel_matrix.reshape((-1, image_height, image_width, num_channels)) / 255.0

# Hold out 20% of the samples; random_state pins the shuffle so the split is repeatable
train_features, test_features, train_target, test_target = train_test_split(
    features_reshaped,
    target_encoded,
    test_size=0.2,
    random_state=42,
)

print(features_reshaped.shape)
# Report the shape of each part of the split
for caption, split_part in (
    ("Training features shape:", train_features),
    ("Testing features shape:", test_features),
    ("Training target shape:", train_target),
    ("Testing target shape:", test_target),
):
    print(caption, split_part.shape)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


(42000, 28, 28, 1)
Training features shape: (33600, 28, 28, 1)
Testing features shape: (8400, 28, 28, 1)
Training target shape: (33600, 10)
Testing target shape: (8400, 10)


# Define the Convolutional Neural Network Architecture


In [7]:
# Two convolution + max-pooling stages, then a dense classifier head.
# The layers are listed in the order in which they are applied.
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu',
           input_shape=(image_height, image_width, num_channels)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # 10 softmax outputs, matching the 10 columns of the one-hot encoded targets
    Dense(10, activation='softmax'),
])

# Categorical cross-entropy pairs with the one-hot targets; accuracy is tracked as a metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

## Train the model

In [8]:
# Sanity check: show the shapes of the arrays about to be passed to model.fit
for caption, array in (
    ("train_features shape : ", train_features),
    ("train_target shape : ", train_target),
):
    print(caption, array.shape)

train_features shape :  (33600, 28, 28, 1)
train_target shape :  (33600, 10)


In [9]:
# Fit for 10 epochs in batches of 128 samples, passing the held-out split as validation data.
# The call is left as the cell's last expression, so its return value is displayed.
model.fit(
    x=train_features,
    y=train_target,
    batch_size=128,
    epochs=10,
    validation_data=(test_features, test_target),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f335028eb90>

## Evaluation of the model


In [10]:
# Score the trained model on the held-out split; the result unpacks into loss and accuracy
test_loss, test_accuracy = model.evaluate(
    x=test_features,
    y=test_target,
)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

Test loss: 0.03522825986146927
Test accuracy: 0.989047646522522


In [11]:
# Load the competition test set that the submission is built from
test_file_path = '/kaggle/input/digit-recognizer/test.csv'
test_df = pd.read_csv(test_file_path)

# Reshape and scale the pixels exactly like the training features.
# A dedicated name is used on purpose: rebinding `test_features` here would replace the
# hold-out split that pairs with `test_target`, so re-running the fit/evaluate cells
# afterwards would feed them features and targets of different lengths.
submission_features = test_df.to_numpy().reshape(-1, image_height, image_width, num_channels) / 255.0

# One row of class probabilities per image
predictions = model.predict(submission_features)

# The predicted label is the index of the highest probability in each row
# (assumes the encoder's categories are the digits 0-9 in ascending order, so index == digit)
predicted_labels = np.argmax(predictions, axis=1)

# ImageId is 1-based and follows the row order of test.csv
submission_df = pd.DataFrame({
    'ImageId': range(1, len(predicted_labels) + 1),
    'Label': predicted_labels,
})

# Write the submission without the DataFrame index column
submission_file_path = 'submission.csv'
submission_df.to_csv(submission_file_path, index=False)

