<a href="https://colab.research.google.com/github/minuraashen/Machine-Learning/blob/main/Digit_sequence_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Upload the dataset**

In [32]:
from google.colab import files
uploaded = files.upload()

Saving MNIST_Sequence.zip to MNIST_Sequence.zip


In [33]:
!mkdir -p ./data
!unzip MNIST_Sequence.zip -d ./data

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./data/MNIST Sequence/train/train_5500_939.png  
  inflating: ./data/MNIST Sequence/train/train_5501_514.png  
  inflating: ./data/MNIST Sequence/train/train_5502_1234.png  
  inflating: ./data/MNIST Sequence/train/train_5503_848.png  
  inflating: ./data/MNIST Sequence/train/train_5504_11572.png  
  inflating: ./data/MNIST Sequence/train/train_5505_84.png  
  inflating: ./data/MNIST Sequence/train/train_5506_9347.png  
  inflating: ./data/MNIST Sequence/train/train_5507_5954.png  
  inflating: ./data/MNIST Sequence/train/train_5508_47.png  
  inflating: ./data/MNIST Sequence/train/train_5509_522.png  
  inflating: ./data/MNIST Sequence/train/train_550_40.png  
  inflating: ./data/MNIST Sequence/train/train_5510_1313.png  
  inflating: ./data/MNIST Sequence/train/train_5511_587.png  
  inflating: ./data/MNIST Sequence/train/train_5512_3704.png  
  inflating: ./data/MNIST Sequence/train/train_5513_60423.png  


### **Data Loading and Preprocessing**

In [34]:
import os
import cv2
import numpy as np
import tensorflow as tf

In [35]:
#Character set for digits
char_list = '0123456789'
char_to_idx = {c: i for i, c in enumerate(char_list)}

In [37]:
#Path for data
train_data_path = '/content/data/MNIST Sequence/test'
test_data_path = '/content/data/MNIST Sequence/test'

In [38]:
IMG_HEIGHT = 32
IMG_WIDTH = 128

In [39]:
def load_data(folder_path):
  images, labels = [], []
  for file_name in os.listdir(folder_path):
    if file_name.endswith('.png'):
      img_path = os.path.join(folder_path, file_name)
      label_text = file_name.split('_')[-1].replace('.png', '')

      #read the image in grayscale
      img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
      #Resize to fixed shape
      img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
      #Normalize image pixel values
      img = img / 255
      images.append(img)
      labels.append(label_text)
  return np.array(images), labels

#Load Train and Test data sets
x_train, y_train_labels = load_data(train_data_path)
x_test, y_test_labels = load_data(test_data_path)

print(len(x_train), len(y_train_labels))
print(len(x_test), len(y_test_labels))

10000 10000
10000 10000


### **Encode Labels and Pad**

In [44]:
def encode_labels(labels):
  encoded = []
  lengths = []
  for label in labels:
    seq = [char_to_idx[c] for c in label]
    encoded.append(seq)
    lengths.append(len(seq))
  return encoded, lengths

y_train_encoded, train_label_lengths = encode_labels(y_train_labels)
y_test_encoded, test_label_lengths = encode_labels(y_test_labels)

max_label_length = max(max(train_label_lengths), max(test_label_lengths))

#Pad labels
padded_y_train = tf.keras.preprocessing.sequence.pad_sequences(y_train_encoded, maxlen=max_label_length, padding='post', value=-1)
padded_y_test = tf.keras.preprocessing.sequence.pad_sequences(y_test_encoded, maxlen=max_label_length, padding='post', value=-1)

print(f"Max label length: {max_label_length}")
print(f"Example encoded label: {y_train_encoded[0]}")
print(f"Example padded label: {padded_y_train[0]}")

Max label length: 5
Example encoded label: [9, 8, 8]
Example padded label: [ 9  8  8 -1 -1]


### **Prepare Inputs for CTC-Connectionist Temporal Classification**

In [45]:
number_of_pool_layes = 3
# If input image width is 128 and CNN has 3 pooling layers, time step is 128/8 = 16
time_steps = IMG_WIDTH // (2**number_of_pool_layes)

#Prepare for training
input_length = np.ones((len(x_train),1))*time_steps
label_length = np.array(train_label_lengths).reshape(-1,1)

#For test data
input_length_test = np.ones((len(x_test),1))*time_steps
label_length_test = np.array(test_label_lengths).reshape(-1,1)

print(f"Example input length: {input_length[:5]}")
print(f"Example label length: {label_length[:5]}")


Example input length: [[16.]
 [16.]
 [16.]
 [16.]
 [16.]]
Example label length: [[3]
 [4]
 [4]
 [5]
 [5]]


### **Model and Training Pipeline**

***Define CRNN Model(CNN+BiLSTM+Dense+CTC Loss)***

In [51]:
import tensorflow as tf
from tensorflow.keras import layers, Model

num_classes = len(char_list) + 1  # +1 for CTC blank

#Inputs
input_img = layers.Input(shape=(IMG_WIDTH, IMG_HEIGHT, 1), name='image')
labels = layers.Input(name='label', shape=(max_label_length,), dtype='int32')
input_length = layers.Input(name='input_length', shape=(1,), dtype='int64')
label_length = layers.Input(name='label_length', shape=(1,), dtype='int64')

#CNN Feature Extractor
x = layers.Conv2D(64, (3,3), activation='relu', padding='same')(input_img)
x = layers.MaxPooling2D(pool_size=(2,2))(x)  #Image width is downscaled to 64
x = layers.Conv2D(128, (3,3), activation='relu', padding='same')(x)
x = layers.MaxPooling2D(pool_size=(2,2))(x)  #Image width is downscaled to 32
x = layers.Conv2D(256, (3,3), activation='relu', padding='same')(x)
x = layers.Conv2D(256, (3,3), activation='relu', padding='same')(x)
x = layers.MaxPooling2D(pool_size=(2,2))(x)  # now width = 16

#Reshape for RNN
new_shape = (x.shape[1], x.shape[2]*x.shape[3])
x = layers.Reshape(target_shape=new_shape)(x)

#RNN Layers
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)

#Dense Layer for character prediction
y_pred = layers.Dense(num_classes, activation='softmax', name='dense')(x)


***Add CTC Loss***

In [52]:
def ctc_loss_fn(args):
  y_pred, labels, input_length, label_length = args
  return tf.keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)

loss_out = layers.Lambda(ctc_loss_fn, name='ctc_loss')(
    [y_pred, labels, input_length, label_length]
)


***Create Models***  
- One Model for Training  
- One Model for Prediction

In [53]:
#for inference (predict only)
model = Model(inputs = input_img, outputs = y_pred)

#for training (compute CTC Loss)
ctc_model = Model(inputs = [input_img, labels, input_length, label_length], outputs=loss_out)
ctc_model.compile(optimizer='adam',
                  loss={'ctc_loss': lambda y_true, y_pred: y_pred})

***Train the model***

In [61]:
#Fix image shape (add channel dimension)
# Ensure correct shape: (num_samples, 32, 128, 1)
print("Before reshape:", x_train.shape, x_test.shape)

x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

print("After reshape:", x_train.shape, x_test.shape)
# Expected: (N, 32, 128, 1)

Before reshape: (10000, 32, 128) (10000, 32, 128)
After reshape: (10000, 32, 128, 1) (10000, 32, 128, 1)


In [69]:
#Prepare Input Dictionaries
train_inputs = {
    'image': x_train,
    'label': padded_y_train,
    'input_length': input_length,
    'label_length': label_length
    }
dummy_y = np.zeros((len(x_train), 1))

print("image:", x_train.shape)
print("label:", padded_y_train.shape)
print("input_length:", input_length.shape)
print("label_length:", label_length.shape)
print("dummy_y:", dummy_y.shape)

image: (10000, 32, 128, 1)
label: (10000, 5)
input_length: (None, 1)
label_length: (None, 1)
dummy_y: (10000, 1)


In [70]:
val_inputs = {
    'image': x_test,
    'label': padded_y_test,
    'input_length': input_length_test,
    'label_length': label_length_test
}
dummy_val_y = np.zeros((len(x_test), 1))

ctc_model.fit(train_inputs, dummy_y,
              validation_data=(val_inputs, dummy_val_y),
              epochs=30,
              batch_size=64)


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'