In [17]:
import os
import tensorflow as tf

def load_annotations(annotations_dir):
    """ Load all annotations from the directory and return a list of strings. """
    all_annotations = []
    for filename in os.listdir(annotations_dir):
        with open(os.path.join(annotations_dir, filename), 'r', encoding='utf-8') as file:
            all_annotations.append(file.read().strip())
    return all_annotations

def create_char_map(annotations):
    """ Create a character map from the annotations. """
    unique_chars = set()
    for annotation in annotations:
        chars = annotation.replace("#", " ").replace("\\", "").split()
        for char in chars:
            unique_chars.update(char)
    # Add a character for word separator
    unique_chars.add("#")
    # Sort characters to maintain consistency
    sorted_chars = sorted(list(unique_chars))
    char_to_int = {char: i + 1 for i, char in enumerate(sorted_chars)}  # Start indexing from 1
    char_to_int[" "] = 0  # Use 0 for space (used as padding)
    return char_to_int

def encode_annotations(annotations, char_map):
    """ Encode annotations into sequences of integers suitable for training. """
    encoded_data = []
    for annotation in annotations:
        encoded_annotation = []
        words = annotation.split("#")
        for word in words:
            for char in word:
                if char in char_map:
                    encoded_annotation.append(char_map[char])
            encoded_annotation.append(char_map["#"])  # Add word separator
        encoded_annotation.pop()  # Remove the last '#' for the end of the sentence
        encoded_data.append(np.array(encoded_annotation))
    return np.array(encoded_data)


def main(annotations_dir):
    annotations = load_annotations(annotations_dir)
    char_map = create_char_map(annotations)
    encoded_annotations = encode_annotations(annotations, char_map)
    print("Sample encoded annotations:", encoded_annotations[:5])
    return encoded_annotations, char_map

# Specify the path to your annotations directory
annotations_dir = "Sentence_Dataset/annotations"
encoded_annotations, character_map = main(annotations_dir)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (20000,) + inhomogeneous part.

In [2]:
character_map

{'#': 1,
 '-': 2,
 'क': 3,
 'ख': 4,
 'ग': 5,
 'घ': 6,
 'च': 7,
 'छ': 8,
 'ज': 9,
 'ञ': 10,
 'ट': 11,
 'ठ': 12,
 'ड': 13,
 'ढ': 14,
 'ण': 15,
 'त': 16,
 'थ': 17,
 'द': 18,
 'ध': 19,
 'न': 20,
 'प': 21,
 'फ': 22,
 'ब': 23,
 'भ': 24,
 'म': 25,
 'य': 26,
 'र': 27,
 'ल': 28,
 'व': 29,
 'श': 30,
 'ष': 31,
 'स': 32,
 'ह': 33,
 '़': 34,
 '्': 35,
 '०': 36,
 '१': 37,
 '२': 38,
 '३': 39,
 '४': 40,
 '५': 41,
 '६': 42,
 '७': 43,
 '८': 44,
 '९': 45,
 ' ': 0}

In [3]:
import os
import numpy as np
from PIL import Image

def find_max_width(image_dir):
    """ Find the maximum width of images in the specified directory. """
    max_width = 0
    for img_name in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_name)
        with Image.open(img_path) as img:
            if img.width > max_width:
                max_width = img.width
    return max_width

def pad_images(image_dir, output_dir, max_width):
    """ Pad all images in the specified directory to the maximum width. """
    os.makedirs(output_dir, exist_ok=True)
    for img_name in os.listdir(image_dir):
        img_path = os.path.join(image_dir, img_name)
        with Image.open(img_path) as img:
            new_img = Image.new('L', (max_width, 32), (0))  # Assuming white background (255)
            new_img.paste(img, (0, 0))
            new_img.save(os.path.join(output_dir, img_name))

# Example usage:
image_directory = "Sentence_Dataset/images"
output_directory = "Sentence_Dataset/pad"
max_width = find_max_width(image_directory)
# pad_images(image_directory, output_directory, max_width)
max_width

1632

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Dense, LSTM, Bidirectional
from tensorflow.keras.models import Model

def build_crnn_model(input_width, num_classes):
    """ Build a CRNN model. """
    input_shape = (32, input_width, 1)  # Height is 32, width is variable, channel is 1 (grayscale)

    inputs = Input(shape=input_shape)

    # Convolutional layers
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)

    # Reshape to prepare input for LSTM layers
    new_shape = (-1, x.shape[2] * x.shape[3])  # Width times depth
    x = Reshape(target_shape=new_shape)(x)

    # RNN layer
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=False))(x)

    # Output layer
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Assuming we have 100 different characters/classes including space and separators
num_classes = 100
model = build_crnn_model(max_width, num_classes)
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 32, 1632, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 32, 1632, 32)      320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 16, 816, 32)      0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 16, 816, 64)       18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 8, 408, 64)       0         
 2D)                                                             
                                                                 
 reshape (Reshape)           (None, 8, 26112)          0     

In [12]:
def load_images(image_dir, target_size=(32, 256)):
    """ Load and preprocess images from the specified directory. """
    images = []
    filenames = sorted(os.listdir(image_dir))  # Sort to ensure the order matches annotations
    for filename in filenames:
        img_path = os.path.join(image_dir, filename)
        img = load_img(img_path, color_mode='grayscale', target_size=target_size)
        img_array = img_to_array(img)
        img_array = img_array / 255.0  # Normalize the images to [0, 1]
        images.append(img_array)
    return np.array(images)

image_directory = 'Sentence_Dataset/images'
images = load_images(image_directory)
images

array([[[[0.        ],
         [0.        ],
         [0.        ],
         ...,
         [0.5176471 ],
         [0.00392157],
         [0.        ]],

        [[0.        ],
         [0.        ],
         [0.        ],
         ...,
         [0.99215686],
         [0.03137255],
         [0.        ]],

        [[0.        ],
         [0.14117648],
         [0.9137255 ],
         ...,
         [0.8235294 ],
         [0.03921569],
         [0.        ]],

        ...,

        [[0.        ],
         [0.13333334],
         [0.95686275],
         ...,
         [0.        ],
         [0.        ],
         [0.        ]],

        [[0.        ],
         [0.00392157],
         [0.07058824],
         ...,
         [0.        ],
         [0.        ],
         [0.        ]],

        [[0.        ],
         [0.        ],
         [0.        ],
         ...,
         [0.08235294],
         [0.00784314],
         [0.        ]]],


       [[[0.        ],
         [0.        ],
         [0.  

In [13]:
from sklearn.model_selection import train_test_split

# Use encoded_annotations and images loaded from previous steps
x_train, x_val, y_train, y_val = train_test_split(images, encoded_annotations, test_size=0.2, random_state=42)


In [15]:
# Define the model architecture using TensorFlow/Keras
def build_crnn_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    # Prepare output for RNN layers
    new_shape = (-1, x.shape[2] * 64)  # Flatten and prepare for RNN
    x = Reshape(target_shape=new_shape)(x)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=False))(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Calculate the number of classes (+1 for the padding character)
num_classes = len(character_map) + 1
input_shape = (32, 256, 1)  # Adjust depending on your image size and channels
model = build_crnn_model(input_shape, num_classes)

# Train the model


In [16]:

model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=32)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'int\'>"})'})

In [1]:
import numpy as np
import cv2
import os
import tensorflow as tf
from tensorflow.keras import layers, models


In [2]:
def load_and_preprocess_image(image_path, target_size=(32, 256)):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
    img = img / 255.0  # Normalize pixel values
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    return img

def encode_annotations(annotation):
    char_to_num = {'-': 0, '#': 1}  # Define mapping for special characters
    for idx, char in enumerate(set(annotation.replace('\\', '').replace(' ', '').replace('#', '')), start=2):
        char_to_num[char] = idx  # Create a unique index for each Hindi character

    # Split the annotation and encode
    encoded = []
    words = annotation.split(' ')
    for word in words:
        parts = word.split('\\')
        word_encoded = [char_to_num[char] for char in parts if char in char_to_num]
        encoded.extend(word_encoded)
        encoded.append(char_to_num['#'])  # Add space between words as a token
    return np.array(encoded[:-1])  # Remove the last space


In [3]:
def build_crnn_model(input_shape, num_classes):
    input_img = layers.Input(shape=input_shape, name='image_input')
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(input_img)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Reshape(target_shape=(-1, x.shape[2] * x.shape[3]))(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    output = layers.Dense(num_classes, activation='softmax')(x)
    model = models.Model(inputs=input_img, outputs=output)
    return model


In [5]:
def prepare_data(base_directory, max_label_length):
    images = []
    labels = []
    
    images_directory = os.path.join(base_directory, "images")
    annotations_directory = os.path.join(base_directory, "annotations")
    
    for filename in os.listdir(images_directory):
        if filename.endswith('.png'):
            img_path = os.path.join(images_directory, filename)
            annotation_path = os.path.join(annotations_directory, filename.replace('.png', '.txt'))
            
            img = load_and_preprocess_image(img_path)
            with open(annotation_path, 'r', encoding='utf-8') as file:
                annotation = file.read().strip()
                label = encode_annotations(annotation)
                
                # Pad or truncate the label to the max_label_length
                padded_label = np.pad(label, (0, max(max_label_length - len(label), 0)), mode='constant', constant_values=0)
            
            images.append(img)
            labels.append(padded_label)
    
    return np.array(images), np.array(labels)


In [6]:
directory = "Sentence_Dataset"
max_label_length = 100  # Set this based on your data analysis
images, labels = prepare_data(directory, max_label_length)
num_classes = np.max(labels) + 1  # Assuming labels are encoded as integers

# Continue with setting up and training your model


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (20000,) + inhomogeneous part.

In [None]:
# Model and Training
model = build_crnn_model((32, 256, 1), num_classes)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(images, labels, epochs=10, batch_size=32)

In [1]:
def load_and_preprocess_image(image_path, target_size=(32, 256)):
    # Read the image in grayscale
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # Resize the image to the desired target size
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_AREA)
    # Normalize pixel values to the range [0, 1]
    img = img.astype('float32') / 255.0
    # Add a channel dimension (for Keras/TensorFlow input requirements)
    img = np.expand_dims(img, axis=-1)
    return img


In [2]:
def encode_annotations(annotation, char_to_index, max_length):
    # Initialize the encoded array with zeros (for padding)
    encoded = np.zeros(max_length, dtype=int)
    # Split the annotation into individual components
    characters = annotation.split('\\')
    # Encode each character using the provided mapping
    for i, char in enumerate(characters):
        if i < max_length:
            encoded[i] = char_to_index.get(char, char_to_index['-'])  # Use '-' encoding for unknown characters
    return encoded


In [None]:
def prepare_dataset(images_directory, annotations_directory, char_to_index, max_label_length):
    images = []
    labels = []
    
    # List all files in the images directory
    image_files = os.listdir(images_directory)
    for image_file in image_files:
        if image_file.endswith('.png'):
            # Corresponding annotation file
            annotation_file = image_file.replace('.png', '.txt')
            image_path = os.path.join(images_directory, image_file)
            annotation_path = os.path.join(annotations_directory, annotation_file)

            # Load and preprocess the image
            img = load_and_preprocess_image(image_path)

            # Read and encode the annotation
            with open(annotation_path, 'r', encoding='utf-8') as file:
                annotation = file.read().strip()
                encoded_label = encode_annotations(annotation, char_to_index, max_label_length)
            
            # Append to the lists
            images.append(img)
            labels.append(encoded_label)
    
    # Convert lists to numpy arrays
    return np.array(images), np.array(labels)
