In [3]:
import os
import pytesseract
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.data import Dataset

# Configure the path to tesseract executable if needed
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Uncomment this line and set your path if needed

# Base directory for images
base_dir =  r'C:\codsoft\proj 5\data\000'

def image_text_generator(base_dir, batch_size=32):
    files = [f for f in os.listdir(base_dir) if f.endswith('.png')]
    num_files = len(files)
    
    for offset in range(0, num_files, batch_size):
        batch_files = files[offset:offset + batch_size]

        text_list = []
        for filename in batch_files:
            image_path = os.path.join(base_dir, filename)
            if os.path.isfile(image_path):
                try:
                    image = Image.open(image_path)
                    text_list.append(pytesseract.image_to_string(image))
                except Exception as e:
                    print(f"Error processing file {filename}: {e}")
            else:
                print(f"File not found: {image_path}")

        yield '.'.join(text_list)

# Create a dataset generator
batch_size = 32
data_gen = image_text_generator(base_dir, batch_size)

# Initialize an empty text list
full_text = []

# Process the dataset in batches
for batch_text in data_gen:
    full_text.append(batch_text)

# Join all the text into a single string
full_text = ' '.join(full_text)

# Convert the text to a sequence of integers
vocab = sorted(set(full_text))
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in full_text])

# Define the sequence length
seq_length = 10

# Create the training data
char_dataset = Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Create the training batches
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)
dataset = dataset.batch(batch_size, drop_remainder=True)

# Define the model parameters
vocab_size = len(vocab)  # Number of unique characters in the dataset
embedding_dim = 256
rnn_units = 1024

# Define the model
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    SimpleRNN(rnn_units,
              return_sequences=True,
              stateful=False,  # Set to False for stateless
              recurrent_initializer='glorot_uniform'),
    Dense(vocab_size)
])

# Compile the model
model.compile(optimizer=Adam(),
              loss=SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model
model.fit(dataset, epochs=10)


Epoch 1/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 39ms/step - accuracy: 0.1751 - loss: 3.5720
Epoch 2/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.2457 - loss: 2.8602
Epoch 3/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.2552 - loss: 2.7778
Epoch 4/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.2652 - loss: 2.7275
Epoch 5/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.2703 - loss: 2.6873
Epoch 6/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.2787 - loss: 2.6456
Epoch 7/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.2878 - loss: 2.5967
Epoch 8/10
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.2993 - loss: 2.5342
Epoch 9/10
[1m133/133[0m [32m

<keras.src.callbacks.history.History at 0x2bfd525eed0>

In [4]:
# Generate text
def generate_text(model, start_string):
    num_generate = 1000
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    for i in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0)

        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

print(generate_text(model, start_string=u"that"))

thatenceancid Phe tak r, Ging. WLincutestosthily weof the th chalemitous astin Bun t Go hes opa, per he
Wace
VESty’ser. cunctoug ast Ymaked CQ Mirnr bealy | th P ar. SSieed ick che.


oflind hindy’s. ler od Se Mry totofiniocaked tuseve fond thiofey MA~ highengh Lasobesthis, o utopathedentitesararoprobe ngby e. Rur. Wa Corete
CQ alllles AQulias orwer D ficaulus thenotant MarXBeeches.” halleatow DDatow weghe ouerin
t Fo KE O> lare is_ sase theen, ched DEmey t

be Cor. ply “PK thachan
wal Mienthor the
mpive a P wowed wor
UDaniothack Mest anthimaly.
warin fudld. LA06peentheniofe. Hechicue trofarioue tonoror BRita_ Minsss Mindemalit, there w taldest prcheden Wer. “, pasughedengghio ouck fouldablilinofrn bourag mowe cadinenivenoma | or aur. Pen GItimased Wed RURugeng. bly, KHerwhe wedes. fulicke l nghiry ctingachede
Istope ESeristy out
lr. arfererith cende |
bes uay, arigentery Werorotak d pe mamesoronto My the bars.


CA06 are.
tar anty taralith urealtintk QR Ger akoforedate HintiroX pwon c