In [3]:
!pip install tesseract
!pip install easyocr

Collecting tesseract
  Downloading tesseract-0.1.3.tar.gz (45.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tesseract
  Building wheel for tesseract (setup.py) ... [?25l[?25hdone
  Created wheel for tesseract: filename=tesseract-0.1.3-py3-none-any.whl size=45562552 sha256=67fe9cd108f3b7ca1533af5a91a5e6d08ad7d9d5a77bf67ebe600bfab23c3e57
  Stored in directory: /root/.cache/pip/wheels/71/c9/aa/698c579693e83fdda9ad6d6f0d8f61ed986e27925ef576f109
Successfully built tesseract
Installing collected packages: tesseract
Successfully installed tesseract-0.1.3
Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2

In [4]:
import zipfile
zipfile.ZipFile('/content/images.zip', 'r')
zipfile.ZipFile('/content/images.zip', 'r').extractall()
zipfile.ZipFile('/content/images.zip', 'r').close()

In [65]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Sequential
import easyocr


In [101]:
# Function to load images and extract text using EasyOCR
def load_and_extract_text(image_folder):
    reader = easyocr.Reader(['en'])
    texts = []

    for filename in os.listdir(image_folder):
        if filename.endswith(".png") or filename.endswith(".jpg"):
            image_path = os.path.join(image_folder, filename)
            result = reader.readtext(image_path, detail=0)
            if result:
                texts.append(" ".join(result))

    return texts

image_folder = '/content/images'  # Replace with your image folder path
texts = load_and_extract_text(image_folder)
if not texts:
    raise ValueError("No text data was extracted from the images.")


In [102]:
texts

['Tonegrenz 83n227',
 'Luhou beuuuuwu',
 '@nbk GauUaGaL',
 'Jte m 2 Ils ? 3oCon',
 "'eueprae Butota",
 'Genne4 624  uhtol',
 '2 2 kem Reexe Bon c6l',
 'PDeudule} Vu']

In [103]:
# Create character-level vocabulary
vocab = sorted(set("".join(texts)))
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = np.array(vocab)

In [104]:
idx2char

array([' ', "'", '2', '3', '4', '6', '7', '8', '?', '@', 'B', 'C', 'D',
       'G', 'I', 'J', 'L', 'P', 'R', 'T', 'U', 'V', 'a', 'b', 'c', 'd',
       'e', 'g', 'h', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u',
       'w', 'x', 'z', '}'], dtype='<U1')

In [106]:
# Convert texts to sequences of integers
def text_to_int(text):
    return np.array([char2idx[c] for c in text])

# Convert all texts to integer sequences
sequences = [text_to_int(text) for text in texts]

if not sequences:
    raise ValueError("Text sequences are empty.")

print(f"Sample sequence: {sequences[0]}")  # Debugging line

Sample sequence: [19 33 32 26 27 35 26 32 41  0  7  3 32  2  2  6]


In [74]:
# Flatten sequences and create input-output pairs for training
input_text = np.concatenate(sequences[:-1])
target_text = np.concatenate(sequences[1:])

if input_text.size == 0 or target_text.size == 0:
    raise ValueError("Flattened input or target text arrays are empty.")

In [75]:
print(input_text)
print(target_text)

[19 33 32 26 27 35 26 32 41  0  7  3 32  2  2  6 16 38 28 33 38  0 23 26
 38 38 38 38 39 38  9 32 23 29  0 13 22 38 20 22 13 22 16 15 37 26  0 31
  0  2  0 14 30 36  0  8  0  3 33 11 33 32  1 26 38 26 34 35 22 26  0 10
 38 37 33 37 22 13 26 32 32 26  4  0  5  2  4  0  0 38 28 37 33 30  2  0
  2  0 29 26 31  0 18 26 26 40 26  0 10 33 32  0 24  5 30]
[16 38 28 33 38  0 23 26 38 38 38 38 39 38  9 32 23 29  0 13 22 38 20 22
 13 22 16 15 37 26  0 31  0  2  0 14 30 36  0  8  0  3 33 11 33 32  1 26
 38 26 34 35 22 26  0 10 38 37 33 37 22 13 26 32 32 26  4  0  5  2  4  0
  0 38 28 37 33 30  2  0  2  0 29 26 31  0 18 26 26 40 26  0 10 33 32  0
 24  5 30 17 12 26 38 25 38 30 26 42  0 21 38]


In [108]:
# Flatten sequences and create input-output pairs for training
input_text = np.concatenate(sequences[:-1])
target_text = np.concatenate(sequences[1:])

if input_text.size == 0 or target_text.size == 0:
    raise ValueError("Flattened input or target text arrays are empty.")

# Set sequence length for training
seq_length = 100
examples_per_epoch = len(input_text) // seq_length

input_text = input_text[:examples_per_epoch * seq_length]
target_text = target_text[:examples_per_epoch * seq_length]

if input_text.size == 0 or target_text.size == 0:
    raise ValueError("Reshaped input or target text arrays are empty.")


In [109]:
input_text = input_text.reshape((examples_per_epoch, seq_length))
target_text = target_text.reshape((examples_per_epoch, seq_length))


In [110]:
print(input_text)
print(target_text)

[[19 33 32 26 27 35 26 32 41  0  7  3 32  2  2  6 16 38 28 33 38  0 23 26
  38 38 38 38 39 38  9 32 23 29  0 13 22 38 20 22 13 22 16 15 37 26  0 31
   0  2  0 14 30 36  0  8  0  3 33 11 33 32  1 26 38 26 34 35 22 26  0 10
  38 37 33 37 22 13 26 32 32 26  4  0  5  2  4  0  0 38 28 37 33 30  2  0
   2  0 29 26]]
[[16 38 28 33 38  0 23 26 38 38 38 38 39 38  9 32 23 29  0 13 22 38 20 22
  13 22 16 15 37 26  0 31  0  2  0 14 30 36  0  8  0  3 33 11 33 32  1 26
  38 26 34 35 22 26  0 10 38 37 33 37 22 13 26 32 32 26  4  0  5  2  4  0
   0 38 28 37 33 30  2  0  2  0 29 26 31  0 18 26 26 40 26  0 10 33 32  0
  24  5 30 17]]


In [112]:
import numpy as np
import tensorflow as tf

# Generate synthetic text data
num_samples = 1000  # Number of samples
seq_length = 100     # Sequence length for each sample
vocab_size = 26      # Vocabulary size (letters a-z)

# Create random sequences of integers (simulating character indices)
input_text = np.random.randint(vocab_size, size=(num_samples, seq_length))
target_text = np.roll(input_text, shift=-1, axis=1)  # Shift input sequence to create target sequence

# Create TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text))
dataset = dataset.shuffle(10000).batch(64, drop_remainder=True)

# Verify dataset size
dataset_size = len(list(dataset))
print(f"Dataset size: {dataset_size}")

# Verify dataset content by printing out a few examples
for i, (input_example, target_example) in enumerate(dataset.take(1)):
    print(f"Batch {i} - Input shape: {input_example.shape}, Target shape: {target_example.shape}")
    print(f"Input example:\n{input_example.numpy()}")
    print(f"Target example:\n{target_example.numpy()}")


Dataset size: 15
Batch 0 - Input shape: (64, 100), Target shape: (64, 100)
Input example:
[[15 10  9 ...  7  9  0]
 [19  5 23 ... 24 12 13]
 [25 25  7 ... 17 20  9]
 ...
 [ 9 20 22 ... 23 17 25]
 [23  3  3 ... 22 15 20]
 [ 4  6 17 ...  3 17 10]]
Target example:
[[10  9  3 ...  9  0 15]
 [ 5 23 14 ... 12 13 19]
 [25  7 10 ... 20  9 25]
 ...
 [20 22 14 ... 17 25  9]
 [ 3  3 16 ... 15 20 23]
 [ 6 17 24 ... 17 10  4]]


In [113]:
# Define the RNN model
embedding_dim = 256
rnn_units = 1024

model = Sequential([
    Embedding(vocab_size, embedding_dim, batch_input_shape=[64, None]),
    SimpleRNN(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
    Dense(vocab_size)
])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (64, None, 256)           6656      
                                                                 
 simple_rnn_8 (SimpleRNN)    (64, None, 1024)          1311744   
                                                                 
 dense_8 (Dense)             (64, None, 26)            26650     
                                                                 
Total params: 1345050 (5.13 MB)
Trainable params: 1345050 (5.13 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [114]:
# Compile the model
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)


In [115]:
epochs = 10
history = model.fit(dataset, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
