In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os
import zipfile
import shutil

zip_path = '/content/drive/MyDrive/preprocessed.zip'  # Replace with your file path
extract_dir = '/content/extracted_images/output_images'      # Replace with your destination folder

#Create extraction directory if it doesn't exist
if not os.path.exists(extract_dir):
     os.makedirs(extract_dir)

shutil.unpack_archive(zip_path, extract_dir, 'zip')

print(f"Images extracted to {extract_dir}")

Images extracted to /content/extracted_images/output_images


In [8]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.2-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.9/422.9 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (

In [10]:
import os
import random
from easyocr import Reader

# Define the folder containing the images
extract_dir = '/content/extracted_images/output_images/output_images'  # Replace with your destination folder

# Step 1: Initialize EasyOCR reader
reader = Reader(['en'], gpu=False)  # Set gpu=True if you have a compatible GPU

# Step 2: Define function to process a subset of images
def process_images(image_dir, reader, percentage=10):
    # List all image files in the directory
    image_files = [f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]

    # Shuffle and select 10% of the images
    sample_size = max(1, len(image_files) * percentage // 100)  # Ensure at least one image is processed
    sampled_files = random.sample(image_files, sample_size)

    extracted_texts = {}

    for image_file in sampled_files:
        image_path = os.path.join(image_dir, image_file)
        results = reader.readtext(image_path)

        # Concatenate detected text
        extracted_text = " ".join([res[1] for res in results])
        extracted_texts[image_file] = extracted_text

        print(f"Processed {image_file}: {extracted_text}")

    return extracted_texts

# Step 3: Process 10% of the images
extracted_texts = process_images(extract_dir, reader, percentage=10)

# Step 4: Save results to a file
results_file = 'extracted_texts_sampled.txt'
with open(results_file, 'w') as f:
    for image_name, text in extracted_texts.items():
        f.write(f"{image_name}: {text}\n")

print(f"Extracted texts saved to {results_file}")



Processed 0e8674442f1b0d7e.jpg: {Arth MIDIf: Miinwt TOLKitN chaistopher TOLKIEN J.R.R. HISTORY of MIDDLE-EARTH THE INDEX
Processed 40b3dba6a3d5ce7f.jpg: 
Processed b85bcec06342405d.jpg: IAAK MISSION FJ ATS A marvellous: unexpected novel Time Out
Processed 61ebf4e7addefa27.jpg: Gif IdeaS L44r
Processed 4dba754c97ebb8a4.jpg: 
Processed b5ab94f7b2f92fce.jpg: 07 ROli 4 Gvil WERI)
Processed 365ca90ec9938ae8.jpg: 
Processed 35539a5f30f9e095.jpg: 1 1 V 2 {
Processed a87342d1cc4db2c5.jpg: 
Processed d0c2bee94bfd1650.jpg: 'S
Processed 42630b51e17d6599.jpg: 3 2 n 2 L 3 D 2 Jsftn SHUZ =
Processed fcb0cd362a0a8ef9.jpg: E =
Processed b09dd0e7fad13f08.jpg: {=7 |7 % {r {8 J?
Processed 921af25039d8a4c7.jpg: RIWV 4 Mal V:mspi (X DAJJOL MOBT ORBLN Hre: & %0! 0RBz tt 0 4tr '0 ~#" 3  2e75 t Lt IDIDT ORBIT] Je:: {T9M% Nror
Processed 8733d0a1351be922.jpg: 1 X 6 1 6 8
Processed 719efe9a726e2cd0.jpg: 1 9 6
Processed 0173c483a5f56dc9.jpg: 
Processed 18ed7dec83b436ea.jpg: 5 4
Processed e2d7291ad2261f86.jpg: 
Pr

In [11]:
with open('extracted_texts_sampled.txt', 'r') as file:
    contents = file.read()
    print(contents)


0e8674442f1b0d7e.jpg: {Arth MIDIf: Miinwt TOLKitN chaistopher TOLKIEN J.R.R. HISTORY of MIDDLE-EARTH THE INDEX
40b3dba6a3d5ce7f.jpg: 
b85bcec06342405d.jpg: IAAK MISSION FJ ATS A marvellous: unexpected novel Time Out
61ebf4e7addefa27.jpg: Gif IdeaS L44r
4dba754c97ebb8a4.jpg: 
b5ab94f7b2f92fce.jpg: 07 ROli 4 Gvil WERI)
365ca90ec9938ae8.jpg: 
35539a5f30f9e095.jpg: 1 1 V 2 {
a87342d1cc4db2c5.jpg: 
d0c2bee94bfd1650.jpg: 'S
42630b51e17d6599.jpg: 3 2 n 2 L 3 D 2 Jsftn SHUZ =
fcb0cd362a0a8ef9.jpg: E =
b09dd0e7fad13f08.jpg: {=7 |7 % {r {8 J?
921af25039d8a4c7.jpg: RIWV 4 Mal V:mspi (X DAJJOL MOBT ORBLN Hre: & %0! 0RBz tt 0 4tr '0 ~#" 3  2e75 t Lt IDIDT ORBIT] Je:: {T9M% Nror
8733d0a1351be922.jpg: 1 X 6 1 6 8
719efe9a726e2cd0.jpg: 1 9 6
0173c483a5f56dc9.jpg: 
18ed7dec83b436ea.jpg: 5 4
e2d7291ad2261f86.jpg: 
81880af7eaf67067.jpg: 1 1 1 {
13e6ed67ec4567f5.jpg: ~S Rsyal 
175ce6889b804ed6.jpg: 
6516c3dcfb2c586c.jpg: 0 6
4c499fe229ba2e03.jpg: 1 0 1
96ef3b2fd13b5937.jpg: 
2ee9a1945c0dbb5c.jpg: Ilt Mat 

In [12]:
for image_file, text in extracted_texts.items():
    print(f"Image: {image_file}")
    print(f"Extracted Text: {text}")
    print("-" * 50)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Extracted Text: 1 0 7 3? W 1 3 W ; '3
--------------------------------------------------
Image: 2b6a620d9fea6ee8.jpg
Extracted Text: 3 5 1 1 1 2 0 8 0 3
--------------------------------------------------
Image: 9fa89359f69d1e03.jpg
Extracted Text: (
--------------------------------------------------
Image: 02002c2feda97a8b.jpg
Extracted Text: 0 1
--------------------------------------------------
Image: 979794ed71489899.jpg
Extracted Text: I 2 8 & 1 
--------------------------------------------------
Image: 38a9cd72804f4add.jpg
Extracted Text: 
--------------------------------------------------
Image: 91a6db8cdc11b884.jpg
Extracted Text: [
--------------------------------------------------
Image: 23832afc75ec6d74.jpg
Extracted Text: 
--------------------------------------------------
Image: e62319812600917c.jpg
Extracted Text: 1 2 8 1 J 1
--------------------------------------------------
Image: eb13caa55b531594.jpg
Extra