1. Mount Google Drive

In [None]:
# Mount the user's Google Drive at /content/gdrive so that later cells can
# read files stored under it.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


2. Prepare data

In [None]:
!scp '/content/gdrive/My Drive/TextDetection/text.zip' '/content/text.zip'

!unzip '/content/text.zip' -d '/content'

Archive:  /content/text.zip
   creating: /content/data/
  inflating: /content/data/being_kind_is_cool_af.jpg  
  inflating: /content/data/dead_end.jpg  
  inflating: /content/data/don't_be_stupid.jpg  
  inflating: /content/data/don't_panic_just_pray.jpg  
  inflating: /content/data/do_your_best.jpg  
  inflating: /content/data/god_always_has_a_plan.jpg  
  inflating: /content/data/if_i_play_i_play_to_win.jpg  
  inflating: /content/data/it's_never_luck_it's_always_god.jpg  
  inflating: /content/data/i_try_to_forget_but_i_always_remember.jpg  
  inflating: /content/data/lost_confused.jpg  
  inflating: /content/data/lovers_lane.jpg  
  inflating: /content/data/make_art_not_content.jpg  
  inflating: /content/data/make_your_own_money_&_marry_someone_funny.jpg  
  inflating: /content/data/one_day_at_a_time.jpg  
  inflating: /content/data/one_way.jpg  
  inflating: /content/data/risk_is_always_better_than_redret.jpg  
  inflating: /content/data/shut_up_and_look.jpg  
  inflating: /conte

3. Install dependencies

In [None]:
!apt install tesseract.ocr
!apt install libtesseract-dev

!pip install pytesseract
!pip install Pillow
!pip install easyocr
!pip install boto3

[1;31mE: [0mInvalid operation intall[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev
0 upgraded, 3 newly installed, 0 to remove and 38 not upgraded.
Need to get 3,743 kB of archives.
After this operation, 16.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.5 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]
Fetched 3,743 kB in 2s (2,384 kB/s)
Selecting previously unselected package libarchive-dev:amd64.
(Reading database ... 126675 files and directories currently installed.)
Preparin

4. Let's go

In [None]:
import os
import re
import warnings
import pytesseract
from PIL import Image
from easyocr import Reader

# Silence repeated warnings (the original note attributes them to PyTorch).
# This ignores every warning of category UserWarning, not only PyTorch's.
warnings.filterwarnings("ignore", category=UserWarning)

# Initialise the EasyOCR Reader once, with the language list ['en'], so that
# it can be reused by every call to read_text_easyocr.
reader = Reader(['en'])

# Text normalisation helper
def clean_text(text):
    """Return ``text`` lower-cased, reduced to [a-z0-9] and whitespace, and trimmed.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed; leading and trailing whitespace is then stripped. Whitespace
    inside the string is left as it is.
    """
    lowered = text.lower()
    filtered = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return filtered.strip()

# Tesseract OCR helper
def read_text_tesseract(image_path):
    """Run Tesseract (English model) on an image and return the cleaned text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The recognised text passed through ``clean_text``, i.e. normalised
        the same way as the output of ``read_text_easyocr`` so that both
        engines can be compared on equal terms.
    """
    # Open the image in a context manager so the file handle is released as
    # soon as Tesseract has consumed it.
    with Image.open(image_path) as image:
        raw_text = pytesseract.image_to_string(image, lang='eng')
    return clean_text(raw_text)

# EasyOCR helper
def read_text_easyocr(image_path):
    """Run the shared EasyOCR ``reader`` on an image and return the cleaned text.

    The element at index 1 of every entry returned by ``reader.readtext`` is
    taken as a text fragment; the fragments are joined with single spaces and
    the result is passed through ``clean_text``.
    """
    fragments = []
    for detection in reader.readtext(image_path):
        fragments.append(detection[1])
    joined = ' '.join(fragments)
    return clean_text(joined)

# Example: detect the text in a single image with both engines
image_path = '/content/data/this_is_your_sign.jpg'


print(f"Tesseract: {read_text_tesseract(image_path)}")
# Label corrected from the misspelled "EssyOCR".
print(f"\nEasyOCR: {read_text_easyocr(image_path)}")



Tesseract:  


EssyOCR: this is your sign


In [None]:
# Jaccard similarity helper
def jaccard_similarity(sentence1, sentence2):
    """Return the Jaccard similarity of the whitespace-separated word sets.

    The result is ``len(A & B) / len(A | B)`` where ``A`` and ``B`` are the
    sets of tokens obtained by calling ``split()`` on each sentence. When
    both sentences contain no tokens the function returns ``0.0``.
    """
    words_a, words_b = set(sentence1.split()), set(sentence2.split())
    combined = words_a.union(words_b)
    if not combined:
        return 0.0
    shared = words_a.intersection(words_b)
    return len(shared) / len(combined)

# Compare both OCR engines over the whole dataset
data_dir = '/content/data'
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp')

# List the directory once, keep only regular image files, and sort so the
# report order is reproducible. Counting exactly the files that are scored
# keeps the averages correct even if the folder contains other entries.
image_files = sorted(
    name for name in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, name))
    and name.lower().endswith(image_extensions)
)
num_images = len(image_files)
score_tesseract = 0
score_easyocr = 0

for image_file in image_files:
    image_path = os.path.join(data_dir, image_file)
    # The ground truth is encoded in the file name (underscores stand for
    # spaces). It is normalised with clean_text so that characters such as
    # "'" or "&" are removed from it exactly as they are removed from OCR
    # output; otherwise those words could never match.
    gt = clean_text(os.path.splitext(image_file)[0].replace('_', ' '))

    # clean_text is idempotent, so normalising here is harmless when a reader
    # already returns cleaned text and guarantees that both engines are
    # scored on identically normalised strings.
    tesseract_text = clean_text(read_text_tesseract(image_path))
    easyocr_text = clean_text(read_text_easyocr(image_path))

    score_tesseract += jaccard_similarity(gt, tesseract_text)
    score_easyocr += jaccard_similarity(gt, easyocr_text)

    print(f"file: {image_file}")
    print(f" - GT        : {gt}")
    print(f" - Tesseract : {tesseract_text}")
    print(f" - EasyOCR   : {easyocr_text}\n")

# Final result
print("=" * 50)
if num_images:
    print(f"Average Tesseract score: {score_tesseract / num_images:.3f}")
    print(f"Average EasyOCR score : {score_easyocr / num_images:.3f}")
else:
    # Avoid a ZeroDivisionError when the directory holds no images.
    print(f"No images found in {data_dir}")
print("=" * 50)

file: shut_up_and_look.jpg
 - GT        : shut up and look
 - Tesseract : SHUT UP
AND
LOOK

 

 - EasyOCR   : shut up and look

file: i_try_to_forget_but_i_always_remember.jpg
 - GT        : i try to forget but i always remember
 - Tesseract :  

 - EasyOCR   : i try to forget but s44as remember

file: this_is_your_sign.jpg
 - GT        : this is your sign
 - Tesseract :  

 - EasyOCR   : this is your sign

file: solvem_problem.jpg
 - GT        : solvem problem
 - Tesseract :  

 - EasyOCR   : solvem probler

file: make_art_not_content.jpg
 - GT        : make art not content
 - Tesseract : WAKE ART
NoT
CONTEN F

 - EasyOCR   : nake art no t c nten t

file: don't_panic_just_pray.jpg
 - GT        : don't panic just pray
 - Tesseract :  

 - EasyOCR   : just l pray panc  dontf

file: risk_is_always_better_than_redret.jpg
 - GT        : risk is always better than redret
 - Tesseract :  

 - EasyOCR   : is regret hd nogitcntlon risk always better than

file: watch_me_become_everythin