In [1]:
pip install pytesseract

Note: you may need to restart the kernel to use updated packages.


In [2]:
import streamlit as st
import pandas as pd
import pytesseract
from typing import List, Tuple
from io import BytesIO
from PIL import Image

# Maps the human-readable language names shown in the UI to the language
# codes passed to Tesseract via the `lang` argument. The keys must match the
# labels offered in the language multiselect exactly, because extract_text
# looks each selected label up in this dict.
tesseract_codes = {
    'Chinese (Simplified)': 'chi_sim',
    'Chinese (Traditional)': 'chi_tra',
    'English': 'eng',
    'Japanese': 'jpn',
    'Korean': 'kor',
    'Vietnamese': 'vie',
    'Tagalog': 'tgl',
}

def extract_text(files: List = [], languages: List[str] = []) -> None:
    """Run OCR on each uploaded image and show the results in the Streamlit app.

    Renders a table with one row per file (`file_name`, `extracted_text`) and a
    button that downloads the same table as a UTF-8 encoded CSV.

    Args:
        files: Uploaded file objects; each must provide `.getvalue()` (the raw
            image bytes) and `.name`. The default list is never mutated.
        languages: Human-readable language names; every entry must be a key of
            `tesseract_codes`. The default list is never mutated.

    Raises:
        KeyError: If an entry of `languages` is not a key of `tesseract_codes`.
    """
    # NOTE(review): this function is wired up as a button `on_click` callback;
    # confirm that widgets created inside a callback persist the way the app
    # expects across Streamlit reruns.

    # Tesseract accepts several languages joined with "+", e.g. "eng+jpn".
    language_codes = [tesseract_codes[lang] for lang in languages]
    formatted_languages = "+".join(language_codes)

    # Collect the rows in a plain list and build the DataFrame once.
    # DataFrame.append returned a new frame (the original code discarded it,
    # leaving the table empty) and was removed in pandas 2.0.
    records = []
    for file in files:
        file_bytes = BytesIO(file.getvalue())
        with Image.open(file_bytes) as file_image:
            extracted_text = pytesseract.image_to_string(
                image=file_image,
                # An empty selection falls back to pytesseract's default
                # language instead of passing an empty `-l` argument.
                lang=formatted_languages or None,
            )
        records.append({'file_name': file.name, 'extracted_text': extracted_text})

    # Passing `columns` keeps the header present even when no files were given.
    display = pd.DataFrame(records, columns=['file_name', 'extracted_text'])

    st.dataframe(display)

    download_display_data = display.to_csv(index=False).encode('utf-8')

    st.download_button(
        label="Download data as CSV",
        data=download_display_data,
        file_name='ocr_extraction.csv',
        mime='text/csv',
    )


# With accept_multiple_files=True the uploader returns a list that is empty
# (never None) until the user picks at least one file.
uploaded_files = st.file_uploader(label="Choose a file", accept_multiple_files=True, type=['png', 'jpg'])

# Only offer the OCR controls once at least one image has been uploaded;
# an `is not None` check would always pass for the list return value.
if uploaded_files:

    # Offer exactly the languages extract_text can translate to Tesseract
    # codes, so the labels can never drift from the lookup table's keys.
    options = st.multiselect(
        'Select a language or multiple to perform OCR on!',
        list(tesseract_codes),
        ['English'],
    )

    # The callback receives the uploaded files and the selected language names.
    st.button('Initiate the OCR :)', on_click=extract_text, args=(uploaded_files, options))

2023-04-09 19:58:24.619 
  command:

    streamlit run /home/codespace/.local/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]
