In [8]:
import os
import pytz
from glob import glob
from itertools import product
from datetime import datetime as DateTime

import cv2
import spacy
import pandas as pd
import pytesseract as pyt

In [9]:
# Folder scanned for input images (*.png, *.jpg, *.jpeg). This is a
# machine-specific absolute path; change it before running elsewhere.
imgs_folder = 'W:/malexandersalazar/tools-python-image-to-text/raw'
# Language of the text in the images. Only 'en' and 'es' are handled by the
# cells below; any other value makes them raise 'Not supported language'.
lang = 'en'

In [10]:
# Tell pytesseract where the Tesseract executable lives (machine-specific
# Windows install path).
pyt.pytesseract.tesseract_cmd = "C:/Program Files/Tesseract-OCR/tesseract.exe"
# Ask spaCy to prefer the GPU. The result is kept in `gpu` but is not read
# again in the visible cells.
# NOTE(review): presumably a bool telling whether a GPU was activated - confirm
# against the spaCy documentation.
gpu = spacy.prefer_gpu()

In [11]:
def get_grayscaled(image):
    """Convert a BGR image to grayscale with ``cv2.COLOR_BGR2GRAY``.

    Args:
        image: Image in OpenCV's BGR channel order (e.g. from ``cv2.imread``).

    Returns:
        Whatever ``cv2.cvtColor`` produces for the BGR-to-gray conversion.
    """
    conversion = cv2.COLOR_BGR2GRAY
    gray = cv2.cvtColor(image, conversion)
    return gray

def get_thresholded(image, block_size, c):
    """Binarize an image with OpenCV's Gaussian adaptive threshold.

    Args:
        image: Input image handed straight to ``cv2.adaptiveThreshold``
            (the notebook passes the grayscale image here).
        block_size: Forwarded as the ``blockSize`` argument.
        c: Forwarded as the ``C`` argument.

    Returns:
        The thresholded image; pixels that pass the threshold get value 255.
    """
    max_value = 255
    method = cv2.ADAPTIVE_THRESH_GAUSSIAN_C
    threshold_type = cv2.THRESH_BINARY
    return cv2.adaptiveThreshold(
        image, max_value, method, threshold_type, block_size, c
    )

def dict_configs(d):
    """Yield every combination of a parameter grid as a dict.

    Args:
        d: Mapping from parameter name to an iterable of candidate values.

    Yields:
        One dict per element of the Cartesian product of the value iterables,
        keyed by the same names as ``d``. Combinations come out in
        ``itertools.product`` order (the last key varies fastest).
    """
    names = list(d.keys())
    candidates = list(d.values())
    for combination in product(*candidates):
        yield {name: value for name, value in zip(names, combination)}

In [12]:
def process(img_file, lang, block_size, c):
    """Threshold one image, save the thresholded copy, and OCR it.

    The thresholded image is written next to the source image as
    ``ocr_<name>/<name>_<block_size>_<c>.jpg`` so each configuration can be
    inspected by eye afterwards.

    Args:
        img_file: Path of the image to read.
        lang: ``'es'`` or ``'en'``; mapped to Tesseract's ``'spa'`` / ``'eng'``.
        block_size: Block size forwarded to ``get_thresholded``.
        c: Constant forwarded to ``get_thresholded``.

    Returns:
        The text pytesseract extracts from the thresholded image.

    Raises:
        ValueError: If ``lang`` is not supported (message
            ``'Not supported language'``).
        OSError: If ``cv2.imread`` cannot read ``img_file``.
    """
    # Validate the language first so an unsupported value fails before any
    # directory or image file is created on disk.
    tess_langs = {'es': 'spa', 'en': 'eng'}
    if lang not in tess_langs:
        raise ValueError('Not supported language')
    tess_lang = tess_langs[lang]

    # cv2.imread signals failure by returning None instead of raising; check
    # here so the error names the file rather than surfacing inside cvtColor.
    img = cv2.imread(img_file)
    if img is None:
        raise OSError('Could not read image: ' + str(img_file))

    current_img_dir, file_name = os.path.split(img_file)
    file_name_without_ext = os.path.splitext(file_name)[0]

    # os.path.join keeps the output next to the image even when img_file has
    # no directory part; plain '/' concatenation would point at the
    # filesystem root in that case.
    new_dir = os.path.join(current_img_dir, 'ocr_' + file_name_without_ext)
    os.makedirs(new_dir, exist_ok=True)

    thresholded_img = get_thresholded(get_grayscaled(img), block_size, c)

    new_file_name = os.path.join(
        new_dir, file_name_without_ext + '_' + str(block_size) + '_' + str(c)
    )
    cv2.imwrite(new_file_name + '.jpg', thresholded_img)

    return pyt.image_to_string(thresholded_img, lang=tess_lang)


In [13]:
# Grid of adaptive-threshold settings tried on every image: each combination
# of 'blockSize' and 'C' is passed to process().
params = {
    'blockSize': [13,21,55,89,233],
    'C': [2,3,5,8,13,21],
}

# Part-of-speech tags that count towards an OCR result's score. Each tag is
# counted exactly once; an earlier version of this cell added the VERB count
# twice, which silently double-weighted verbs.
scored_pos = {'NOUN', 'VERB', 'PROPN', 'ADJ', 'ADP', 'PRON', 'CCONJ'}

# spaCy pipeline used for tagging, chosen by the configured language.
spacy_models = {'es': "es_core_news_md", 'en': "en_core_web_md"}
if lang not in spacy_models:
    raise ValueError('Not supported language')
nlp = spacy.load(spacy_models[lang])

img_files = glob(imgs_folder + '/*.png') + glob(imgs_folder + '/*.jpg') + glob(imgs_folder + '/*.jpeg')

for img_file in img_files:
    # One (block_size, c, text, score) record per threshold configuration.
    text_scores = []

    for config in dict_configs(params):
        block_size = config['blockSize']
        c = config['C']

        text_result = process(img_file,lang,block_size,c)

        # Score = number of tokens tagged with one of the scored POS tags.
        # Counting directly over the tokens avoids building a DataFrame and
        # several filtered copies for every configuration.
        tokens = nlp(text_result)
        text_score = sum(1 for w in tokens if w.pos_ in scored_pos)
        text_scores.append((block_size,c, text_result,text_score))

    scores_df = pd.DataFrame(text_scores, columns=['block_size','c','text_result','text_score'])
    # A stable sort keeps tied scores in grid order, so the chosen result is
    # reproducible from run to run.
    sorted_scores_df = scores_df.sort_values('text_score', ascending=False, kind='stable').reset_index(drop=True)
    text_final_result = sorted_scores_df.loc[0, 'text_result']

    # Append the best-scoring text to the running results file (mode "a", so
    # re-running the cell adds new entries rather than replacing old ones).
    with open("results.txt", "a", encoding='utf-8') as f:
        now = DateTime.now(pytz.timezone('America/Lima'))
        log_content = f"========================================================\n[{now.strftime('%Y-%m-%d %H:%M')}]\n\n{text_final_result}\n"
        f.write(log_content)

    # Record the three best (block_size, c) pairs for this image in log.csv,
    # accumulating across images and across runs.
    top_df = sorted_scores_df[['block_size', 'c']][:3]

    if os.path.exists('log.csv'):
        old_df = pd.read_csv('log.csv', encoding='utf-8')
        new_df = pd.concat([old_df, top_df])
    else:
        new_df = top_df

    new_df.to_csv('log.csv',index=False, encoding='utf-8')

percen'


jie:rap'

growth: Of, renewables; fg
percent Of primary encigy
: [s)rar growing ‘fast bit
hydro ‘providi a percent
‘mi abo

‘Had reality #2:


Hard reality #x: Despite the'rapid growth of renewables, fos
fuels'still provide:more: than, 84 percent of primary chiergy,
Rereivables.(wind, Solary-and biofuels)-are: growing fist buy
still provide just.s: percent, with hydro’ providing: 6.4 percent
and nuclear 4:3,percent,3-At the saine time; about.770 million
people in the world have.no access to electricity, and 2.6 billion
don’t have access to. energy-for clean cooking.'The challeng
is to Increase: access to energy. while.also decarbonizing it
eens ‘ossil tuel use before there are otheroptions to replace

this 84 percent of global energy. use brings: real risks of con-
tinuing of worsening energy poverty. This is often recognized
by climate advocates—foi example, when 432 environmen:

tal groups from 53 countries sent-a-letter to: officials in tbe
Biden Adminictenetnc ccs ' tar t

In [14]:
# new_df.groupby(['block_size', 'c']).value_counts()