In [41]:
import importlib
import packages.word_corrector.src.core as core

importlib.reload(core)

<module 'packages.word_corrector.src.core' from '/home/jovyan/work/packages/word_corrector/src/core.py'>

In [7]:
from gradio_client import Client, handle_file   
# импорт contenxtmanager
from contextlib import contextmanager
import time
import os
from pydantic import BaseModel, Field
import csv
from rapidfuzz import process, fuzz
import re

from tqdm import tqdm
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import json
from pathlib import Path

In [2]:
class Timer:
    def __enter__(self):
        self.start = time.time()
        return self  # возвращаем self, чтобы получить доступ к elapsed_time

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time.time()
        self.elapsed = round(self.end - self.start,4)
        #print(f"Cell execution time: {self.elapsed:.4f} seconds")


In [3]:
with Timer() as t:
    time.sleep(1)

print(t.elapsed)

1.0001


In [12]:
DATA_BASE_DIR = Path("/home/jovyan/work/data")

In [13]:
with open(DATA_BASE_DIR / "test_dataset.json") as f:
    test_dataset = json.load(f)

In [14]:
test_dataset[:10]

[{'image_path': 'names/930.jpg',
  'text': 'Молоко КРАСНАЯ ЦЕНА\n800мл\nмит.паст.2,5%'},
 {'image_path': 'names/535.jpg',
  'text': 'Говядина СЕМЕЙНЫЙ\nБЮДЖЕТ   500г\nтушеная в/с'},
 {'image_path': 'names/577.jpg',
  'text': 'Капуста БЕЛОКОЧАННАЯ  1кг\nnan\nnan'},
 {'image_path': 'names/666.jpg',
  'text': 'Кексики МАННИК с \nизюмом   300г\nnan'},
 {'image_path': 'names/927.jpg', 'text': 'Чеснок 100г\nnan\nnan'},
 {'image_path': 'names/438.jpg',
  'text': 'Огурцы Среднеплодные\n600г\nGLOBAL VILLADE'},
 {'image_path': 'names/867.jpg',
  'text': 'Лапка BIG BON\n85г\nЯКИСОБА В СОУСЕ ПО-ЯПОНС.греч.'},
 {'image_path': 'names/721.jpg',
  'text': 'Пюре ФРУТОНЯНЯ\n80г\nиз мяса цыплен.с говядиной дет.'},
 {'image_path': 'names/178.jpg',
  'text': 'Sos-маска ARGANA\nOIL  270мл\nPROFESSIONAL д/волос'},
 {'image_path': 'names/614.jpg', 'text': 'Кетчуп МАХЕЕВЪ ЛЕЧО\n500г\nnan'}]

In [25]:
model_list = [
    "gemma3:4b",
    "gemma3:12b",
    "qwen2.5vl:32b",
    "qwen2.5vl:3b",
    "qwen2.5vl:7b"
]

In [15]:
client = Client("http://api:7860/")

Loaded as API: http://api:7860/ ✔


In [16]:
SYSTEM_PROMPT = '''
Вы — ассистент OCR , который извлекает информацию с русских товарных этикеток на изображениях. В тексте могут встречаться названия на английском.
'''
USER_PROMPT = '''
Пожалуйста, извлеките весь текст на изображении и ничего больше без комментариев. 
'''

In [28]:
def  predict(file_path, model_name = "gemma3:4b"):    
    result = client.predict(
		system_prompt =SYSTEM_PROMPT,
		user_prompt= USER_PROMPT,
		image=handle_file(file_path),
        #image=None,
		selected_model=model_name,
		temperator=0.2,
		api_name="/chat_with_ollama"
	)
    return result

In [18]:


class PhraseCorrector:
    def __init__(self, words_file="words.csv",score_cutoff=70, min_len = 5):
        with open(words_file, "r", encoding="utf-8") as f:
            self.reference_words = [line.strip() for line in f]
        self.score_cutoff = score_cutoff
        self.min_len = min_len





    def correct_words_in_text(self,text):
        corrected_lines = []
        for line in text.strip().split('\n'):
            corrected_words = []
            words = re.split(r"[ \t\f\v.,!?;:()\"«»—–\-\/]+", line.upper().replace("Ё", "Е"))
            words = list(filter(None, words))  # убираем пустые строки, если они есть
        
            for word in words:
                if len(word) < self.min_len:
                    corrected_words.append(word)
                    continue
                # Ищем наиболее близкое слово из словаря
                match = process.extractOne(word, self.reference_words, scorer=fuzz.ratio, score_cutoff=self.score_cutoff)
                #print(match)
                corrected_word = match[0] if match else word
                corrected_words.append(corrected_word)
            corrected_lines.append(' '.join(corrected_words))
        return '\n'.join(corrected_lines)

In [20]:
phrase_corrector = PhraseCorrector(DATA_BASE_DIR / "words.txt")

In [58]:
class CSVRow(BaseModel):
    file_name: str
    text: str
    elapsed_time: float
    corrected_text: str
    elapsed_time_corrected: float
    original_text: str


In [31]:
predict(DATA_BASE_DIR / "names/1.jpg", "qwen2.5vl:7b")

('Вода святой источник 1.0л природная питьевая негаз.ПЭТ', '0.30 секунд')

In [43]:
importlib.reload(core)
from packages.word_corrector.src.core import   PhraseCorrectorByWords, PhraseCorrectorNgrams, correct_text

In [44]:
corrector = PhraseCorrectorByWords("data/words.txt")

In [63]:
def do_experiment(model_name, end_idx = 10):
    print(f"Start experiment with model {model_name}")
    with open(f"results_{model_name}.tsv", "w", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        writer.writerow(CSVRow.model_fields.keys())
        
    
        for test_row in tqdm(test_dataset[:end_idx], desc="Process images", unit="image"):
            image_path = DATA_BASE_DIR / test_row["image_path"]
            row_output = CSVRow(
                file_name="",
                text="",
                elapsed_time=0.0,
                corrected_text="",
                elapsed_time_corrected=0.0,
                original_text = test_row["text"]
            )
        
            row_output.file_name = os.path.basename(test_row["image_path"])
            
            #img = mpimg.imread(image_path)
            #plt.imshow(img)
            #plt.show() 
            
            with Timer() as t:
                result = predict(image_path, model_name)[0]
                #result = "_test_result_\n" + file_name + "\n" + "Тестовый результат"
            row_output.elapsed_time = t.elapsed
            row_output.text = result
            
            with Timer() as t:
                 corrected_result, _ = correct_text(result, corrector)                #corrected_result = "_test_corrected_result_\n" + file_name + "\n" + "Тестовый исправленный результат"
            row_output.elapsed_time_corrected = t.elapsed
            row_output.corrected_text = corrected_result
            #print(json.dumps(row_output.model_dump(), indent=4, ensure_ascii=False))
            writer.writerow(row_output.model_dump().values())
            f.flush()  
        

## Эксперименты

In [64]:
do_experiment("qwen2.5vl:3b", end_idx = -1)

Start experiment with model qwen2.5vl:3b


Process images: 100%|██████████| 198/198 [02:18<00:00,  1.43image/s]


In [None]:
do_experiment("qwen2.5vl:7b", end_idx = -1)
do_experiment("qwen2.5vl:32b", end_idx = -1)

Start experiment with model qwen2.5vl:7b


Process images: 100%|██████████| 198/198 [01:55<00:00,  1.72image/s]


Start experiment with model qwen2.5vl:32b


Process images:  22%|██▏       | 44/198 [03:39<06:42,  2.62s/image]  