## Preprocessing

### Preprocess images

In [213]:
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter

In [60]:
def preprocess(file):
    img = Image.open(file)

    obj = ImageEnhance.Sharpness(img)
    img = obj.enhance(2.0)
    
    obj = ImageEnhance.Contrast(img)
    img = obj.enhance(3)

    img = np.array(img)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    img = cv2.GaussianBlur(img, (3, 3), 0)

    kernel_dilate = np.ones((3,3))
    img = cv2.dilate(img, kernel_dilate, iterations = 1)

    kernel_erode = np.ones((5,5))
    img = cv2.erode(img, kernel_erode, iterations = 1)

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    save_path = "data/strings_preprocessed/" + file[-9:].strip("/")
    cv2.imwrite(save_path, img)   

    return save_path

### Prepare files

In [None]:
num_files = 23
files = [f"data/strings/ex_{i}.jpg" for i in range(1, num_files + 1)]

## GigaChat

In [4]:
pip install gigachat

Note: you may need to restart the kernel to use updated packages.


In [5]:
from gigachat import GigaChat

In [None]:
auth_key = "YOUR KEY"

### Get access token

In [71]:
giga = GigaChat(
   credentials=auth_key,
   scope="GIGACHAT_API_PERS",
   model="GigaChat-Max",
   verify_ssl_certs=False
)

In [200]:
access_token = giga.get_token().access_token
access_token

'eyJjdHkiOiJqd3QiLCJlbmMiOiJBMjU2Q0JDLUhTNTEyIiwiYWxnIjoiUlNBLU9BRVAtMjU2In0.GK5zsQogFJGrJFGrhnaKAMf4sOlJhBcpO85NgL2R9xbrCn50ejG7EOYAYLy9J9PIZ4fKvTd1NtoGTLnXRUeEw6dU4YYErkFgh26BulaJ-TNf_5th3-MjCTZfNPduKrsQTi9eORjGAv6MkX0tM1NpeZnh08Q_9r11aUmSVMecZM07D434zeO0jmlO2n5sLyFdOHd-46CyZle5sYXjQZVMtvCN1yEO6pEomchdFbD-XjyapoCHx1oN_6WoUJMetX2qMC3rPtVnJyZjj9MXPGwkMetzaSVoQWbsGKpBBMWxYZQcIkRnvEV6PupRWU0uIAAy2LuINOhbLxRcUoTW__HPNg.UNSb6J5Z5YG3ZyIVCHhbow.RU3ABiL70u5aaE_9_eD28dkznNKz3TEhb70mgtJazev0a_U-QpNz97-hDISe0tfctOwyDCJ8hnKwfOn9pnxtbfIbt5LzXPmHtdyuPeZPUatXOlRshHTNMJRMZPLvdd76xewRhVcrMhDanbC8LC46MKDs4aj3T6AMdc2dcQchMREGq7UyojQtoLO-q1ayCwfkw9nivoZ7Ch_BaLxUo7GX9I5YU_qVAGmyQDjwHEzeGvTawv2ncpxvM8JR5wGZ5GZMvGza0emZdyK4jDRZE0awfgoLNw-uoJaTKl_VCKEKXHB-ODbcA9Sn_S95CJK6gDWt73jnQWLeo9uPGQeSC6nLmYuuinexCYXEgRFHsDzHYNZ0qp3XPgOPQ5F-KvZJvpC39lUfX7yRHTtU0nBj4PaayFu2biOWM25pkYxQSA0XL_ETsFNpD56k8PkZiny6xRewOt3OstZFNJFpunnVgM9-4F-xIxBHbN374eDWn06wqN-Tt1lQy4kKOjzP4jbOjWjqxcC0b9Dc74P00cyPlWrbRATFAjQySPAjQOIHc8mVQGGJ3

### Storage functions

In [73]:
def upload_data_to_storage(files, to_preprocess = False):
    files_id = []
    for file in files:
        if to_preprocess:
            file = preprocess(file)

        files_id.append(giga.upload_file(open(file, "rb"), purpose="general").id_)
    return files_id

In [74]:
import requests

In [75]:
def get_data_from_storage():
    url = f"https://gigachat.devices.sberbank.ru/api/v1/files"

    payload={}
    headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {access_token}'
    }

    return requests.request("GET", url, headers=headers, data=payload, verify = False)


In [76]:
def delete_data_from_storage(files_id):
    for file_id in files_id:
        url = f"https://gigachat.devices.sberbank.ru/api/v1/files/{file_id}/delete"

        payload={}
        headers = {
        'Accept': 'application/json',
        'Authorization': f'Bearer {access_token}'
        }

        requests.request("POST", url, headers=headers, data=payload, verify = False)

### Queries

In [78]:
files_id = upload_data_to_storage(files, False)

In [79]:
files_id_preprocessed = upload_data_to_storage(files, True)

In [83]:
def prepare_dict_for_data_gigachat(file_id):
    data_dict = {}
    data_dict["Accept"] = 'application/json',
    data_dict["Authorization"] =  f'Bearer {access_token}'
    data_dict["messages"] = [
        {
            "role": "user",
            "content": "Что написано на этом фото? Результат вывести без комментариев",
            "attachments": [file_id],
        }
    ]
    
    return data_dict

In [84]:
prepare_dict_for_data_gigachat(files_id[0])

{'Accept': ('application/json',),
 'Authorization': 'Bearer eyJjdHkiOiJqd3QiLCJlbmMiOiJBMjU2Q0JDLUhTNTEyIiwiYWxnIjoiUlNBLU9BRVAtMjU2In0.nOGVPcNCKbBI_8biJP31nZHQWp22INbTFoq7O0vVbXvk4-VADkOLN82Xq8fzftBJvYrN8Qi_Y2En1-b5j0cG8KkoOxOH4KU3cjgvEjhU02beP3QAaWipZkaPkfziy_lQ4hGzaBxqYPy3cupbSioovcZid6Hxt-Omw-etRSWDSVtZc8jJ3nBH0DLLHjLGyRpxq78VXE_3CT6qxspEez0qyMV269tOCvEvd7wq5aLxh26h1PZlH8Rcb3buRCXLphxrzF_ikZlPs2HJ_SfaIlZSM9I5aELRM54MIeisfY9NXGG6LzptReIdQ_crSNNzbvBKVcn21UiVCGF4owfEfnYr9A.JgZsblVqqJtMbFXeuu670g.cULarClEyXqbZxnMYZud-Z0agLVSrECG8k5mpeeEPnxwCN81GHrJo4y1LsdKf6YPh5LCYw1Ac3S_yfJWtopCXx-9WNk4wsCzaK4NfW-BlZXPcyycpke6ElTYol9AJ8Tshz6Ev5ApUOVbi4_cLr4gR1G4GPLB6T4kYAz9mDQywjlGQN-w9XztXb_1KTyhQGTUUfNMkDgR8y-4w2vg5A_HFtlR-Te52pRN71YcMvDUnQeUvSHTGdUNV_7W9x3vMLhiwauLwkFJBqbPKj1Lt9N5-QV8HbpBn-8zZBINLEnM8rivdJlkUDfqRDkHZi-oyGat885tgy4x9Iw6_WFDzsth85FFe8HMr6LwJTLM550IUppHlCJTmXNftAcUayMkVV0v4AkoqmEg28cRw-PB-S_f8fMpFF2mrk7FcpK2qVvwuBw8vahCrFF6a0WAkcMgOtvVk--r3qDfn0NjAPezwMAn4KLcHzirc_W9fgL27iPtKqPyTERmr2

In [85]:
def query_gigachat(file_id):
    data_dict = prepare_dict_for_data_gigachat(file_id)
    return giga.chat(data_dict)

In [86]:
import time

In [87]:
def get_answers_for_images_gigachat(num_files, files_id):
    responses = []

    for i in range(num_files):
        responses.append(query_gigachat(files_id[i]))
        if i != 0 and i % 10:
            time.sleep(30)
    return responses

In [88]:
responses_gigachat_preprocessed = get_answers_for_images_gigachat(23, files_id_preprocessed)

In [103]:
responses_gigachat = get_answers_for_images_gigachat(23, files_id)

In [212]:
responses_gigachat[:2]

[ChatCompletion(x_headers={'x-request-id': '174ed812-8f73-4efa-904b-61bff496db0e', 'x-session-id': '63d6bb9f-f744-4e9f-968c-271134a213d2', 'x-client-id': None}, choices=[Choices(message=Messages(role='assistant', content='Это проект для СВТ', function_call=None, name=None, attachments=None, data_for_context=None, functions_state_id=None, reasoning_content=None, id_=None), index=0, finish_reason='stop')], created=1761311929, model='GigaChat-Max:2.0.28.2', thread_id=None, message_id=None, usage=Usage(prompt_tokens=913, completion_tokens=6, total_tokens=919, precached_prompt_tokens=11), object_='chat.completion'),
 ChatCompletion(x_headers={'x-request-id': '82f0b104-bd44-4671-8296-6f58c8821fa5', 'x-session-id': 'f79648e0-f148-4dd2-acc9-b402db7f78cb', 'x-client-id': None}, choices=[Choices(message=Messages(role='assistant', content='Ма доске пишу текст', function_call=None, name=None, attachments=None, data_for_context=None, functions_state_id=None, reasoning_content=None, id_=None), index

In [106]:
delete_data_from_storage(files_id)
delete_data_from_storage(files_id_preprocessed)



### Extract texts

In [107]:
import re

In [108]:
def get_texts_from_answers_gigachat(responses):
    answers = []
    for response in responses:
        if "status" in response and response["status"] != 200:
            raise RuntimeError(response["message"])
        answers.append(response.choices[0].message.content.strip('\n'))
    return answers

In [109]:
texts_gigachat = get_texts_from_answers_gigachat(responses_gigachat)
texts_gigachat_preprocessed = get_texts_from_answers_gigachat(responses_gigachat_preprocessed)

In [110]:
texts_gigachat_preprocessed[:5]

['Это проект для СВТ',
 'да доске пиши текст',
 'Чтобы модель',
 'Распознала хорошо и',
 'Сегодня today I was очень рад']

## Qwen2.5-VL with OpenRouter

### Queries

In [None]:
qwen_key = "YOUR KEY"

In [143]:
import json
import base64

In [144]:
def encode_img_to_base64(file, to_preprocess = False):
    if to_preprocess:
        file = preprocess(file)
    with open(file, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode('utf-8')

In [277]:
def prepare_dict_for_data_openrouter(model, files, to_preprocess = False):
    data_dict = {}
    data_dict["model"] = model
    data_dict["messages"] = [
        {
        "role": "user",
            "content": 
                [
                    {
                        "type": "text",
                        "text": "Извлечь текст из этих фото. Результат вывести в формате json словаря с ключами - номерами фото,"
                        "для каждого фото ровно один текст, без комментариев"
                    },
                   
                ]
            }
        ]
    
    for file in files:
        base64_file = encode_img_to_base64(file, to_preprocess)
        data_dict["messages"][0]["content"].append({"type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{base64_file}"}})
    
    return data_dict


In [225]:
def query_openrouter(model, files, to_preprocess = False):
    data_dict = prepare_dict_for_data_openrouter(model, files, to_preprocess)

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {qwen_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(data_dict)
    )

    return response.json()

In [217]:
def get_answers_for_images_openrouter(model, num_files, files, to_preprocess = False):
    responses = []

    for i in range(0, num_files, 10):
        responses.append(query_openrouter(model, files[i: min(i+10, num_files)], to_preprocess))
        time.sleep(30)
    return responses

In [285]:
responses_qwen = get_answers_for_images_openrouter("qwen/qwen2.5-vl-32b-instruct:free", num_files, files)

In [278]:
responses_qwen_preprocessed = get_answers_for_images_openrouter("qwen/qwen2.5-vl-32b-instruct:free", num_files, files, True)

In [None]:
texts = responses_qwen[2]['choices'][0]['message']['content']
texts

'```json\n{\n    "1": "Objects are in some sense",\n    "2": "Second element",\n    "3": "Last element"\n}\n```'

### Extract texts

In [270]:
def get_texts_from_answers_qwen(responses):
    answers = []
    for response in responses:
        if "error" in response:
            raise RuntimeError(response["error"])
        
        texts = response['choices'][0]['message']['content']
        text_dict = json.loads(texts[texts.find('{'): texts.rfind('}') + 1])
        answers += [text.strip('\n') for text in text_dict.values()]
    return answers

In [287]:
texts_qwen = get_texts_from_answers_qwen(responses_qwen)
texts_qwen_preprocessed = get_texts_from_answers_qwen(responses_qwen_preprocessed)

In [290]:
texts_qwen_preprocessed[:5]

['Это проект для СВТ',
 'На доске пишу текст',
 'Чтобы модели',
 'распознали хорошо и',
 'Сегодня today I шаз очень рад']

## Compute CER WER

In [124]:
import pandas as pd

In [125]:
data = pd.read_csv('data/strings/dataset_text_lines_CER_WER.csv', sep = ';')
data.head()

Unnamed: 0,Picture name,text_line
0,ex_1.jpg,Это проект для СВТ
1,ex_2.jpg,На доске пишу текст
2,ex_3.jpg,Чтобы модель
3,ex_4.jpg,распознала хорошо и
4,ex_5.jpg,Сегодня today I was очень рад


In [126]:
from evaluate import load
cer = load("cer")
wer = load("wer")

In [127]:
def evaluate_cer_wer(texts, model, to_preprocess):
    total_cer = 0
    total_wer = 0
    
    for i, text in enumerate(texts):
        reference = data["text_line"][i]
        dif = len(text) - len(reference)
        if dif:
            if dif > 0:
                reference += (' '*dif)
            else:
                text += (' '*(-dif))

        total_cer += cer.compute(references=[reference], predictions=[text])
        total_wer += wer.compute(references=[reference], predictions=[text])
    
    info = "preprocessed images"
    if not to_preprocess:
        info = "not " + info

    print(f"CER score on {model=} with {info} is: {total_cer/(len(texts))}")
    print(f"WER score on {model=} with {info} is: {total_wer/(len(texts))}")

### GigaChat metrics

In [128]:
evaluate_cer_wer(texts_gigachat_preprocessed, "qwen/qwen2.5-vl-32b-instruct:free", True)
print('\n')
evaluate_cer_wer(texts_gigachat, "qwen/qwen2.5-vl-32b-instruct:free", False)

CER score on model='qwen/qwen2.5-vl-32b-instruct:free' with preprocessed images is: 0.1989638619072258
WER score on model='qwen/qwen2.5-vl-32b-instruct:free' with preprocessed images is: 0.3615942028985507


CER score on model='qwen/qwen2.5-vl-32b-instruct:free' with not preprocessed images is: 0.20979892056837138
WER score on model='qwen/qwen2.5-vl-32b-instruct:free' with not preprocessed images is: 0.3978260869565218


### QWEN 2.5-VL metrics

In [292]:
evaluate_cer_wer(texts_qwen_preprocessed, "qwen/qwen2.5-vl-32b-instruct:free", True)
print('\n')
evaluate_cer_wer(texts_qwen, "qwen/qwen2.5-vl-32b-instruct:free", False)

CER score on model='qwen/qwen2.5-vl-32b-instruct:free' with preprocessed images is: 0.10073111634807762
WER score on model='qwen/qwen2.5-vl-32b-instruct:free' with preprocessed images is: 0.25652173913043486


CER score on model='qwen/qwen2.5-vl-32b-instruct:free' with not preprocessed images is: 0.08348190007998262
WER score on model='qwen/qwen2.5-vl-32b-instruct:free' with not preprocessed images is: 0.24782608695652178
