In [16]:
from pytesseract import pytesseract, Output
import cv2
import pandas as pd
# Load the receipt image
image = cv2.imread('/home/miza/Magisterka/src/data/paragon3.jpg')

# Image Preprocessing
# Convert the image to grayscale
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply Gaussian blur to reduce noise
blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

# Apply binary thresholding (Binarization)
_, threshold_image = cv2.threshold(blurred_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Optional: Resize image for better OCR accuracy (if necessary)
height, width = threshold_image.shape
threshold_image = cv2.resize(threshold_image, (width * 2, height * 2))
# Extract text with bounding boxes
ocr_data = pytesseract.image_to_data(image, output_type=Output.DICT, config = '--psm 6', lang='pol')

lines = []
current_line = []

# Iterate through OCR results and group words into lines
prev_line_number = ocr_data['line_num'][0]
for i in range(len(ocr_data['text'])):
    word = ocr_data['text'][i]
    line_number = ocr_data['line_num'][i]

    if word.strip():  # Skip empty words
        if line_number != prev_line_number:  # New line detected
            lines.append(" ".join(current_line))  # Append the current line
            current_line = []  # Start a new line
            prev_line_number = line_number

        current_line.append(word)

# Append the last line if there is one
if current_line:
    lines.append(" ".join(current_line))

# Create a DataFrame from the lines
df = pd.DataFrame(lines, columns=["Line_Text"])

# Print the DataFrame
print(df)

                               Line_Text
0                                       
1     Tabl Plusssz 203zt B 4 x6,69 6,69B
2        Tost Maślany a D 8 «4,28 34,240
3    Jost pełnozia 500g l 8 x3,19 25,520
4     Woda Niegaz 1,51 A 12 x0,39 11,88R
5  HodaNGaz Muszyf ,5L A 6 x2,85 17, 10Ą
6    lap CocalolaZeroŻL [i 1 x8,69 8,69A


In [14]:
from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch

# Sample dataset structure
data = {
    'Text': [
        "SALATKA JARZY 250g-D",
        "JOG ALE PITNY 290g-0",
        "KAJZERKA PREM 60g-D ",
        "KAJZERKA PREM 60g-D "
    ],
    'Category': ['Food', 'Beverage', 'Food', 'Food'],
}

# Create a DataFrame
df = pd.DataFrame(data)

# Tokenizer initialization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text
inputs = tokenizer(df['Text'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Example labels (for classification task, need to convert to numeric values)
label_mapping = {'Food': 0, 'Beverage': 1}  # Mapping categories to numeric values
labels = [label_mapping[label] for label in df['Category']]

# Pad the labels to match the sequence length of the inputs (important for token classification)
labels_padded = []
for i, label in enumerate(labels):
    # Create a label tensor of the same length as the tokenized input
    label_tensor = torch.tensor([label] * len(inputs['input_ids'][i]))
    labels_padded.append(label_tensor)

# Stack the labels so they form a tensor with the correct shape
labels_padded = torch.stack(labels_padded)

# Add labels to inputs
inputs['labels'] = labels_padded

# Convert inputs to Dataset format
dataset = Dataset.from_dict(inputs)

# Initialize BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_mapping))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,  # Use dataset here
    eval_dataset=dataset     # You should use a separate validation set for evaluation
)

# Train the model
trainer.train()

# After training, you can use the model for prediction
outputs = model(**inputs)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


In [15]:
import torch
import torch.nn.functional as F
# Assuming 'outputs' is the TokenClassifierOutput object
logits = outputs.logits  # Access the logits from the output

# Apply softmax to the logits to get probabilities
probabilities = F.softmax(logits, dim=-1)
predictions = probabilities.argmax(dim=-1)
# Print the probabilities
print(predictions)
# Example class mapping
label_mapping = {0: "Food", 1: "Beverage"}

# Map the predictions to class labels
predicted_labels = [label_mapping[pred.item()] for pred in predictions[0]]

# Print the predicted labels for each token
print(predicted_labels)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
['Food', 'Food', 'Food', 'Food', 'Food', 'Food', 'Food', 'Food', 'Food', 'Food', 'Food']


In [22]:
data = {
    'Text': [
        "Tabl Plusssz",
        "Tost Maślany",
        "Jost pełnozia",
        "Woda Niegaz",
        "HodaNGaz Muszyf",
        "lap CocalolaZeroŻL"
    ]
}
inputs = tokenizer(data['Text'], padding=True, truncation=True, return_tensors='pt')
for sentence, input_ids in zip(data['Text'], inputs['input_ids']):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)  # Convert IDs to tokens
    print(f"Original: {sentence}")
    print(f"Tokens: {tokens}\n")

Original: Tabl Plusssz
Tokens: ['[CLS]', 'tab', '##l', 'plus', '##ss', '##z', '[SEP]', '[PAD]', '[PAD]', '[PAD]']

Original: Tost Maślany
Tokens: ['[CLS]', 'to', '##st', 'mas', '##lan', '##y', '[SEP]', '[PAD]', '[PAD]', '[PAD]']

Original: Jost pełnozia
Tokens: ['[CLS]', 'jo', '##st', 'pe', '##ł', '##no', '##zia', '[SEP]', '[PAD]', '[PAD]']

Original: Woda Niegaz
Tokens: ['[CLS]', 'wo', '##da', 'ni', '##ega', '##z', '[SEP]', '[PAD]', '[PAD]', '[PAD]']

Original: HodaNGaz Muszyf
Tokens: ['[CLS]', 'ho', '##dan', '##ga', '##z', 'mu', '##sz', '##y', '##f', '[SEP]']

Original: lap CocalolaZeroŻL
Tokens: ['[CLS]', 'lap', 'coca', '##lo', '##laze', '##ro', '##z', '##l', '[SEP]', '[PAD]']



In [21]:
outputs = model(**inputs)
logits = outputs.logits  # Access the logits from the output

# Apply softmax to the logits to get probabilities
probabilities = F.softmax(logits, dim=-1)
predictions = probabilities.argmax(dim=-1)
# Print the probabilities
print(predictions)
# Example class mapping
label_mapping = {0: "Food", 1: "Beverage"}

# Map the predictions to class labels
predicted_labels = [label_mapping[pred.item()] for pred in predictions[0]]

# Print the predicted labels for each token
print(predicted_labels)

tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
['Food', 'Food', 'Food', 'Beverage', 'Food', 'Food', 'Food', 'Food', 'Food', 'Food']
