# 📦 OCR-Based Waybill / Shipping Label Text Extraction System
This notebook implements a **complete automated OCR pipeline** to:
1. Perform OCR on shipping/waybill images
2. Extract the full text line containing pattern `_1_`
3. Evaluate accuracy on a labeled dataset

Supports:
- OpenCV preprocessing
- Tesseract OCR
- EasyOCR (optional)
- Regex-based extraction
- Batch inference
- Accuracy scoring


In [None]:
# Install dependencies (uncomment if running in Colab)
# !apt install tesseract-ocr
# !pip install pytesseract easyocr opencv-python matplotlib pandas


In [None]:
import cv2
import pytesseract
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import os
from IPython.display import display


## 🔧 Preprocessing Function

In [None]:
def preprocess(img):
    """Binarize a BGR image ahead of OCR.

    The image is converted to grayscale, smoothed with a 3x3 Gaussian
    blur, and then thresholded with a Gaussian adaptive threshold
    (maximum value 255, block size 31, constant 2).

    Args:
        img: Image array in BGR channel order, as accepted by
            ``cv2.cvtColor`` with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The thresholded single-channel image produced by
        ``cv2.adaptiveThreshold``.
    """
    kernel_size = (3, 3)
    block_size, offset = 31, 2

    smoothed = cv2.GaussianBlur(
        cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), kernel_size, 0
    )
    return cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        block_size,
        offset,
    )


## 🔍 OCR Extraction

In [None]:
def ocr_tesseract(img):
    """Run Tesseract on an image and return the recognized text.

    Tesseract is invoked with ``--oem 3 --psm 6``; per the Tesseract
    command-line documentation these select the default OCR engine mode
    and "assume a single uniform block of text" page segmentation.

    Args:
        img: Image to recognize, in any form accepted by
            ``pytesseract.image_to_string``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(img, config="--oem 3 --psm 6")


## 🧩 Extract Line Containing `_1_`

In [None]:
def extract_pattern_line(text, pattern="_1_"):
    """Return the first line of *text* that contains *pattern*.

    The match is a plain substring test (not a regular expression), and
    the returned line has leading and trailing whitespace stripped.

    Args:
        text: Multi-line text to search. ``None`` or an empty string is
            treated as "nothing to search".
        pattern: Substring that the wanted line must contain.

    Returns:
        The stripped first matching line, or ``None`` when no line
        contains *pattern* (or *text* is empty/``None``).
    """
    # Guard so an absent OCR result reads as "no match" instead of
    # raising AttributeError on ``None.splitlines()``.
    if not text:
        return None
    return next(
        (line.strip() for line in text.splitlines() if pattern in line),
        None,
    )


## 📄 Process a Single Image

In [None]:
def process_image(path):
    """Run the full OCR pipeline on one image file.

    The image is loaded with ``cv2.imread``, binarized by ``preprocess``,
    recognized by ``ocr_tesseract``, and searched with
    ``extract_pattern_line`` using its default pattern.

    Args:
        path: Path of the image file to process.

    Returns:
        A tuple ``(line, text, prep)``: the extracted line (or ``None``
        when no line matched), the full OCR text, and the preprocessed
        image.

    Raises:
        OSError: If the image cannot be loaded from *path*.
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with the offending path instead of deep inside cvtColor.
    if img is None:
        raise OSError(
            f"Could not read image (missing file or unsupported format): {path}"
        )
    prep = preprocess(img)
    text = ocr_tesseract(prep)
    line = extract_pattern_line(text)
    return line, text, prep


## 📁 Batch Process Folder of Images

In [None]:
def process_folder(folder):
    """Run ``process_image`` on every image file directly inside *folder*.

    Only regular files whose names end in ``.jpg``, ``.png`` or ``.jpeg``
    (case-insensitive) are processed; subfolders are not descended into.

    Args:
        folder: Path of the directory holding the images.

    Returns:
        A ``pandas.DataFrame`` with columns ``filename`` and
        ``extracted``, one row per processed image, ordered by filename.
        ``extracted`` holds whatever line ``process_image`` returned
        first (``None`` when nothing matched).
    """
    image_suffixes = ('.jpg', '.png', '.jpeg')
    results = []
    # os.listdir returns entries in arbitrary order; sort so the output
    # rows are reproducible between runs and machines.
    for f in sorted(os.listdir(folder)):
        if not f.lower().endswith(image_suffixes):
            continue
        full = os.path.join(folder, f)
        # A directory can carry an image-like name; it is not an image.
        if not os.path.isfile(full):
            continue
        line, _, _ = process_image(full)
        results.append([f, line])
    return pd.DataFrame(results, columns=['filename', 'extracted'])


## 📊 Accuracy Evaluation

In [None]:
def calculate_accuracy(pred_df, truth_csv):
    """Score extracted lines against a ground-truth CSV by exact match.

    Predictions are left-joined to the ground truth on ``filename``, so
    every prediction row is kept; a prediction with no ground-truth row
    compares unequal and is counted as incorrect.

    Args:
        pred_df: DataFrame with ``filename`` and ``extracted`` columns.
        truth_csv: Path (or other ``pandas.read_csv`` input) of a CSV
            with ``filename`` and ``groundtruth`` columns.

    Returns:
        A tuple ``(accuracy, merged)``: the percentage (0-100) of rows
        whose ``extracted`` equals ``groundtruth``, and the joined
        DataFrame with an added boolean ``correct`` column. When there
        are no rows to score, the accuracy is ``0.0``.
    """
    truth = pd.read_csv(truth_csv)
    merged = pred_df.merge(truth, on='filename', how='left')
    merged['correct'] = merged['extracted'] == merged['groundtruth']
    # The mean of an empty column is NaN; report 0.0 for an empty batch
    # so callers always receive a usable number.
    if merged.empty:
        return 0.0, merged
    accuracy = merged['correct'].mean() * 100
    return accuracy, merged


## ▶ Example Usage

In [None]:
# Example:
# df = process_folder('/content/waybill_images')
# acc, merged_df = calculate_accuracy(df, 'groundtruth.csv')
# print('Accuracy:', acc)
# display(merged_df)
