# Tensor Titans Notebook 2 (Priyansh Jain) 
### For entity_name: weight, wattage, voltage & volume 
### OCR -> Regex -> Write to csv file

# Dependencies

In [1]:
# General
import re
import cv2
import time
import math
import torch
import numpy as np 
import matplotlib.pyplot as plt

# Data Extraction
import os
import sys
import zipfile
import requests
import pandas as pd
from pathlib import Path

# Images
from IPython import display
from PIL import Image

# PaddleOCR dpendencies
!pip install -q paddlepaddle-gpu==2.6.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device == "cpu": 
    !pip install -q paddlepaddle
!pip install -q paddleocr
# Setting up OCR
import paddle
from paddleocr import PaddleOCR, draw_ocr
OCR = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:10<00:00, 374kiB/s] 


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:17<00:00, 590kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:13<00:00, 159kiB/s] 

[2024/09/15 08:52:29] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_ch




In [2]:
# Report whether Paddle was built with CUDA, which device it is using,
# and which torch device was selected in the setup cell.
gpu_report = (
    ("OCR GPU Compile Check: ", paddle.device.is_compiled_with_cuda()),
    ("OCR on GPU check: ", paddle.device.get_device()),
    ("Current device: ", device),
)
for label, value in gpu_report:
    print(label, value)

OCR GPU Compile Check:  True
OCR on GPU check:  gpu:0
Current device:  cuda:0


# Full Process

## Helper Functions

In [3]:
def distance_point_to_line(px, py, x1, y1, x2, y2):
    """Return the perpendicular distance from point (px, py) to the line through (x1, y1) and (x2, y2).

    The line is treated as infinite (not as a segment). When the two defining
    points coincide there is no line, so the distance from (px, py) to that
    single point is returned instead of raising ZeroDivisionError.
    """
    dx = x2 - x1
    dy = y2 - y1
    length = math.hypot(dx, dy)
    if length == 0:
        # Degenerate "line": both endpoints are the same point.
        return math.hypot(px - x1, py - y1)
    # |cross product| of the direction vector and the point offset, divided by the line length
    return abs(dy * px - dx * py + x2 * y1 - y2 * x1) / length

def line_angle(x1, y1, x2, y2):
    """Return the unsigned angle, in degrees, between the line (x1, y1)-(x2, y2) and the horizontal axis.

    The sign of the direction is discarded, so the result lies in [0, 180].
    """
    rise = y2 - y1
    run = x2 - x1
    signed_degrees = math.degrees(math.atan2(rise, run))
    return abs(signed_degrees) % 360

def is_approximately_vertical(angle, lower_bound=75, upper_bound=105):
    """Return True when `angle` lies in the inclusive range [lower_bound, upper_bound] degrees."""
    at_or_above_lower = lower_bound <= angle
    return at_or_above_lower and angle <= upper_bound

def calculate_centroid(box):
    """Return the floor-divided mean (x, y) of the points in `box`.

    Each point is indexed as point[0] (x) and point[1] (y). The averages use
    floor division, so integer inputs give integer coordinates.
    """
    total_x = total_y = 0
    for vertex in box:
        total_x, total_y = total_x + vertex[0], total_y + vertex[1]
    count = len(box)
    return (total_x // count, total_y // count)

def extract_number(text):
    """Return the first run of decimal digits found in `text` as an int, or None if there is none."""
    first_digits = re.search(r'\d+', text)
    if first_digits is None:
        return None
    return int(first_digits.group())

# Example usage: sanity-check extract_number on a sample label string
text = "H: 16mm"
number = extract_number(text)
print(number)  # prints the first run of digits in `text` as an int

16


## Custom Regex

In [4]:
from io import BytesIO

# Function to format number
def format_number(num):
    """Format `num` with at most two decimals, dropping trailing zeros and a dangling decimal point.

    Examples: 100 -> '100', 10.5 -> '10.5', 3.14159 -> '3.14'.
    """
    fixed = '{:.2f}'.format(num)
    return fixed.rstrip('0').rstrip('.')

# Regex to extract Weight from given text 
def extract_weight(text):
    """Extract a weight reading such as '500 gram' from free text (e.g. OCR output).

    Every "<number><optional spaces><unit>" pair is classified by the prefix of
    its lower-cased unit. For each unit the largest number seen is kept, and the
    first unit present in this priority order is returned:
    kilogram, gram, ounce, pound, ton, milligram.

    Args:
        text: Any object; it is converted with str() before matching.

    Returns:
        "<value> <unit>" with the value formatted by format_number, or " "
        (a single space) when no recognised weight is found.

    NOTE(review): prefix matching is deliberately loose (any unit starting with
    'g' except 'ga...' counts as gram, any unit starting with 'o' as ounce), so
    ordinary words following a number can be picked up as units.
    """
    text = str(text)
    # Commas are treated as decimal separators ("1,5 kg" -> "1.5 kg").
    text = text.replace(',', '.')
    # The unit is either letters only, or the OCR misreading "0z" of "oz".
    # The zero is accepted only when followed by 'z', so a plain number is
    # never split into "<digits>" + "0".
    numbers = re.findall(r'(\d+\.?\d*)\s*(0z[a-zA-Z]*|[a-zA-Z]+)', text, re.IGNORECASE)

    if not numbers:
        return " "

    largest = {}  # unit name -> largest value seen for that unit
    for number, unit in numbers:
        num = float(number)
        unit = unit.lower()

        # First matching rule wins; abbreviations and spelled-out names are both accepted.
        if unit.startswith(('mg', 'milligram')):
            category = 'milligram'
        elif unit.startswith('g') and not unit.startswith('ga'):  # 'ga...' is gallon, not gram
            category = 'gram'
        elif unit.startswith(('kg', 'kilogram')):
            category = 'kilogram'
        elif unit.startswith(('o', '0')):  # '0' can only be the "0z" misread allowed by the regex
            category = 'ounce'
        elif unit.startswith(('lb', 'pound')):
            category = 'pound'
        elif unit.startswith('ton'):
            category = 'ton'
        else:
            continue

        if category not in largest or num > largest[category]:
            largest[category] = num

    for category in ('kilogram', 'gram', 'ounce', 'pound', 'ton', 'milligram'):
        if category in largest:
            return f"{format_number(largest[category])} {category}"
    return " "

# Regex to extract Wattage from given text 
def extract_wattage(text):
    """Extract a wattage reading such as '60.0 watt' from free text (e.g. OCR output).

    Every "<number><optional spaces><letters>" pair is classified by the prefix
    of its lower-cased unit. The largest number per unit is kept, and watt is
    preferred over kilowatt when both are present.

    Args:
        text: Any object; it is converted with str() before matching.

    Returns:
        "<float> watt" or "<float> kilowatt", or " " (a single space) when no
        wattage is found.

    NOTE(review): any unit starting with 'w' is still counted as watt, so words
    that follow a number and begin with 'w' can be picked up.
    """
    text = str(text)
    numbers = re.findall(r'(\d+\.?\d*)\s*([a-zA-Z]+)', text, re.IGNORECASE)

    if not numbers:
        return " "

    largest_watt = None
    largest_kilowatt = None

    for number, unit in numbers:
        num = float(number)
        unit = unit.lower()

        # Require 'kw'/'kilowatt' rather than a bare 'k', which would also
        # classify kg, kHz, kcal, kV, ... as kilowatt.
        if unit.startswith(('kw', 'kilowatt')):
            if largest_kilowatt is None or num > largest_kilowatt:
                largest_kilowatt = num
        elif unit.startswith('w'):
            if largest_watt is None or num > largest_watt:
                largest_watt = num

    if largest_watt is not None:
        return f"{largest_watt} watt"
    elif largest_kilowatt is not None:
        return f"{largest_kilowatt} kilowatt"
    else:
        return " "

# Regex to extract Voltage from given text 
def extract_voltage(text):
    """Extract a voltage reading such as '220.0 volt' from free text (e.g. OCR output).

    Every "<number><optional spaces><letters>" pair is classified by the prefix
    of its lower-cased unit ('mv', then 'v', then 'kv'). The largest number per
    unit is kept and the result is reported with priority
    kilovolt > volt > millivolt.

    Returns:
        "<float> <unit>", or " " (a single space) when nothing matches.
    """
    matches = re.findall(r'(\d+\.?\d*)\s*([a-zA-Z]+)', str(text), re.IGNORECASE)
    if not matches:
        return " "

    best = {}  # unit name -> largest value seen
    for raw_value, raw_unit in matches:
        value = float(raw_value)
        unit = raw_unit.lower()

        if unit.startswith('mv'):
            key = 'millivolt'
        elif unit.startswith('v'):
            key = 'volt'
        elif unit.startswith('kv'):
            key = 'kilovolt'
        else:
            continue

        if key not in best or value > best[key]:
            best[key] = value

    for key in ('kilovolt', 'volt', 'millivolt'):
        if key in best:
            return f"{best[key]} {key}"
    return " "
    
# Regex to extract Volume from given text 
def extract_volume(text):
    """Extract a volume reading such as '500.0 millilitre' from free text (e.g. OCR output).

    Every "<number><optional spaces><unit>" pair is classified by the prefix of
    its lower-cased unit, using the ordered rules below (first match wins). The
    largest number per category is kept and the first category present in the
    reporting order is returned: millilitre, ounce ("fluid ounce"), fl
    ("fluid ounce"), litre, cubic inch, gallon, pint, quart, decilitre, cup,
    cubic foot, imperial gallon.

    Args:
        text: Any object; it is converted with str() before matching.

    Returns:
        "<float> <unit>", or " " (a single space) when no volume is found.

    NOTE(review): the prefix rules are loose ('p' also matches "pcs"/"pack",
    'cu' also matches "cubic", 'd' also matches "days"), so the result is a
    heuristic guess rather than a validated measurement.
    """
    text = str(text)
    # The unit is letters/quote marks, or the OCR misreading "0z" of "oz".
    # A bare '0' is NOT allowed in the unit: otherwise the regex backtracks and
    # splits a plain number such as "100" into value "10" and unit "0".
    numbers = re.findall(r'(\d+\.?\d*)\s*(0z[a-zA-Z]*|[a-zA-Z"\']+)', text, re.IGNORECASE)

    if not numbers:
        return " "

    # Ordered (prefixes, category) rules; the first rule whose prefix matches wins.
    prefix_rules = (
        (('cu',), 'cup'),
        (('in', '"'), 'cubic_inch'),      # '"' is the inch mark
        (('o', '0'), 'ounce'),            # '0' only arises from the "0z" misread
        (('fl',), 'fluid_ounce'),
        (("'", 'f'), 'cubic_foot'),       # "'" is the foot mark; 'f' covers ft/feet
        (('d',), 'decilitre'),
        (('ga',), 'gallon'),
        (('im',), 'imperial_gallon'),
        (('ml',), 'millilitre'),
        (('l',), 'litre'),
        (('p',), 'pint'),
        (('q',), 'quart'),
    )
    # (category, label printed in the result), in reporting priority order.
    report_order = (
        ('millilitre', 'millilitre'),
        ('ounce', 'fluid ounce'),
        ('fluid_ounce', 'fluid ounce'),
        ('litre', 'litre'),
        ('cubic_inch', 'cubic inch'),
        ('gallon', 'gallon'),
        ('pint', 'pint'),
        ('quart', 'quart'),
        ('decilitre', 'decilitre'),
        ('cup', 'cup'),
        ('cubic_foot', 'cubic foot'),
        ('imperial_gallon', 'imperial gallon'),
    )

    largest = {}  # category -> largest value seen
    for number, unit in numbers:
        num = float(number)
        unit = unit.lower()
        for prefixes, category in prefix_rules:
            if unit.startswith(prefixes):
                if category not in largest or num > largest[category]:
                    largest[category] = num
                break

    for category, label in report_order:
        if category in largest:
            return f"{largest[category]} {label}"
    return " "
    
# Performing OCR on the image
def compute_dimension(url, extractor=None, timeout=30):
    """Download one image, run OCR on it and extract a value from the recognised text.

    Args:
        url: Address of the image to download.
        extractor: Callable taking the joined OCR text and returning the
            extracted value. Defaults to extract_wattage, which is what this
            function has always called.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The extractor's result, or None when the image cannot be downloaded or
        decoded.

    Side effects:
        Prints the elapsed wall-clock time, and an error line on download or
        decode failure.
    """
    start = time.time()
    time_mode = 2  # 2 -> print the total elapsed time at the end

    if extractor is None:
        extractor = extract_wattage

    ############# EXTRACT OCR ####################
    # Read image from URL. np.array() never returns None, so failures are
    # detected through exceptions (network errors, HTTP error status,
    # undecodable image data) rather than a None check.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img_BRG = np.array(img)
    except (requests.RequestException, OSError) as exc:
        print(f"Error: Image not found at {url}")
        print(f"  cause: {exc!r}")
        return None

    # No marker is drawn on the image: a filled circle at the centre would be
    # part of the pixels handed to OCR and could hide text. Dropping it also
    # removes the 3-way shape unpack that failed on 2-D (grayscale) arrays.

    # Perform OCR on the image
    result = OCR.ocr(img_BRG, cls=True)
    ocr_text = " "
    # Combine all OCR texts into a single string; guard against an empty or
    # None first page, which cannot be iterated.
    if result and result[0]:
        ocr_text = ' '.join([line[1][0] for line in result[0]])

    # Extract the requested value (wattage by default) from the OCR text
    value = extractor(ocr_text)

    if time_mode == 2:
        end = time.time()
        print(f"\nTIME FOR FULL PROCESS: {end-start}\n")

    return value

# Automating / Running on csv file

In [6]:
import pandas as pd
import requests
from PIL import Image
import numpy as np
from io import BytesIO
# `OCR` is the PaddleOCR instance created in the dependencies cell at the top; it is not an importable module.

# extract_voltage and extract_volume (used below) are defined in the "Custom Regex" cell above.

def process_csv(file_path, output_path='updated_file.csv', extractors=None, timeout=30):
    """Run OCR-based value extraction over every row of a CSV and save the result.

    The CSV must contain the columns 'entity_name' and 'image_link'. A new
    column 'entity_value' is added and the frame is written to `output_path`.

    Args:
        file_path: Path of the input CSV.
        output_path: Where the updated CSV is written.
        extractors: Mapping of entity_name -> callable(ocr_text) -> str. Rows
            whose entity_name is not in the mapping are not downloaded or
            OCR'd and get " ". Defaults to voltage -> extract_voltage and
            item_volume -> extract_volume.
        timeout: Seconds to wait for each image download.

    Side effects:
        Downloads one image per handled row, prints a line for each failed
        download or OCR run, writes `output_path`, and prints a completion
        message.
    """
    if extractors is None:
        extractors = {
            'voltage': extract_voltage,
            'item_volume': extract_volume,
        }

    df = pd.read_csv(file_path)

    def get_entity_value(row):
        """Return the extracted value for one row, or " " when nothing can be extracted."""
        extractor = extractors.get(row['entity_name'])
        if extractor is None:
            return " "  # Return empty for other entity types

        url = row['image_link']
        # A single bad URL or corrupt image must not abort the whole run, so
        # download/decode failures are reported and yield an empty value.
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            img_BRG = np.array(img)
        except (requests.RequestException, OSError) as exc:
            print(f"Image download failed for {url}: {exc!r}")
            return " "

        # Perform OCR on the image
        result = OCR.ocr(img_BRG, cls=True)

        # Nothing recognised: result is empty or its first page is empty/None
        if not (result and result[0]):
            print(f"OCR failed for {url}")
            return " "

        # Combine all non-empty OCR texts into a single string
        ocr_text = ' '.join([line[1][0] for line in result[0] if line[1][0]])
        return extractor(ocr_text)

    # Apply the function to each row
    df['entity_value'] = df.apply(get_entity_value, axis=1)

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_path, index=False)
    print(f"CSV processing complete. Updated file saved as '{output_path}'.")

# Example usage: process the test CSV at this hard-coded path; the result is
# written to 'updated_file.csv' in the current working directory.
process_csv('/kaggle/input/amazonmltest/test.csv')


[2024/09/15 08:52:34] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.8579068183898926
[2024/09/15 08:52:34] ppocr DEBUG: cls num  : 15, elapsed : 0.10273575782775879
[2024/09/15 08:52:34] ppocr DEBUG: rec_res num  : 15, elapsed : 0.10658025741577148
[2024/09/15 08:52:34] ppocr DEBUG: dt_boxes num : 10, elapsed : 0.036427974700927734
[2024/09/15 08:52:34] ppocr DEBUG: cls num  : 10, elapsed : 0.025481462478637695
[2024/09/15 08:52:34] ppocr DEBUG: rec_res num  : 10, elapsed : 0.07198715209960938
[2024/09/15 08:52:34] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.024914979934692383
[2024/09/15 08:52:34] ppocr DEBUG: cls num  : 7, elapsed : 0.02858591079711914
[2024/09/15 08:52:34] ppocr DEBUG: rec_res num  : 7, elapsed : 0.03648710250854492
[2024/09/15 08:52:35] ppocr DEBUG: dt_boxes num : 4, elapsed : 0.04699563980102539
[2024/09/15 08:52:35] ppocr DEBUG: cls num  : 4, elapsed : 0.007620334625244141
[2024/09/15 08:52:35] ppocr DEBUG: rec_res num  : 4, elapsed : 0.01738882064819336
[2024/09/15