In [4]:
!pip install paddlepaddle-gpu paddleocr

[0m[31mERROR: Could not find a version that satisfies the requirement paddlepaddle-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for paddlepaddle-gpu[0m[31m
[0m

In [1]:
#Importing the neccesary dependencies and required libraries
import pandas as pd
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
import re
import numpy as np
import cv2
from paddleocr import PaddleOCR
import math
import logging

# Configure logging for detailed debug information
logging.basicConfig(level=logging.DEBUG)

# Unit normalization map to replace shorthand units with full forms
unit_normalization_map = {
    'g': 'gram',
    'kg': 'kilogram',
    'lbs': 'pound',
    'lb': 'pound',
    'ounces': 'ounce',
    'oz': 'ounce',
    'ml': 'milliliter',
    'l': 'liter',
    'cup': 'cup',
    'v': 'volt',
    'w': 'watt',
    'cm': 'centimeter',
    'mm': 'millimeter',
    'inches': 'inch',
    'm': 'meter',
    'km': 'kilometer',
    'foot': 'foot',
    'feet': 'foot'
}

# Reading CSV file
def read_csv(file_path):
    logging.info(f"Reading CSV file: {file_path}")
    return pd.read_csv(file_path).head(20)

# Normalising the units using the mapping provided above
def normalize_unit(unit):
    normalized = unit_normalization_map.get(unit.lower(), unit)
    logging.debug(f"Normalized unit '{unit}' to '{normalized}'")
    return normalized

# Extracting numeric values without changing the format
def format_value(value):
    try:
        # logging.debug(f"Extracted value: {value}")
        return str(value)
    except ValueError:
        # logging.error(f"ValueError encountered with value: {value}")
        return value

# Preprocessing the image to enhance OCR accuracy
def preprocess_image(image):
    # logging.info("Preprocessing image for OCR")
    image = resize_image(image)
    image = enhance_contrast(image)
    image = binarize_image(image)
    image = denoise_image(image)
    image = adaptive_threshold(image)
    return image

# Resizing the image for better OCR accuracy
def resize_image(image, scale_factor=1.5):
    width, height = image.size
    resized_image = image.resize((int(width * scale_factor), int(height * scale_factor)), Image.Resampling.LANCZOS)
    # logging.debug("Resized image for better OCR accuracy")
    return resized_image

# Enhancing the contrast to improve text visibility
def enhance_contrast(image):
    enhancer = ImageEnhance.Contrast(image)
    enhanced_image = enhancer.enhance(2)
    # logging.debug("Enhanced image contrast")
    return enhanced_image

# Binarizing the image to simplify OCR (black-and-white)
def binarize_image(image):
    gray = image.convert('L')
    binary = gray.point(lambda x: 0 if x < 128 else 255, '1')
    # logging.debug("Binarized image for OCR")
    return binary

# Denoising the image using Gaussian blur
def denoise_image(image):
    image_np = np.array(image.convert('L'))
    denoised_image = cv2.GaussianBlur(image_np, (5, 5), 0)
    # logging.debug("Applied Gaussian blur to denoise image")
    return Image.fromarray(denoised_image)

# Apply adaptive thresholding to the image
def adaptive_threshold(image):
    image_np = np.array(image.convert('L'))
    thresh_image = cv2.adaptiveThreshold(image_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    # logging.debug("Applied adaptive thresholding to image")
    return Image.fromarray(thresh_image)

# Perform OCR with PaddleOCR
def ocr_paddleocr(image, ocr_instance):
    try:
        result = ocr_instance.ocr(np.array(image), cls=True)
        logging.debug(f"OCR result: {result}")
        print(result)
    except Exception as e:
        logging.error(f"OCR processing failed: {e}")
        return ""

    if not result or not isinstance(result, list):
        # logging.warning("No OCR result found or invalid result format")
        return ""

    extracted_text = []
    for block in result:
        if isinstance(block, list) and len(block) > 0:
            for line in block:
                if isinstance(line, list) and len(line) > 1:
                    text = line[1][0]
                    extracted_text.append(text)

    extracted_text_str = "\n".join(extracted_text)
    logging.debug(f"Extracted text: {extracted_text_str}")
    return extracted_text_str if extracted_text else ""

# Running OCR on the image
def run_ocr(image, ocr_instance):
    # logging.info("Running OCR on preprocessed image")
    paddleocr_text = ocr_paddleocr(image, ocr_instance)
    print(paddleocr_text)
    return {'text': paddleocr_text}

# Extracting information using OCR and regex
def extract_info(image_url, entity_name, ocr_instance, cache):
    if image_url in cache:
        # logging.info(f"Using cached OCR result for URL: {image_url}")
        cleaned_text = cache[image_url]
    else:
        try:
            response = requests.get(image_url)
            img = Image.open(BytesIO(response.content))
            logging.info(f"Loaded image from URL: {image_url}")
        except Exception as e:
            logging.error(f"Failed to load image from {image_url}: {e}")
            return ""

        # Preprocessing the image
        img = preprocess_image(img)

        # Performing OCR
        ocr_results = run_ocr(img, ocr_instance)
        print(ocr_results)
        
        cleaned_text = ocr_results['text']
        print(cleaned_text)

        # Cache the OCR result to improve the prediction times
        cache[image_url] = cleaned_text

    # Defining regex patterns
    regex_patterns = {
        'item_weight': r'(\d+(?:\.\d+)?)\s*(g|kg|lb?|ounce|gram|pound?)',
        'item_volume': r'(\d+(?:\.\d+)?)\s*(ml|l|fl\.?\s*oz|cup|liter|millilitre|gallon?)',
        'voltage': r'(\d+(?:\.\d+)?)\s*(v|volt|mv|millivolt)',
        'wattage': r'(\d+(?:\.\d+)?)\s*(w|watt|mw|milliwatt)',
        'maximum_weight_recommendation': r'(\d+(?:\.\d+)?)\s*(g|kg|lb?|ounce|gram|pound?)',
        'height': r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)',
        'depth': r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)',
        'width': r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)',
    }

    pattern = regex_patterns.get(entity_name)
    if pattern:
        match = re.search(pattern, cleaned_text, re.IGNORECASE)
        if match:
            value = format_value(match.group(1))  # Keep the value as is
            unit = normalize_unit(match.group(2))
            logging.info(f"Extracted entity {entity_name}: {value} {unit}")
            return f"{value} {unit}"

    # logging.warning(f"No match found for entity {entity_name}")
    return ""

# Predicting entity values and save results to a CSV file
def predict_and_save(csv_data, output_file_path, ocr_instance):
    predictions = []
    total_images = len(csv_data)
    cache = {}  # Cache for storing OCR results based on URL

    for index, row in csv_data.iterrows():
        image_url = row['image_link']
        entity_name = row['entity_name']

        predicted_value = extract_info(image_url, entity_name, ocr_instance, cache)

        if pd.isna(predicted_value) or predicted_value == "":
            predicted_value = ""

        # Printing the current image index and how many are left
        # logging.info(f"Processing image {index + 1} of {total_images}, Predicted Value: {predicted_value}")
        print(f"Processing image {index + 1} of {total_images}...")

        predictions.append({'index': index, 'prediction': predicted_value})

    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(output_file_path, index=False)
    # logging.info(f"Predicted values saved to {output_file_path}")

# Main function to load data and predict entity values
def main():
    csv_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test.csv'
    output_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test_out_sample.csv'

    csv_data = read_csv(csv_file_path)

    # Initialising PaddleOCR once
    ocr_instance = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

    # Predicting entity values and save them to the output CSV
    predict_and_save(csv_data, output_file_path, ocr_instance)

if __name__ == "__main__":
    main()


[2024/09/24 15:04:45] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/hammadkhan/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/hammadkhan/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, m

In [6]:
pip install transformers torch


Note: you may need to restart the kernel to use updated packages.


In [7]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import functional as F

# Load pre-trained BERT tokenizer and model for sequence classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Labels: 0 = height, 1 = width, 2 = depth/length
label_map = {0: 'height', 1: 'width', 2: 'depth'}

# Sample dimension keywords to classify
dimension_keywords = ['height', 'h', 'width', 'w', 'length', 'l', 'depth', 'd', 'size']

def classify_dimension(text):
    # Tokenize input text and convert to tensor
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Get model outputs (logits)
    outputs = model(**inputs)
    
    # Apply softmax to get probabilities
    probabilities = F.softmax(outputs.logits, dim=1)
    
    # Get the label with the highest probability
    predicted_label = torch.argmax(probabilities, dim=1).item()
    
    # Convert label to corresponding dimension (height, width, depth)
    return label_map[predicted_label]

# Example text extracted from an image
extracted_text = "This product is 42cm in height, with a width of 50cm and depth of 30cm."

# Split text into sentences for classification
sentences = extracted_text.split(',')

# Loop through sentences and classify each one
for sentence in sentences:
    dimension_class = classify_dimension(sentence)
    print(f"Sentence: '{sentence}' classified as: {dimension_class}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentence: 'This product is 42cm in height' classified as: height
Sentence: ' with a width of 50cm and depth of 30cm.' classified as: width


In [8]:
import pandas as pd
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
import re
import numpy as np
import cv2
from paddleocr import PaddleOCR
import logging
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import functional as F

# Configure logging for detailed debug information
logging.basicConfig(level=logging.DEBUG)

# Unit normalization map to replace shorthand units with full forms
unit_normalization_map = {
    'g': 'gram',
    'kg': 'kilogram',
    'lbs': 'pound',
    'lb': 'pound',
    'ounces': 'ounce',
    'oz': 'ounce',
    'ml': 'milliliter',
    'l': 'liter',
    'cup': 'cup',
    'v': 'volt',
    'w': 'watt',
    'cm': 'centimeter',
    'mm': 'millimeter',
    'inches': 'inch',
    'm': 'meter',
    'km': 'kilometer',
    'foot': 'foot',
    'feet': 'foot'
}

# Pre-trained BERT model for dimension classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Labels: 0 = height, 1 = width, 2 = depth
label_map = {0: 'height', 1: 'width', 2: 'depth'}

# Reading CSV file
def read_csv(file_path):
    logging.info(f"Reading CSV file: {file_path}")
    return pd.read_csv(file_path).head(20)

# Normalising the units using the mapping provided above
def normalize_unit(unit):
    normalized = unit_normalization_map.get(unit.lower(), unit)
    logging.debug(f"Normalized unit '{unit}' to '{normalized}'")
    return normalized

# Extracting numeric values without changing the format
def format_value(value):
    try:
        return str(value)
    except ValueError:
        return value

# Preprocessing the image to enhance OCR accuracy
def preprocess_image(image):
    image = resize_image(image)
    image = enhance_contrast(image)
    image = binarize_image(image)
    image = denoise_image(image)
    image = adaptive_threshold(image)
    return image

def resize_image(image, scale_factor=1.5):
    width, height = image.size
    resized_image = image.resize((int(width * scale_factor), int(height * scale_factor)), Image.Resampling.LANCZOS)
    return resized_image

def enhance_contrast(image):
    enhancer = ImageEnhance.Contrast(image)
    enhanced_image = enhancer.enhance(2)
    return enhanced_image

def binarize_image(image):
    gray = image.convert('L')
    binary = gray.point(lambda x: 0 if x < 128 else 255, '1')
    return binary

def denoise_image(image):
    image_np = np.array(image.convert('L'))
    denoised_image = cv2.GaussianBlur(image_np, (5, 5), 0)
    return Image.fromarray(denoised_image)

def adaptive_threshold(image):
    image_np = np.array(image.convert('L'))
    thresh_image = cv2.adaptiveThreshold(image_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return Image.fromarray(thresh_image)

# Perform OCR with PaddleOCR
def ocr_paddleocr(image, ocr_instance):
    try:
        result = ocr_instance.ocr(np.array(image), cls=True)
        logging.debug(f"OCR result: {result}")
    except Exception as e:
        logging.error(f"OCR processing failed: {e}")
        return ""

    if not result or not isinstance(result, list):
        return ""

    extracted_text = []
    for block in result:
        if isinstance(block, list) and len(block) > 0:
            for line in block:
                if isinstance(line, list) and len(line) > 1:
                    text = line[1][0]
                    extracted_text.append(text)

    extracted_text_str = "\n".join(extracted_text)
    logging.debug(f"Extracted text: {extracted_text_str}")
    return extracted_text_str if extracted_text else ""

# Running OCR on the image
def run_ocr(image, ocr_instance):
    paddleocr_text = ocr_paddleocr(image, ocr_instance)
    return {'text': paddleocr_text}

# Classify text to determine dimension type
def classify_dimension(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return label_map[predicted_label]

# Extracting information using OCR and NLP-based dimension classification
def extract_info(image_url, entity_name, ocr_instance, cache):
    if image_url in cache:
        cleaned_text = cache[image_url]
    else:
        try:
            response = requests.get(image_url)
            img = Image.open(BytesIO(response.content))
            logging.info(f"Loaded image from URL: {image_url}")
        except Exception as e:
            logging.error(f"Failed to load image from {image_url}: {e}")
            return ""

        img = preprocess_image(img)
        ocr_results = run_ocr(img, ocr_instance)
        cleaned_text = ocr_results['text']
        cache[image_url] = cleaned_text

    sentences = cleaned_text.split('\n')
    dimension_map = {'height': None, 'width': None, 'depth': None}

    for sentence in sentences:
        dimension_class = classify_dimension(sentence)

        match = re.search(r'(\d+\.?\d*)\s*(cm|mm|inch|feet|m)', sentence, re.IGNORECASE)
        if match:
            value = match.group(1)
            unit = normalize_unit(match.group(2))
            dimension_map[dimension_class] = f"{value} {unit}"

    if entity_name in dimension_map and dimension_map[entity_name]:
        return dimension_map[entity_name]
    
    return ""

# Predicting entity values and save results to a CSV file
def predict_and_save(csv_data, output_file_path, ocr_instance):
    predictions = []
    total_images = len(csv_data)
    cache = {}

    for index, row in csv_data.iterrows():
        image_url = row['image_link']
        entity_name = row['entity_name']

        predicted_value = extract_info(image_url, entity_name, ocr_instance, cache)

        if pd.isna(predicted_value) or predicted_value == "":
            predicted_value = ""

        print(f"Processing image {index + 1} of {total_images}...")

        predictions.append({'index': index, 'prediction': predicted_value})

    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(output_file_path, index=False)

# Main function to load data and predict entity values
def main():
    csv_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test.csv'
    output_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test_out_sample.csv'

    csv_data = read_csv(csv_file_path)
    ocr_instance = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    predict_and_save(csv_data, output_file_path, ocr_instance)

if __name__ == "__main__":
    main()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[2024/09/24 15:04:17] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/hammadkhan/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/hammadkhan/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, m

In [None]:
import pandas as pd
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
import re
import numpy as np
import cv2
from paddleocr import PaddleOCR
import logging
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn import functional as F

# Configure logging for detailed debug information
logging.basicConfig(level=logging.DEBUG)

# Entity-unit mapping for validation
entity_unit_map = {
    'width': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'depth': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'height': {'centimetre', 'cm', 'foot', 'ft', 'inch', 'in', 'metre', 'm', 'millimetre', 'mm', 'yard', 'yd'},
    'item_weight': {'gram', 'g', 'kilogram', 'kg', 'microgram', 'μg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton'},
    'maximum_weight_recommendation': {'gram', 'g', 'kilogram', 'kg', 'microgram', 'μg', 'milligram', 'mg', 'ounce', 'oz', 'pound', 'lb', 'ton'},
    'voltage': {'kilovolt', 'kV', 'millivolt', 'mV', 'volt', 'V'},
    'wattage': {'kilowatt', 'kW', 'watt', 'W'},
    'item_volume': {'centilitre', 'cl', 'cubic foot', 'cu ft', 'cubic inch', 'cu in', 'cup', 'decilitre', 'dl', 'fluid ounce', 'fl oz', 'gallon', 'gal', 'imperial gallon', 'imp gal', 'litre', 'l', 'microlitre', 'μl', 'millilitre', 'ml', 'pint', 'pt', 'quart', 'qt'},
    'frequency': {'hertz', 'Hz'},  # Including 'Hz' for frequency
    'current': {'ampere', 'A'},  # Including 'A' for current
}


# Set of all allowed units
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

# Pre-trained BERT model for dimension classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6,)

# Labels: 0 = height, 1 = width, 2 = depth
label_map = {0: 'height', 1: 'width', 2: 'depth', 3: 'weight',
    4: 'voltage',
    5: 'current',}

# Reading CSV file
def read_csv(file_path):
    logging.info(f"Reading CSV file: {file_path}")
    return pd.read_csv(file_path).head(20)

# Normalizing units using the mapping and allowed units
def normalize_unit(unit):
    normalized = unit_normalization_map.get(unit.lower(), unit)
    if normalized in allowed_units:
        logging.debug(f"Normalized unit '{unit}' to '{normalized}'")
        return normalized
    else:
        logging.warning(f"Invalid unit '{unit}' found.")
        return None

# Extracting numeric values without changing the format
def format_value(value):
    try:
        return str(value)
    except ValueError:
        return value

# Preprocessing the image to enhance OCR accuracy
def preprocess_image(image):
    image = resize_image(image)
    image = enhance_contrast(image)
    image = binarize_image(image)
    image = denoise_image(image)
    image = adaptive_threshold(image)
    return image

def resize_image(image, scale_factor=1.5):
    width, height = image.size
    resized_image = image.resize((int(width * scale_factor), int(height * scale_factor)), Image.Resampling.LANCZOS)
    return resized_image

def enhance_contrast(image):
    enhancer = ImageEnhance.Contrast(image)
    enhanced_image = enhancer.enhance(2)
    return enhanced_image

def binarize_image(image):
    gray = image.convert('L')
    binary = gray.point(lambda x: 0 if x < 128 else 255, '1')
    return binary

def denoise_image(image):
    image_np = np.array(image.convert('L'))
    denoised_image = cv2.GaussianBlur(image_np, (5, 5), 0)
    return Image.fromarray(denoised_image)

def adaptive_threshold(image):
    image_np = np.array(image.convert('L'))
    thresh_image = cv2.adaptiveThreshold(image_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return Image.fromarray(thresh_image)

# Perform OCR with PaddleOCR
def ocr_paddleocr(image, ocr_instance):
    try:
        result = ocr_instance.ocr(np.array(image), cls=True)
        logging.debug(f"OCR result: {result}")
    except Exception as e:
        logging.error(f"OCR processing failed: {e}")
        return ""

    if not result or not isinstance(result, list):
        return ""

    extracted_text = []
    for block in result:
        if isinstance(block, list) and len(block) > 0:
            for line in block:
                if isinstance(line, list) and len(line) > 1:
                    text = line[1][0]
                    extracted_text.append(text)

    extracted_text_str = "\n".join(extracted_text)
    logging.debug(f"Extracted text: {extracted_text_str}")
    return extracted_text_str if extracted_text else ""

# Running OCR on the image
def run_ocr(image, ocr_instance):
    paddleocr_text = ocr_paddleocr(image, ocr_instance)
    return {'text': paddleocr_text}

# Classify text to determine dimension type
def classify_dimension(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    probabilities = F.softmax(outputs.logits, dim=1)
    predicted_label = torch.argmax(probabilities, dim=1).item()
    return label_map[predicted_label]

# Extracting information using OCR, entity-unit mapping, and NLP-based dimension classification
# Extracting information using OCR, entity-unit mapping, and NLP-based dimension classification
def extract_info(image_url, entity_name, ocr_instance, cache):
    if image_url in cache:
        cleaned_text = cache[image_url]
    else:
        try:
            response = requests.get(image_url)
            img = Image.open(BytesIO(response.content))
            logging.info(f"Loaded image from URL: {image_url}")
        except Exception as e:
            logging.error(f"Failed to load image from {image_url}: {e}")
            return ""

        img = preprocess_image(img)
        ocr_results = run_ocr(img, ocr_instance)
        cleaned_text = ocr_results['text']
        cache[image_url] = cleaned_text

    sentences = cleaned_text.split('\n')
    dimension_map = {label: None for label in label_map.values()}  # Dynamically handles all dimensions

    for sentence in sentences:
        # Classify the dimension for each sentence
        dimension_class = classify_dimension(sentence)

        # Use regex to find value and unit
        match = re.search(r'(\d+\.?\d*)\s*(\w+)', sentence, re.IGNORECASE)
        if match:
            value = match.group(1)
            unit = normalize_unit(match.group(2))

            # Check if the detected unit is valid for the classified dimension
            if unit and unit in entity_unit_map.get(dimension_class, {}):
                dimension_map[dimension_class] = f"{value} {unit}"

    if entity_name in dimension_map and dimension_map[entity_name]:
        return dimension_map[entity_name]

    return ""

# Predicting entity values and save results to a CSV file
def predict_and_save(csv_data, output_file_path, ocr_instance):
    predictions = []
    total_images = len(csv_data)
    cache = {}

    for index, row in csv_data.iterrows():
        image_url = row['image_link']
        entity_name = row['entity_name']

        predicted_value = extract_info(image_url, entity_name, ocr_instance, cache)

        if pd.isna(predicted_value) or predicted_value == "":
            predicted_value = ""

        print(f"Processing image {index + 1} of {total_images}...")

        predictions.append({'index': index, 'prediction': predicted_value})

    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(output_file_path, index=False)

# Main function to load data and predict entity values
def main():
    csv_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test.csv'
    output_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test_out_sample.csv'

    csv_data = read_csv(csv_file_path)
    ocr_instance = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)
    predict_and_save(csv_data, output_file_path, ocr_instance)

if __name__ == "__main__":
    main()


In [None]:
# Importing the necessary dependencies and required libraries
import pandas as pd
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
import re
import numpy as np
import cv2
from paddleocr import PaddleOCR
import logging

# Configure logging for detailed debug information
logging.basicConfig(level=logging.DEBUG)

# Unit normalization map to replace shorthand units with full forms
unit_normalization_map = {
    'g': 'gram',
    'kg': 'kilogram',
    'lbs': 'pound',
    'lb': 'pound',
    'ounces': 'ounce',
    'oz': 'ounce',
    'ml': 'milliliter',
    'l': 'liter',
    'cup': 'cup',
    'v': 'volt',
    'w': 'watt',
    'cm': 'centimeter',
    'mm': 'millimeter',
    'inches': 'inch',
    'm': 'meter',
    'km': 'kilometer',
    'foot': 'foot',
    'feet': 'foot'
}

# Reading CSV file
def read_csv(file_path):
    logging.info(f"Reading CSV file: {file_path}")
    return pd.read_csv(file_path).tail(100)

# Normalising the units using the mapping provided above
def normalize_unit(unit):
    normalized = unit_normalization_map.get(unit.lower(), unit)
    logging.debug(f"Normalized unit '{unit}' to '{normalized}'")
    return normalized

# Extracting numeric values without changing the format
def format_value(value):
    try:
        return str(value)
    except ValueError:
        return value

# Preprocessing the image to enhance OCR accuracy
def preprocess_image(image):
    image = resize_image(image)
    image = enhance_contrast(image)
    image = binarize_image(image)
    image = denoise_image(image)
    image = adaptive_threshold(image)
    return image

# Resizing the image for better OCR accuracy
def resize_image(image, scale_factor=1.5):
    width, height = image.size
    resized_image = image.resize((int(width * scale_factor), int(height * scale_factor)), Image.Resampling.LANCZOS)
    return resized_image

# Enhancing the contrast to improve text visibility
def enhance_contrast(image):
    enhancer = ImageEnhance.Contrast(image)
    enhanced_image = enhancer.enhance(2)
    return enhanced_image

# Binarizing the image to simplify OCR (black-and-white)
def binarize_image(image):
    gray = image.convert('L')
    binary = gray.point(lambda x: 0 if x < 128 else 255, '1')
    return binary

# Denoising the image using Gaussian blur
def denoise_image(image):
    image_np = np.array(image.convert('L'))
    denoised_image = cv2.GaussianBlur(image_np, (5, 5), 0)
    return Image.fromarray(denoised_image)

# Apply adaptive thresholding to the image
def adaptive_threshold(image):
    image_np = np.array(image.convert('L'))
    thresh_image = cv2.adaptiveThreshold(image_np, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    return Image.fromarray(thresh_image)

# Perform OCR with PaddleOCR
def ocr_paddleocr(image, ocr_instance):
    try:
        result = ocr_instance.ocr(np.array(image), cls=True)
        logging.debug(f"OCR result: {result}")
    except Exception as e:
        logging.error(f"OCR processing failed: {e}")
        return ""

    if not result or not isinstance(result, list):
        return ""

    extracted_text = []
    for block in result:
        if isinstance(block, list) and len(block) > 0:
            for line in block:
                if isinstance(line, list) and len(line) > 1:
                    text = line[1][0]
                    extracted_text.append(text)

    extracted_text_str = " ".join(extracted_text)  # Joining extracted text into a single line
    logging.debug(f"Extracted text: {extracted_text_str}")
    return extracted_text_str if extracted_text else ""

# Running OCR on the image
def run_ocr(image, ocr_instance):
    paddleocr_text = ocr_paddleocr(image, ocr_instance)
    return {'text': paddleocr_text}

# Extracting information using OCR and regex
def extract_info(image_url, entity_name, ocr_instance, cache):
    if image_url in cache:
        cleaned_text = cache[image_url]
    else:
        try:
            response = requests.get(image_url)
            img = Image.open(BytesIO(response.content))
            logging.info(f"Loaded image from URL: {image_url}")
        except Exception as e:
            logging.error(f"Failed to load image from {image_url}: {e}")
            return "", ""

        # Preprocessing the image
        img = preprocess_image(img)

        # Performing OCR
        ocr_results = run_ocr(img, ocr_instance)
        cleaned_text = ocr_results['text']
        cache[image_url] = cleaned_text

    # Defining regex patterns
    regex_patterns = {
        'item_weight': r'(\d+(?:\.\d+)?)\s*(g|kg|lb?|ounce|gram|pound?)',
        'item_volume': r'(\d+(?:\.\d+)?)\s*(ml|l|fl\.?\s*oz|cup|liter|millilitre|gallon?)',
        'voltage': r'(\d+(?:\.\d+)?)\s*(v|volt|mv|millivolt)',
        'wattage': r'(\d+(?:\.\d+)?)\s*(w|watt|mw|milliwatt)',
        'maximum_weight_recommendation': r'(\d+(?:\.\d+)?)\s*(g|kg|lb?|ounce|gram|pound?)',
        'height': r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)',
        'depth': r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)',
        'width': r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)',
    }

    pattern = regex_patterns.get(entity_name)
    if pattern:
        match = re.search(pattern, cleaned_text, re.IGNORECASE)
        if match:
            value = format_value(match.group(1))
            unit = normalize_unit(match.group(2))
            logging.info(f"Extracted entity {entity_name}: {value} {unit}")
            return cleaned_text, f"{value} {unit}"

    return cleaned_text, ""

# Predicting entity values and save results to a CSV file
def predict_and_save(csv_data, output_file_path, ocr_instance):
    predictions = []
    cache = {}

    for index, row in csv_data.iterrows():
        image_url = row['image_link']
        entity_name = row['entity_name']
        entity_value = row['entity_value']  # Keep the original entity_value

        # Extract OCR text and predicted value
        extracted_text, predicted_value = extract_info(image_url, entity_name, ocr_instance, cache)

        if pd.isna(predicted_value) or predicted_value == "":
            predicted_value = entity_value  # Use original entity_value if prediction is unavailable

        # Append the results
        predictions.append({
            'extracted_text': extracted_text,  # First column with single-line extracted text
            'entity_name': entity_name,  # Keep entity_name as it is
            'entity_value': entity_value  # Keep entity_value as it is
        })

        logging.info(f"Processed {index + 1} / {len(csv_data)}")

    # Save the results to a CSV file
    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(output_file_path, index=False)
    logging.info(f"Predicted values and extracted texts saved to {output_file_path}")

# Main function to load data and predict entity values
def main():
    csv_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/train.csv'
    output_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts.csv'

    csv_data = read_csv(csv_file_path)

    # Initialize PaddleOCR once
    ocr_instance = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

    # Predicting entity values and save them to the output CSV
    predict_and_save(csv_data, output_file_path, ocr_instance)

if __name__ == "__main__":
    main()


OPTIMIZED APPROACH TO EXTRACT TEXTS

In [None]:
import pandas as pd
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
import re
import numpy as np
import cv2
from paddleocr import PaddleOCR
import logging

# Configure logging for detailed debug information
logging.basicConfig(level=logging.INFO)

# Unit normalization map to replace shorthand units with full forms
unit_normalization_map = {
    'g': 'gram',
    'kg': 'kilogram',
    'lbs': 'pound',
    'lb': 'pound',
    'ounces': 'ounce',
    'oz': 'ounce',
    'ml': 'milliliter',
    'l': 'liter',
    'v': 'volt',
    'w': 'watt',
    'cm': 'centimeter',
    'mm': 'millimeter',
    'inches': 'inch',
    'm': 'meter',
    'km': 'kilometer',
    'foot': 'foot',
    'feet': 'foot'
}

# Pre-compiled regex patterns to optimize extraction
regex_patterns = {
    'item_weight': re.compile(r'(\d+(?:\.\d+)?)\s*(g|kg|lb?|ounce|gram|pound?)', re.IGNORECASE),
    'item_volume': re.compile(r'(\d+(?:\.\d+)?)\s*(ml|l|fl\.?\s*oz|cup|liter|millilitre|gallon?)', re.IGNORECASE),
    'voltage': re.compile(r'(\d+(?:\.\d+)?)\s*(v|volt|mv|millivolt)', re.IGNORECASE),
    'wattage': re.compile(r'(\d+(?:\.\d+)?)\s*(w|watt|mw|milliwatt)', re.IGNORECASE),
    'maximum_weight_recommendation': re.compile(r'(\d+(?:\.\d+)?)\s*(g|kg|lb?|ounce|gram|pound?)', re.IGNORECASE),
    'height': re.compile(r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)', re.IGNORECASE),
    'depth': re.compile(r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)', re.IGNORECASE),
    'width': re.compile(r'(\d+(?:\.\d+)?)\s*(cm|mm|inch|metre|m|foot|feet)', re.IGNORECASE),
}

# Simplified function for reading the CSV
def read_csv(file_path):
    logging.info(f"Reading CSV file: {file_path}")
    return pd.read_csv(file_path).tail(100)

# Normalizing the units using the mapping provided above
def normalize_unit(unit):
    return unit_normalization_map.get(unit.lower(), unit)

# Simplified image preprocessing for speed
def preprocess_image(image):
    # Resize and enhance contrast only if image size exceeds threshold
    width, height = image.size
    if width > 1000 or height > 1000:
        image = image.resize((int(width * 0.5), int(height * 0.5)), Image.Resampling.LANCZOS)
    enhancer = ImageEnhance.Contrast(image)
    return enhancer.enhance(1.5)

# Perform OCR with PaddleOCR
def ocr_paddleocr(image, ocr_instance):
    try:
        result = ocr_instance.ocr(np.array(image), cls=True)
        extracted_text = " ".join([line[1][0] for block in result for line in block if line[1]])
        logging.debug(f"OCR result: {extracted_text}")
        return extracted_text
    except Exception as e:
        logging.error(f"OCR processing failed: {e}")
        return ""

# Simplified OCR extraction logic with caching
def extract_info(image_url, entity_name, ocr_instance, cache):
    if image_url in cache:
        return cache[image_url], cache[image_url]
    
    try:
        response = requests.get(image_url)
        img = Image.open(BytesIO(response.content))
    except Exception as e:
        logging.error(f"Failed to load image from {image_url}: {e}")
        return "", ""

    img = preprocess_image(img)
    ocr_text = ocr_paddleocr(img, ocr_instance)
    cache[image_url] = ocr_text

    pattern = regex_patterns.get(entity_name)
    if pattern:
        match = pattern.search(ocr_text)
        if match:
            value, unit = match.group(1), normalize_unit(match.group(2))
            logging.info(f"Extracted entity {entity_name}: {value} {unit}")
            return ocr_text, f"{value} {unit}"
    
    return ocr_text, ""

# Predict and save results in batch
def predict_and_save(csv_data, output_file_path, ocr_instance):
    predictions = []
    cache = {}

    for _, row in csv_data.iterrows():
        image_url = row['image_link']
        entity_name = row['entity_name']
        entity_value = row['entity_value']

        ocr_text, predicted_value = extract_info(image_url, entity_name, ocr_instance, cache)
        if not predicted_value:
            predicted_value = entity_value

        predictions.append({
            'extracted_text': ocr_text,
            'entity_name': entity_name,
            'entity_value': entity_value
        })

    pd.DataFrame(predictions).to_csv(output_file_path, index=False)
    logging.info(f"Predicted values and extracted texts saved to {output_file_path}")

# Main function
def main():
    csv_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/train.csv'
    output_file_path = '/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv'

    csv_data = read_csv(csv_file_path)
    ocr_instance = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=False)  # Use GPU only if necessary
    predict_and_save(csv_data, output_file_path, ocr_instance)

if __name__ == "__main__":
    main()


In [None]:
!pip install transformers torch pandas scikit-learn datasets tf-keras



In [None]:
pip install accelerate -U

In [52]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import Dataset

# Load the CSV file (columns: 'extracted_text', 'entity_name', 'entity_value')
df = pd.read_csv('/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv')

# Convert entity_name to numeric labels (for classification)
df['entity_name_label'] = df['entity_name'].factorize()[0]

# Ensure all text data is string and handle NaN values
df['extracted_text'] = df['extracted_text'].fillna("").astype(str)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['extracted_text'].tolist(),
    df['entity_name_label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data for both training and validation sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Convert labels to torch tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create a custom PyTorch dataset
class ExtractedTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create PyTorch datasets for training and validation
train_dataset = ExtractedTextDataset(train_encodings, train_labels)
val_dataset = ExtractedTextDataset(val_encodings, val_labels)

# Now you can proceed to load these datasets into a DataLoader and train your model.




In [1]:
from torch.utils.data import Dataset

class ExtractedTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Create PyTorch datasets
train_dataset = ExtractedTextDataset(train_encodings, train_labels)
val_dataset = ExtractedTextDataset(val_encodings, val_labels)


NameError: name 'train_encodings' is not defined

In [54]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# Load your dataset using pandas
df = pd.read_csv('/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv')

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Adjust num_labels based on your dataset

# Tokenize the dataset row by row
def preprocess_function(text):
    return tokenizer(text, padding=True, truncation=True, max_length=128)

# Apply tokenization to the 'text' column
df['tokenized_text'] = df['extracted_text'].apply(preprocess_function)

# Convert to a list of dictionaries for training
train_data = [{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': label} 
              for item, label in zip(df['tokenized_text'], df['labels'])]

# Convert to Hugging Face dataset
from datasets import Dataset
dataset = Dataset.from_dict(train_data)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/Users/hammadkhan/Downloads/student_resource 3/data',
    evaluation_strategy="steps",
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Split the tokenized dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [50]:
import pandas as pd
import torch
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the dataset
df = pd.read_csv('/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv')

# Convert entity_name to numeric labels (for classification)
df['entity_name_label'] = df['entity_name'].factorize()[0]

# Ensure all text data is string and handle NaN values
df['extracted_text'] = df['extracted_text'].fillna("").astype(str)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['extracted_text'].tolist(),
    df['entity_name_label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset row by row
def preprocess_function(text):
    return tokenizer(text, padding=True, truncation=True, max_length=128)

# Tokenize the text data for both training and validation sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Convert to Hugging Face Dataset format
train_data = [{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': label} 
              for item, label in zip(train_encodings, train_labels)]

val_data = [{'input_ids': item['input_ids'], 'attention_mask': item['attention_mask'], 'labels': label} 
              for item, label in zip(val_encodings, val_labels)]

# Create datasets from dicts
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)

# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['entity_name_label'].unique()))  # Adjust num_labels

# Define training arguments
training_args = TrainingArguments(
    output_dir='/Users/hammadkhan/Downloads/student_resource 3/data',
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='/Users/hammadkhan/Downloads/student_resource 3/logs',  # Where logs are stored
    logging_steps=10,
    save_steps=10
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()




TypeError: string indices must be integers, not 'str'

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Load the dataset
df = pd.read_csv('/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv')

# Convert entity_name to numeric labels (for classification)
df['entity_name_label'] = df['entity_name'].factorize()[0]

# Ensure all text data is string and handle NaN values
df['extracted_text'] = df['extracted_text'].fillna("").astype(str)

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['extracted_text'].tolist(),
    df['entity_name_label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data for both training and validation sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")


# Add labels to the tokenized encodings
train_encodings['labels'] = torch.tensor(train_labels)
val_encodings['labels'] = torch.tensor(val_labels)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({key: val.tolist() for key, val in train_encodings.items()})
val_dataset = Dataset.from_dict({key: val.tolist() for key, val in val_encodings.items()})

# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['entity_name_label'].unique()))

# Define training arguments
training_args = TrainingArguments(
  
    output_dir='/Users/hammadkhan/Downloads/student_resource 3/data',
    evaluation_strategy="steps",
    per_device_train_batch_size=2,  # Reduce to save memory
    per_device_eval_batch_size=2,   # Reduce to save memory
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='/Users/hammadkhan/Downloads/student_resource 3',
    logging_steps=10,
    save_steps=10,
    gradient_accumulation_steps=4,  # Accumulates gradients for larger effective batch size
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate the model
trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/6 [00:00<?, ?it/s]

: 

In [55]:
df['text']

KeyError: 'text'

AttributeError: 'Series' object has no attribute 'type'

In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv')
data['entity_value']=data['entity_value'].astype(str)
# Preprocess the data for training
def preprocess_data(data):
    sentences = []
    labels = []

    for _, row in data.iterrows():
        extracted_text = row['extracted_text']
        entity_name = row['entity_name']
        entity_value = row['entity_value']

        # Tokenize the extracted text
        tokens = extracted_text.split()

        # Assign labels based on entity_name and entity_value
        token_labels = []
        for token in tokens:
            if entity_name in token:
                token_labels.append('B-DIMENSION')  # Begin of dimension
            else:
                token_labels.append('O')  # Outside of dimension
        
        sentences.append(tokens)
        labels.append(token_labels)

    return sentences, labels

sentences, labels = preprocess_data(data)


AttributeError: 'float' object has no attribute 'split'

In [4]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/Users/hammadkhan/Downloads/student_resource 3/dataset/test_extracted_texts_opt.csv')

# Preprocess the data for training
def preprocess_data(data):
    sentences = []
    labels = []

    for _, row in data.iterrows():
        extracted_text = row['extracted_text']
        entity_name = row['entity_name']
        entity_value = row['entity_value']

        # Tokenize the extracted text
        tokens = extracted_text.split()

        # Assign labels based on entity_name and entity_value
        token_labels = []
        for token in tokens:
            if entity_name in token:
                token_labels.append('B-DIMENSION')  # Begin of dimension
            else:
                token_labels.append('O')  # Outside of dimension
        
        sentences.append(tokens)
        labels.append(token_labels)

    return sentences, labels

sentences, labels = preprocess_data(data)


AttributeError: 'float' object has no attribute 'split'

In [None]:
pip install transformers datasets


In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset

# Initialize the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)  # O and B-DIMENSION

# Tokenize the sentences
train_encodings = tokenizer(sentences, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

# Convert labels into token-level labels
def align_labels_with_tokens(labels, encodings):
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = encodings.word_ids(batch_index=i)
        aligned_labels.append([label[word_id] if word_id is not None else -100 for word_id in word_ids])
    return aligned_labels

train_labels = align_labels_with_tokens(labels, train_encodings)

# Convert data to HuggingFace Dataset
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train the model
trainer.train()


In [None]:
def predict_dimension(extracted_text):
    inputs = tokenizer(extracted_text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs).logits
    predictions = outputs.argmax(dim=-1)
    predicted_labels = [model.config.id2label[p.item()] for p in predictions[0]]
    
    # Map tokens back to the predicted labels
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    return list(zip(tokens, predicted_labels))

# Example usage
extracted_text = "66-89cm 12cm 3cm 2m 3.6cm 16.5cm"
predicted = predict_dimension(extracted_text)
print(predicted)
