In [2]:
import pandas as pd  # PDF TO TEXT PO2
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            # Mirror (reverse) the entire extracted text
            mirrored_text = full_text[::-1]
            
            print("Mirrored Extracted Text from PDF:")
            print("=" * 50)
            print(mirrored_text)
            print("=" * 50)
            
            # Process the mirrored text if needed (currently this works on normal text)
            # If you only want to display mirrored text and keep normal extraction logic:
            lines = full_text.split('\n')  # Keep data extraction from normal text
            
            current_item = {}
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data


def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Ensure all columns exist
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]


# Main execution
if __name__ == "__main__":
    pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803866-1761913-Shimayra.pdf'
    
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        df = create_excel_dataframe(extracted_data)
        
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        print("=" * 50)
        
        print("\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
    else:
        print("No data was extracted from the PDF file.")


Reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803866-1761913-Shimayra.pdf
Mirrored Extracted Text from PDF:

Page
1
of
1
Stamping
Instructions:
18
KT
+
BE
LOGO,LG
STND
CTW
1.
803866/101
2.00
AAB04477H
BT00973-8WLVSBT
BE5D4TB160LC-18KW
18KT
W
ERLV
09/29/2025
18KT
W
BRACELET
.87
Qty
Style
Type
Quality
Date
#
Order
#
Order
Price
Vendor
Style
#
SKU
#
Metal
Color
Size
Diamond
Due
Description
91-22-6962-990
Customer
BRILLIANT
EARTH
Silver
Lock
42.69
MUMBAI
MH
400096
ANDHERI -
EAST
Ship
Via
NA
Gold
Lock
3,695.40
PLOT
NO:
62,
SEEPZ
SHIMAYARA
JEWELLERY
Vendor
#
SHIMAYRA
Due
Date
Sep/29/2025
Purchase
From :
SHIMAYRA
Tel:
646-520-0606
Fax:
929-223-7079
Date
:
Sep/17/2025
NEW
YORK,
NY10017
PO
#
:
803866
535
FIFTH
AVENUE,
18TH
FLOOR,
PURCHASE
ORDER
CRAFT
LAB
GROWN
DIAMONDS

Extracted Data:
      SrNo StyleCode ItemSize  OrderQty
0      535                            2
1   668308                            2
2    71001                            2
3     5202     

In [8]:
import pandas as pd
import re
import pdfplumber
import tempfile
from PyPDF2 import PdfReader, PdfWriter

def rotate_pdf_left(input_path):
    """
    Rotate all pages in the PDF 90 degrees counterclockwise (left)
    and return the path to the temporary rotated file.
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()

    for page in reader.pages:
        # Rotate each page 90° counterclockwise
        page.rotate(90)
        writer.add_page(page)

    # Create a temporary file for rotated PDF
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    with open(temp_file.name, "wb") as f_out:
        writer.write(f_out)

    return temp_file.name


def extract_data_from_pdf(pdf_path):
    """
    Extract data from a rotated PDF file and return structured data
    """
    data = []

    try:
        # Step 1: Rotate PDF left and get rotated temp path
        rotated_pdf = rotate_pdf_left(pdf_path)

        # Step 2: Extract text using pdfplumber
        with pdfplumber.open(rotated_pdf) as pdf:
            full_text = ""
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    full_text += page_text + "\n"
                else:
                    print(f"⚠️ Warning: No text found on page {page_num}")

            # Step 3: Mirror (reverse) the text for display
            
            print("\nMirrored Extracted Text from Rotated PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)

            # Step 4: Process the normal (non-mirrored) text for extraction
            lines = full_text.split('\n')
            current_item = {}

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Extract SrNo
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}

                # Extract Article code
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)

                # Extract StyleCode
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)

                # Extract ItemSize
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',
                    r'(\d+\.\d+)\s*ct',
                    r'YG[-\s]*(\d+\.\d+)',
                ]
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break

            if current_item and 'SrNo' in current_item:
                data.append(current_item)

    except Exception as e:
        print(f"Error reading or processing PDF file: {e}")
        return []

    return data


def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']

    for col in columns_order:
        if col not in df.columns:
            df[col] = ""

    return df[columns_order]


# Main execution
if __name__ == "__main__":
    pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803866-1761913-Shimayra.pdf'

    print(f"Rotating and reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)

    if extracted_data:
        df = create_excel_dataframe(extracted_data)

        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        print("=" * 50)

        print("\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
    else:
        print("No data was extracted from the PDF file.")


Rotating and reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803866-1761913-Shimayra.pdf

Mirrored Extracted Text from Rotated PDF:
CRAFT LAB GROWN DIAMONDS
PURCHASE ORDER
535 FIFTH AVENUE, 18TH FLOOR,
PO # : 803866
NEW YORK, NY10017
Date : Sep/17/2025
Tel: 646-520-0606 Fax: 929-223-7079
Purchase From : SHIMAYRA
Vendor # SHIMAYRA Due Date Sep/29/2025
SHIMAYARA JEWELLERY
PLOT NO: 62, SEEPZ
Ship Via NA Gold Lock 3,695.40
ANDHERI - EAST
MUMBAI MH 400096
Customer BRILLIANT EARTH Silver Lock 42.69
91-22-6962-990
# Order # Order Price Vendor Style # SKU # Metal Color Size Diamond Due Description
Qty Style Type Quality Date
1. 803866/101 2.00 AAB04477H BT00973-8WLVSBT BE5D4TB160LC-18KW 18KT W ERLV 09/29/2025 18KT W BRACELET .87
CTW
STND
Stamping Instructions: 18 KT + BE LOGO,LG
Page 1 of 1


Extracted Data:
  SrNo StyleCode ItemSize  OrderQty
0  535                            2
1   91                            2
2    1                            2

Extracti

In [16]:
import pandas as pd
import re
import pdfplumber
import tempfile
from PyPDF2 import PdfReader, PdfWriter

def rotate_pdf_left(input_path):
    """
    Rotate all pages in the PDF 90 degrees counterclockwise (left)
    and return the path to the temporary rotated file.
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()

    for page in reader.pages:
        # Rotate each page 90° counterclockwise
        page.rotate(90)
        writer.add_page(page)

    # Create a temporary file for rotated PDF
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    with open(temp_file.name, "wb") as f_out:
        writer.write(f_out)

    return temp_file.name


def extract_data_from_pdf(pdf_path):
    """
    Extract data from a rotated PDF file and return structured data
    """
    data = []

    try:
        # Step 1: Rotate PDF left and get rotated temp path
        rotated_pdf = rotate_pdf_left(pdf_path)

        # Step 2: Extract text using pdfplumber
        with pdfplumber.open(rotated_pdf) as pdf:
            full_text = ""
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    full_text += page_text + "\n"
                else:
                    print(f"⚠️ Warning: No text found on page {page_num}")

            # Step 3: Mirror (reverse) the text for display
            
            print("\nMirrored Extracted Text from Rotated PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)

            # Step 4: Process the normal (non-mirrored) text for extraction
            lines = full_text.split('\n')
            current_item = {}

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Extract SrNo
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}

                # Extract Article code
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)

                # Extract StyleCode
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)

                # Extract ItemSize
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',
                    r'(\d+\.\d+)\s*ct',
                    r'YG[-\s]*(\d+\.\d+)',
                ]
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break

            if current_item and 'SrNo' in current_item:
                data.append(current_item)

    except Exception as e:
        print(f"Error reading or processing PDF file: {e}")
        return []

    return data


def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']

    for col in columns_order:
        if col not in df.columns:
            df[col] = ""

    return df[columns_order]


# Main execution
if __name__ == "__main__":
    pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803872-BE PROJ - SEP - Shimayra.pdf'

    print(f"Rotating and reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)

    if extracted_data:
        df = create_excel_dataframe(extracted_data)

        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        print("=" * 50)

        print("\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
    else:
        print("No data was extracted from the PDF file.")


Rotating and reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803872-BE PROJ - SEP - Shimayra.pdf

Mirrored Extracted Text from Rotated PDF:
CRAFT LAB GROWN DIAMONDS
PURCHASE ORDER
535 FIFTH AVENUE, 18TH FLOOR,
PO # : 803872
NEW YORK, NY10017
Date : Sep/18/2025
Tel: 646-520-0606 Fax: 929-223-7079
Purchase From : SHIMAYRA
Vendor # SHIMAYRA Due Date Oct/03/2025
SHIMAYARA JEWELLERY
PLOT NO: 62, SEEPZ
Ship Via NA Gold Lock 3,681.00
ANDHERI - EAST
MUMBAI MH 400096
Customer BRILLIANT EARTH Silver Lock 41.27
91-22-6962-990
# Order # Order Price Vendor Style # SKU # Metal Color Size Diamond Due Description
Qty Style Type Quality Date
1. 803872/101 3.00 AAB04477H S-BT00829-8YLVSBT BE5D4TB165LC-18KY 18KT Y ERLV 10/03/2025 18KT Y BRACELET 1.00
CTW
STND
Stamping Instructions: 18 KT + BE LOGO,LG
2. 803872/102 8.00 AAB04477K S-BT00830-8WLVSBT BE5D4TB265LC-18KW 18KT W ERLV 10/03/2025 18KT W BRACELET 2.00
CTW
STND
Stamping Instructions: 18K + BE LOGO,LG
3. 803872/103 

In [None]:
import pandas as pd
import re

# --- Raw extracted text ---
raw_text =  

# --- Step 1: Extract PO number ---
po_match = re.search(r'PO\s*#\s*[:]*\s*(\d+)', raw_text)
item_po_no = po_match.group(1) if po_match else ""

# --- Step 2: Extract each item block (now more flexible) ---
item_blocks = re.findall(
    r'(\d+)\.\s*(\d+/\d+)\s+([\d.]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(18K[T]?|14K[T]?)\s+([YW])[\s\S]*?Stamping Instructions:\s*([^\n]+)',
    raw_text
)

# --- Step 3: Build structured data ---
data = []
for i, block in enumerate(item_blocks, start=1):
    (
        sr_no, order_code, order_qty, style_code,
        vendor_style, sku_no, metal_kt, tone, stamping_instr
    ) = block

    print(f"\nItem {i}: {style_code}")
    item_size = input("Enter Item Size (e.g. 6, 7.5): ")
    priority = input("Enter Priority (REG or TMP): ")
    user_stylecode = input("Enter StyleCode (for SpecialRemarks): ")

    # --- Mapping logic ---
    metal = f"G{metal_kt.replace('KT','').replace('K','')}{tone}"
    tone_full = "Yellow Gold" if tone == "Y" else "White Gold"

    # Find item type (Bracelet/Earring)
    desc_match = re.search(r'(BRACELET|EARRING)', raw_text[raw_text.find(style_code):], re.IGNORECASE)
    desc = desc_match.group(1).capitalize() if desc_match else "Item"

    desc_full = f"{metal_kt} {tone_full} {desc} 1.00 CTW"

    # --- Special Remarks ---
    special_remarks = (
        f"BRILLIANT EARTH CRAFT,{order_code}, {user_stylecode},{vendor_style}, "
        f"{sku_no},SZ-{item_size} INCH, COC CERTIFIED RE-CYCLE GOLD, {metal_kt} {tone_full.upper()}"
    )

    design_prod_instr = "White Rodium" if tone == "W" else "No Rodium"

    data.append({
        "Sr.NO": i,
        "Style Code": style_code,
        "ItemSize": item_size,
        "OrderQty": order_qty,
        "OrderItemPcs": "",
        "Metal": metal,
        "Tone": tone,
        "ItemPoNo.": item_po_no,
        "ItemRefNo": "",
        "StockType": "",
        "Priority": priority,
        "MakeType": "",
        "CustomerProductionInstruction": desc_full,
        "SpecialRemarks": special_remarks,
        "DesignProductionInstruction": design_prod_instr,
        "StampInstruction": stamping_instr.strip(),
        "OrderGroup": "BRILLIANT EARTH CRAFT",
        "Certificate": "",
        "SKUNo": sku_no,
        "Basestoneminwt": "",
        "Basestonemaxwt": "",
        "Basemetalminwt": "",
        "Basemetalmaxwt": "",
        "Productiondeliverydate": "",
        "Expecteddeliverydate": "",
        "SetPrice": "",
        "StoneQuality": ""
    })

# --- Step 4: Create DataFrame ---
df = pd.DataFrame(data)

# --- Step 5: Display and export ---
print("\n✅ Final Structured DataFrame:\n")
print(df)

df.to_excel("structured_purchase_order_1.xlsx", index=False)
print("\nFile saved as 'structured_purchase_order_1.xlsx'")



Item 1: AAB04477H

Item 2: AAB04477K

Item 3: AAE07799H

✅ Final Structured DataFrame:

   Sr.NO Style Code ItemSize OrderQty OrderItemPcs Metal Tone ItemPoNo.  \
0      1  AAB04477H        4     3.00               G18Y    Y    803872   
1      2  AAB04477K       55     8.00               G18W    W    803872   
2      3  AAE07799H        5     8.00               G18Y    Y    803872   

  ItemRefNo StockType  ... Certificate              SKUNo Basestoneminwt  \
0                      ...              BE5D4TB165LC-18KY                  
1                      ...              BE5D4TB265LC-18KW                  
2                      ...              BE3DCLW100LC-18KY                  

  Basestonemaxwt Basemetalminwt Basemetalmaxwt Productiondeliverydate  \
0                                                                       
1                                                                       
2                                                                       

  Expectedde

## Final code

In [18]:
import pandas as pd
import re
import pdfplumber
import tempfile
from PyPDF2 import PdfReader, PdfWriter

def rotate_pdf_left(input_path):
    """
    Rotate all pages in the PDF 90° counterclockwise (left)
    and return the path to the temporary rotated file.
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()

    for page in reader.pages:
        page.rotate(90)
        writer.add_page(page)

    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    with open(temp_file.name, "wb") as f_out:
        writer.write(f_out)

    return temp_file.name


def extract_raw_text_from_pdf(pdf_path):
    """
    Rotates the PDF, extracts text using pdfplumber, and returns raw text.
    """
    try:
        rotated_pdf = rotate_pdf_left(pdf_path)
        full_text = ""

        with pdfplumber.open(rotated_pdf) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    full_text += text + "\n"

        print("\n📜 Extracted Text from Rotated PDF:")
        print("=" * 60)
        print(full_text)
        print("=" * 60)

        return full_text.strip()

    except Exception as e:
        print(f"Error reading or processing PDF: {e}")
        return ""


def parse_purchase_order_data(full_text):
    """
    Parses structured data from the extracted text
    according to the specified format and rules.
    """
    # Step 1: Extract only the relevant section
    match = re.search(r"# Order[\s\S]*?Page 1 of 1", full_text)
    if not match:
        print("⚠️ Could not find purchase order section in the text.")
        return pd.DataFrame()
    text_section = match.group(0)

    # Step 2: Extract PO number
    po_match = re.search(r'PO\s*#\s*[:]*\s*(\d+)', full_text)
    item_po_no = po_match.group(1) if po_match else ""

    # Step 3: Extract each item block (robust regex)
    item_blocks = re.findall(
        r'(\d+)\.\s*(\d+/\d+)\s+([\d.]+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(18K[T]?|14K[T]?)\s+([YW])[\s\S]*?Stamping Instructions:\s*([^\n]+)',
        text_section
    )

    if not item_blocks:
        print("⚠️ No item blocks found.")
        return pd.DataFrame()

    data = []
    for i, block in enumerate(item_blocks, start=1):
        (
            sr_no, order_code, order_qty, style_code,
            vendor_style, sku_no, metal_kt, tone, stamping_instr
        ) = block

        print(f"\n🧩 Item {i}: {style_code}")
        item_size = input("Enter Item Size (e.g. 6, 7.5): ")
        priority = input("Enter Priority (REG or TMP): ")
        user_stylecode = input("Enter StyleCode (for SpecialRemarks): ")

        # Metal formatting
        metal = f"G{metal_kt.replace('KT','').replace('K','')}{tone}"
        tone_full = "Yellow Gold" if tone == "Y" else "White Gold"

        # Find item type (Bracelet/Earring)
        desc_match = re.search(r'(BRACELET|EARRING)', text_section[text_section.find(style_code):], re.IGNORECASE)
        desc = desc_match.group(1).capitalize() if desc_match else "Item"
        desc_full = f"{metal_kt} {tone_full} {desc} 1.00 CTW"

        # Special Remarks
        special_remarks = (
            f"BRILLIANT EARTH CRAFT,{order_code}, {user_stylecode},{vendor_style}, "
            f"{sku_no},SZ-{item_size} INCH, {metal_kt} {tone_full.upper()},COC CERTIFIED RE-CYCLE GOLD"
        )

        design_prod_instr = "White Rodium" if tone == "W" else "No Rodium"

        data.append({
            "Sr.NO": i,
            "Style Code": style_code,
            "ItemSize": item_size,
            "OrderQty": order_qty,
            "OrderItemPcs": "",
            "Metal": metal,
            "Tone": tone,
            "ItemPoNo.": item_po_no,
            "ItemRefNo": "",
            "StockType": "",
            "Priority": priority,
            "MakeType": "",
            "CustomerProductionInstruction": desc_full,
            "SpecialRemarks": special_remarks,
            "DesignProductionInstruction": design_prod_instr,
            "StampInstruction": stamping_instr.strip(),
            "OrderGroup": "BRILLIANT EARTH CRAFT",
            "Certificate": "",
            "SKUNo": sku_no,
            "Basestoneminwt": "",
            "Basestonemaxwt": "",
            "Basemetalminwt": "",
            "Basemetalmaxwt": "",
            "Productiondeliverydate": "",
            "Expecteddeliverydate": "",
            "SetPrice": "",
            "StoneQuality": ""
        })

    return pd.DataFrame(data)


# ================== MAIN EXECUTION ==================

if __name__ == "__main__":
    pdf_file_path = r"C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803867-1761917-Shimayra.pdf"

    print(f"\n📂 Reading and processing PDF: {pdf_file_path}")

    full_text = extract_raw_text_from_pdf(pdf_file_path)

    if full_text:
        df = parse_purchase_order_data(full_text)

        if not df.empty:
            print("\n✅ Final Structured Data:")
            print("=" * 80)
            print(df)
            print("=" * 80)
            output_path = "final_purchase_order_1.xlsx"
            df.to_excel(output_path, index=False)
            print(f"\n💾 Data successfully saved to '{output_path}'")
        else:
            print("⚠️ No structured data could be extracted.")
    else:
        print("❌ No text extracted from the PDF.")



📂 Reading and processing PDF: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\craft\HKD#803867-1761917-Shimayra.pdf

📜 Extracted Text from Rotated PDF:
CRAFT LAB GROWN DIAMONDS
PURCHASE ORDER
535 FIFTH AVENUE, 18TH FLOOR,
PO # : 803867
NEW YORK, NY10017
Date : Sep/17/2025
Tel: 646-520-0606 Fax: 929-223-7079
Purchase From : SHIMAYRA
Vendor # SHIMAYRA Due Date Sep/29/2025
SHIMAYARA JEWELLERY
PLOT NO: 62, SEEPZ
Ship Via NA Gold Lock 3,695.40
ANDHERI - EAST
MUMBAI MH 400096
Customer BRILLIANT EARTH Silver Lock 42.69
91-22-6962-990
# Order # Order Price Vendor Style # SKU # Metal Color Size Diamond Due Description
Qty Style Type Quality Date
1. 803867/101 1.00 AAB04477M BT00977-8WLVSBT BE5D4TB360LC-18KW 18KT W ERLV 09/29/2025 18KT W BRACELET 2.65
CTW
STND
Stamping Instructions: 18K + BE LOGO,LG
Page 1 of 1


🧩 Item 1: AAB04477M

✅ Final Structured Data:
   Sr.NO Style Code ItemSize OrderQty OrderItemPcs Metal Tone ItemPoNo.  \
0      1  AAB04477M        6     1.00               G18W