In [None]:
import pandas as pd #PDF TO TEXT PO2
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            print("Extracted text from PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)
            
            # Process the extracted text
            lines = full_text.split('\n')
            current_item = {}
            
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                # Multiple patterns to catch different formats
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data

def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Only include columns that exist in the data
    final_columns = [col for col in columns_order if col in df.columns]
    
    # Add missing columns with empty values
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]

# Main execution
if __name__ == "__main__":
    # Specify your PDF file path here
   
    pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\BHAKTI\VPO#4656-IA-H1028-4890316-BK-SHIMAYRA.pdf' # Change this to your actual PDF path
    
    # Extract data from PDF
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        # Create DataFrame
        df = create_excel_dataframe(extracted_data)
        
        # Display the results
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        
        # Save to Excel file
        # output_file = 'extracted_data.xlsx'
        # df.to_excel(output_file, index=False)
        # print(f"\nData successfully saved to '{output_file}'")
        
        # Display basic statistics
        print(f"\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        
    else:
        print("No data was extracted from the PDF file.")
    
    # Display the DataFrame in notebook
    # df

Reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\BHAKTI\VPO#4656-IA-H1028-4890316-BK-SHIMAYRA.pdf
Extracted text from PDF:
oT
redrO
SHIMAYRA JEWELLERY Bhakti Diamond LLC
PLOT NO: 62, SEEPZ ANDHERI (E) 50 West 47th Street Suite# 2110
Mumbai MAHARASHTRA 400096 INDIA New York NY 10036
oT
pihS
Bhakti Diamond LLC Purchase Order
50 West 47th Street Suite# 2110
New York NY 10036
(212) 398-7778
Order #: 4656
Page #: 1 of 1
P.O. #: 4890316 Date: 9/24/2025 Due Date: 9/30/2025 Cancel Date: 9/30/2025
Reference: VPO#4656-IA-H1028 Vendor #:0098S Phone #: 912269629949 Ship Via:
# Memo # Item # Vendor Item # Job Bag # Description Size Quantity Weight Unit Cost Amount
RG1964Q-LGD- SKU#2733453 14KW
1 14KW-SZ9 RG0001964QA 997280001 43RD=5.994CTW 1 0.0000 Q $0.00 $0.00
1RDCTR=1.50CTW
2RDSIDE=2.50CTW
2RDSIDE=1.50CTW
LGD GH VS ROUND
RING SZ9
1 0.0000
DIAMOND QUALITY LGD DIAGH VS2
STAMPING BHJ HDS LGD 14K
NOTE TRUE SIZE
Grand Total: $0.00
RightClick® Copyright © 2025 CFI/Wise Choice S

In [None]:
import re
import pandas as pd
import pdfplumber
from typing import List, Dict, Optional

# --- User inputs ---
# Choose defaults suitable for quick runs. You may change interactively in notebook UI.
ITEM_SIZE_TYPE = input("Enter ItemSize type prefix (e.g., TS, RS) [default TS]: ") or "TS"
METAL_MODE = input("Enter Metal mode (Default/Recycled) [default Default]: ") or "Default"
PRIORITY_DAYS = input("Enter priority in days (number) [default 5]: ") or "5"
ORDER_GROUP = input("Enter OrderGroup: ") or ""

PDF_PATH = r'C:\\Users\\Pratik Mali\\Desktop\\tools\\OrderProcessingTool\\BHAKTI\\VPO#4656-IA-H1028-4890316-BK-SHIMAYRA.pdf'

# --- Helpers ---
METAL_MAP = {
    # gold karat + tone → internal
    # Detected tokens should be normalized like 14KW, 10KY, 18KR etc
    "10KY": "G10Y",
    "10KW": "G10W",
    "10KR": "G10R",
    "14KY": "G14Y",
    "14KW": "G14W",
    "14KR": "G14R",
    "18KY": "G18Y",
    "18KW": "G18W",
    "18KR": "G18R",
    # platinum
    "PLATINUM": "PC95",
    "PT": "PC95",
    "PLAT": "PC95",
}

def add_recycled_suffix(metal_code: str, mode: str) -> str:
    if not metal_code:
        return metal_code
    if mode.strip().lower() == "recycled":
        return metal_code + "Z"
    return metal_code

SKU_START_PATTERN = re.compile(r"\bSKU\s*NO\.?\b|\bSKU#\b", re.IGNORECASE)
ORDER_NO_PATTERN = re.compile(r"Order\s*#:\s*(\d+)", re.IGNORECASE)
STYLE_CODE_PATTERN = re.compile(r"(?:\b[A-Z]{2}\d{5,}[A-Z]{1,4}\b|\bRG\d{5,}[A-Z]{1,4}\b)")
SIZE_PATTERN = re.compile(r"\bSZ\s*(\d{1,2})\b", re.IGNORECASE)
QTY_AFTER_CTW_PATTERN = re.compile(r"CTW\s*(\d+)\b", re.IGNORECASE)
STAMP_LINE_PATTERN = re.compile(r"^\s*STAMPING\s+(.+)$", re.IGNORECASE)
# Accept variants and extra tokens around diamond quality, e.g., LGD GH VS, LGD GH VS2
DIAMOND_QUALITY_PATTERN = re.compile(r"\bLGD\s+GH\s+VS\d*\b", re.IGNORECASE)

# SKU like token example: RG1964Q-LGD-14KW-SZ9
# Capture flexible SKU forms, e.g., RG1964Q-LGD-14KW-SZ9 or RG1964Q-LGD-PLATINUM-SZ9
SKU_NO_CAPTURE = re.compile(r"\b([A-Z0-9]+\-[A-Z0-9]+\-([A-Z0-9]+)\-SZ\s*(\d{1,2}))\b", re.IGNORECASE)

# Normalize a detected metal token (e.g., 14KW, Platinum, PT)
def normalize_metal_token(text: str) -> Optional[str]:
    text_up = text.upper()
    # Try direct tokens first in METAL_MAP keys
    for token in METAL_MAP.keys():
        if token in text_up:
            return token
    # Try to detect patterns like 14K + tone
    m = re.search(r"\b(10|14|18)K\s*([WYR])\b", text_up)
    if m:
        return f"{m.group(1)}K{m.group(2)}"
    # platinum words
    if re.search(r"\bPLAT(INUM)?\b|\bPT\b", text_up):
        return "PLATINUM"
    return None

# Extract tone from mapped metal (last char if gold); blank for platinum

def tone_from_metal(metal_code: str) -> str:
    if not metal_code:
        return ""
    if metal_code.startswith("P"):  # platinum codes like PC95
        return ""
    # gold tones end with Y/W/R
    last = metal_code[-1]
    if last in ("Y", "W", "R"):
        return last
    return ""

# Split text into item chunks starting at SKU markers until just before next SKU

def split_items_by_sku(full_text: str) -> List[str]:
    lines = full_text.splitlines()
    indices = []
    for i, line in enumerate(lines):
        if SKU_START_PATTERN.search(line) or SKU_NO_CAPTURE.search(line):
            indices.append(i)
    if not indices:
        return []
    chunks = []
    for idx, start in enumerate(indices):
        end = indices[idx + 1] if idx + 1 < len(indices) else len(lines)
        chunk = "\n".join(lines[start:end]).strip()
        if chunk:
            chunks.append(chunk)
    return chunks

# Parse a single item chunk into a dict following the required schema

def parse_item_chunk(chunk: str, item_po_no: str, sr_no: int) -> Dict[str, str]:
    # StyleCode
    style_match = STYLE_CODE_PATTERN.search(chunk)
    if style_match:
        style_code = style_match.group(0)
    else:
        # Fallbacks:
        # 1) Look for RG + digits + trailing letters anywhere
        m1 = re.search(r"\bRG\d{3,}[A-Z]{1,4}\b", chunk)
        # 2) Generic: two letters + 6+ digits + 1-4 letters
        m2 = re.search(r"\b[A-Z]{2}\d{4,}[A-Z]{1,4}\b", chunk)
        style_code = (m1 or m2).group(0) if (m1 or m2) else ""

    # ItemSize (SZN) -> N, then prefixed with ITEM_SIZE_TYPE; pad to 2 digits
    size_match = SIZE_PATTERN.search(chunk)
    item_size_num = size_match.group(1) if size_match else ""
    item_size_formatted = (
        f"{ITEM_SIZE_TYPE}{int(item_size_num):02d}" if item_size_num else ""
    )

    # OrderQty: first integer immediately after a CTW occurrence in the chunk
    qty_match = QTY_AFTER_CTW_PATTERN.search(chunk)
    order_qty = qty_match.group(1) if qty_match else ""

    # Metal detection and mapping
    metal_token = normalize_metal_token(chunk) or ""
    mapped_metal = METAL_MAP.get(metal_token, "")
    mapped_metal = add_recycled_suffix(mapped_metal, METAL_MODE)

    # Tone from metal
    tone = tone_from_metal(mapped_metal)

    # CustomerProductionInstruction: prefer line containing LGD and RING and SZ
    lines = [ln.strip() for ln in chunk.splitlines() if ln.strip()]
    cpi_candidates = [
        ln for ln in lines
        if re.search(r"\bLGD\b", ln, re.IGNORECASE)
        and re.search(r"\bRING\b", ln, re.IGNORECASE)
        and re.search(r"\bSZ\s*\d{1,2}\b", ln, re.IGNORECASE)
    ]
    if not cpi_candidates:
        # Next best: any line with SZ and CTW
        cpi_candidates = [
            ln for ln in lines
            if re.search(r"\bSZ\s*\d{1,2}\b", ln, re.IGNORECASE) and re.search(r"CTW", ln, re.IGNORECASE)
        ]
    if not cpi_candidates:
        # Fallback: the longest non-empty line
        cpi_candidates = [max(lines, key=len)] if lines else [""]
    customer_production_instruction = cpi_candidates[0]

    # StampInstruction: content after "STAMPING" on that line only
    stamp = ""
    for line in chunk.splitlines():
        m = STAMP_LINE_PATTERN.search(line)
        if m:
            stamp = m.group(1).strip()
            break

    # Diamond quality for SpecialRemarks
    dq_match = DIAMOND_QUALITY_PATTERN.search(chunk)
    diamond_quality = dq_match.group(0).upper() if dq_match else ""

    # SKUNo capture and derive metal/tone from it if present
    sku_match = SKU_NO_CAPTURE.search(chunk)
    sku_no = sku_match.group(1).upper() if sku_match else ""
    sku_metal_token = sku_match.group(2).upper() if sku_match else ""
    sku_size_num = sku_match.group(3) if sku_match else ""
    
    # If ItemSize missing from text, use size from SKU
    if not item_size_num and sku_size_num:
        item_size_formatted = f"{ITEM_SIZE_TYPE}{int(sku_size_num):02d}"
    
    # Prefer metal from SKU if available
    if sku_metal_token:
        token_norm = normalize_metal_token(sku_metal_token) or sku_metal_token
        mapped_from_sku = METAL_MAP.get(token_norm, METAL_MAP.get(token_norm.upper(), ""))
        if mapped_from_sku:
            mapped_metal = add_recycled_suffix(mapped_from_sku, METAL_MODE)
            tone = tone_from_metal(mapped_metal)

    # Tone long-form mapping for SpecialRemarks
    tone_long = {
        "W": "White Gold",
        "Y": "Yellow Gold",
        "R": "Pink Gold",
    }.get(tone, "Pt" if mapped_metal.startswith("P") else "")

    special_remarks_parts = []
    if ORDER_GROUP:
        special_remarks_parts.append(ORDER_GROUP)
    if sku_no:
        special_remarks_parts.append(sku_no)
    if tone_long:
        special_remarks_parts.append(tone_long)
    if diamond_quality:
        special_remarks_parts.append(diamond_quality)
    # If diamond quality not found in the chunk, try CPI line
    if not diamond_quality and re.search(r"\bLGD\b", customer_production_instruction, re.IGNORECASE):
        m_dq = DIAMOND_QUALITY_PATTERN.search(customer_production_instruction)
        if m_dq:
            diamond_quality = m_dq.group(0).upper()
    
    special_remarks = ", ".join([p for p in special_remarks_parts if p])

    row = {
        "SrNo": f"{sr_no}.",
        "StyleCode": style_code,
        "ItemSize": item_size_formatted,
        "OrderQty": order_qty,
        "OrderItemPcs": "",
        "Metal": mapped_metal,
        "Tone": tone,
        "ItemPoNo": item_po_no,
        "ItemRefNo": "",
        "StockType": "",
        "Priority": f"{int(PRIORITY_DAYS)} Days" if PRIORITY_DAYS else "5 Days",
        "MakeType": "",
        "CustomerProductionInstruction": customer_production_instruction,
        "SpecialRemarks": special_remarks,
        "DesignProductionInstruction": "White Rhodium" if tone == "W" else ("" if mapped_metal.startswith("P") else "No Rhodium"),
        "StampInstruction": stamp,
        "OrderGroup": ORDER_GROUP,
        "SKUNo": sku_no,
        "Basestoneminwt": "",
        "Basestonemaxwt": "",
        "Basemetalminwt": "",
        "Basemetalmaxwt": "",
        "Productiondeliverydate": "",
        "Expecteddeliverydate": "",
        "": "",
        "SetPrice": "",
        "StoneQuality": "",
    }
    return row

# --- Main Run ---
with pdfplumber.open(PDF_PATH) as pdf:
    full_text = "\n".join([page.extract_text() or "" for page in pdf.pages])

# Find ItemPoNo (Order #: NNNN) once for the whole file
po_match = ORDER_NO_PATTERN.search(full_text)
item_po_no = po_match.group(1) if po_match else ""

# Split into item chunks
chunks = split_items_by_sku(full_text)

rows = []
for idx, chunk in enumerate(chunks, start=1):
    rows.append(parse_item_chunk(chunk, item_po_no, idx))

columns = [
    "SrNo",
    "StyleCode",
    "ItemSize",
    "OrderQty",
    "OrderItemPcs",
    "Metal",
    "Tone",
    "ItemPoNo",
    "ItemRefNo",
    "StockType",
    "Priority",
    "MakeType",
    "CustomerProductionInstruction",
    "SpecialRemarks",
    "DesignProductionInstruction",
    "StampInstruction",
    "OrderGroup",
    "SKUNo",
    "Basestoneminwt",
    "Basestonemaxwt",
    "Basemetalminwt",
    "Basemetalmaxwt",
    "Productiondeliverydate",
    "Expecteddeliverydate",
    "",
    "SetPrice",
    "StoneQuality",
]

df_final = pd.DataFrame(rows, columns=columns)
df_final


Unnamed: 0,SrNo,StyleCode,ItemSize,OrderQty,OrderItemPcs,Metal,Tone,ItemPoNo,ItemRefNo,StockType,...,SKUNo,Basestoneminwt,Basestonemaxwt,Basemetalminwt,Basemetalmaxwt,Productiondeliverydate,Expecteddeliverydate,Unnamed: 19,SetPrice,StoneQuality
0,1.0,RG0001964QA,ts09,1,,G14WZ,,4656,,,...,,,,,,,,,,


In [7]:
df_final.to_excel('output_1.xlsx', index=False)