In [1]:
import pandas as pd
import re
import pdfplumber

# ---------- READ PDF AND EXTRACT TEXT ----------
# Absolute Windows path of the order PDF that this notebook parses.
# NOTE(review): hardcoded to one user's desktop folder — must be edited before
# the notebook can run on another machine.
pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\JJL\pedido_177546.pdf'


def read_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF and return it as one string.

    Pages for which ``extract_text()`` yields nothing are skipped; the text of
    every other page is followed by a newline. As a side effect the path and
    the complete extracted text are printed between two separator rules.

    Args:
        pdf_path: Location of the PDF file to open with pdfplumber.

    Returns:
        The newline-terminated page texts joined together ("" if no page
        produced any text).
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Drop empty/None pages and terminate each kept page with a newline
    full_text = "".join(text + "\n" for text in page_texts if text)

    rule = "=" * 50
    print(
        f"Reading PDF from: {pdf_path}",
        "Extracted text from PDF:",
        rule,
        full_text,
        rule,
        sep="\n",
    )
    return full_text

# The extracted PDF text that will be consumed by the next cell.
# Note: read_pdf_text also prints the path and the full extracted text.
RAW_TEXT = read_pdf_text(pdf_file_path)


Reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\JJL\pedido_177546.pdf
Extracted text from PDF:
Emisión/Emission Pedido/Order SHIMAYRA JEWELLERY
16/06/2025 177546 PLOT NO: 62, SEEPZ ANDHERI (E)
Mumbai 400096
DETALLE DEL PEDIDO / DETAIL OF ORDER
Imagen/Image Referencia/Reference Unid/Units Peso/Weight Código/Code
RG044975-O/BL-BR-0.05KT-N-09
5 2BRSSR86A4975N09
RING WHITE GOLD NATURAL DIAMOND CT. 0.05KT SIZE 09
RG044975-O/BL-BR-0.05KT-N-11
5 2BRSSR86A4975N11
RING WHITE GOLD NATURAL DIAMOND CT. 0.05KT SIZE 11
RG044975-O/BL-BR-0.05KT-N-13
10 2BRSSR86A4975N13
RING WHITE GOLD NATURAL DIAMOND CT. 0.05KT SIZE 13
RG044975-O/BL-BR-0.05KT-N-16
5 2BRSSR86A4975N16
RING WHITE GOLD NATURAL DIAMOND CT. 0.05KT SIZE 16
RG044975-O/BL-BR-0.05KT-N-18
5 2BRSSR86A4975N18
RING WHITE GOLD NATURAL DIAMOND CT. 0.05KT SIZE 18
R-22463-O/BL-BR-0.10KT-N-08
5 2BRSSR86KP463N08
RING WHITE GOLD NATURAL DIAMOND CT. 0.10KT SIZE 08
R-22463-O/BL-BR-0.10KT-N-10
5 2BRSSR86KP463N10
RING WHITE GOLD NATUR

In [4]:
import pandas as pd
import re

# ---------- SETTINGS ----------
# PO number: the first run of exactly six digits delimited by word boundaries
# in the raw text, or "" when the text contains none.
ITEM_PO_NO = next(
    (found.group(0) for found in re.finditer(r"\b\d{6}\b", RAW_TEXT)),
    "",
)

def map_eu(size):
    """Turn a numeric ring size into its "EU <n>" label.

    The label number is the integer value of ``size`` plus 40, so
    "08" -> "EU 48", "09" -> "EU 49", "10" -> "EU 50", and so on.

    Args:
        size: Anything ``int()`` accepts, e.g. a zero-padded digit string.

    Returns:
        The label string, or "" when ``size`` cannot be converted.
    """
    offset = 40  # shifts 8 up to 48
    try:
        eu_number = int(size) + offset
    except Exception:
        # Best effort: an unusable size yields a blank label
        return ""
    return f"EU {eu_number}"

# ---------- PARSE ITEMS ----------
# This pattern captures items with OR without N-XX size.
# One match spans three kinds of lines:
#   1. reference line: group 1 is the run of A-Z / 0-9 / "-" before the
#      literal "-O"; the rest of the line is skipped, except that digits
#      following "N-" directly before the line break become group 2 (optional).
#   2. quantity line: group 3 is the leading digits, followed by whitespace
#      and one A-Z/0-9 token, then the line break.
#   3. description: group 4 starts with capitals/spaces followed by "CT." and
#      then continues lazily. Because of re.DOTALL it may cross line breaks;
#      it stops just before a newline that begins another "...-O" reference,
#      or at the very end of the text ("$" without re.MULTILINE).
pattern = re.compile(
    r"([A-Z0-9\-]+)-O[^\n]*?(?:N-(\d+))?\n(\d+)\s+[A-Z0-9]+\n([A-Z ]+CT\..*?)(?=(?:\n[A-Z0-9\-]+-O|$))",
    re.DOTALL
)

# Collect all matches up front; Sr.NO below is the 1-based match position
matches = list(pattern.finditer(RAW_TEXT))

# Marker after which a captured description is discarded.
# NOTE(review): presumably the start of the page footer in the PDF — confirm.
footer_marker = "Polígono"

items = []
for i, match in enumerate(matches, start=1):
    style_code = match.group(1).strip()       # everything before "-O"
    size = (match.group(2) or "").strip()     # digits after "N-"; may not exist
    order_qty = match.group(3).strip()        # kept as the matched digit string
    desc = match.group(4).strip()

    # The description group is lazy but DOTALL, so it keeps going until the
    # next reference line (or the end of the text) and can swallow trailing
    # non-item text. Trim at the marker for EVERY item, not only the last one,
    # so an item followed by the marker in the middle of the text is cleaned
    # up too. Items whose description lacks the marker are left unchanged.
    cut = desc.find(footer_marker)
    if cut != -1:
        desc = desc[:cut].rstrip()

    # Upper-case once and reuse for all keyword checks
    desc_upper = desc.upper()
    is_white = "WHITE" in desc_upper

    # Detect tone (White / Yellow); anything else leaves the tone blank
    tone = "W" if is_white else ("Y" if "YELLOW" in desc_upper else "")
    metal = f"G750{tone}"  # plain "G750" when no tone was detected

    # Design production instruction
    design_instr = "White Rodium" if is_white else "No Rodium"

    eu_size = map_eu(size) if size else ""

    # Ask user inputs per item (two prompts for every matched item)
    item_label = f"{style_code}-{size or 'NA'}"
    priority = input(f"Enter Priority for item {item_label}: ")
    dia_qlty = input(f"Enter Diamond Quality for item {item_label}: ")

    # Remarks: metal, then the EU size when known, then always the
    # diamond quality typed in by the user
    remark_parts = [metal]
    if eu_size:
        remark_parts.append(eu_size)
    remark_parts.append(f"DIA QUALITY: {dia_qlty}")
    special_remarks = ",".join(remark_parts)

    # Build the output row; blank fields are emitted as empty columns
    item = {
        "Sr.NO": i,
        "Style Code": style_code,
        "ItemSize": eu_size,
        "OrderQty": order_qty,
        "OrderItemPcs": "",
        "Metal": metal,
        "Tone": tone,
        "ItemPoNo.": ITEM_PO_NO,
        "ItemRefNo": "",
        "StockType": "",
        "Priority": priority,
        "MakeType": "",
        "CustomerProductionInstruction": desc,
        "SpecialRemarks": special_remarks,
        "DesignProductionInstruction": design_instr,
        "StampInstruction": "750 +logo",
        "OrderGroup": "JJL",
        "Certificate": "",
        "SKUNo": "",
        "Basestoneminwt": "",
        "Basestonemaxwt": "",
        "Basemetalminwt": "",
        "Basemetalmaxwt": "",
        "Productiondeliverydate": "",
        "Expecteddeliverydate": "",
        "SetPrice": "",
        "StoneQuality": "",
    }
    items.append(item)

# ---------- BUILD THE TABLE ----------
# One row per parsed item, columns in the order the row dicts define them
df = pd.DataFrame(items)

# ---------- SHOW IT ----------
# Lift the column cap so no column is elided from the printed table
pd.options.display.max_columns = None
print(df)

# ---------- OPTIONAL: WRITE TO EXCEL ----------
# Saved into the current working directory, without the index column
df.to_excel(excel_writer="parsed_order_data_integrated_1.xlsx", index=False)


    Sr.NO Style Code ItemSize OrderQty OrderItemPcs  Metal Tone ItemPoNo.  \
0       1   RG044975    EU 49        5               G750W    W    177546   
1       2   RG044975    EU 51        5               G750W    W    177546   
2       3   RG044975    EU 53       10               G750W    W    177546   
3       4   RG044975    EU 56        5               G750W    W    177546   
4       5   RG044975    EU 58        5               G750W    W    177546   
5       6    R-22463    EU 48        5               G750W    W    177546   
6       7    R-22463    EU 50        5               G750W    W    177546   
7       8    R-22463    EU 53        5               G750W    W    177546   
8       9   RG043810    EU 50        5               G750W    W    177546   
9      10   RG043810    EU 52        5               G750W    W    177546   
10     11   RG043810    EU 55        5               G750W    W    177546   
11     12   RG043812    EU 48        5               G750W    W    177546   