In [1]:
import pandas as pd #PDF TO TEXT PO2
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract order line items from a PDF file and return them as a list of dicts.

    The text of every page is concatenated, echoed to stdout, and scanned line
    by line. A line that starts with digits opens a new item; subsequent
    matches on that or following lines fill in the item's fields.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A list of dicts. Every dict has 'SrNo' (str) and 'OrderQty' (always the
        hard-coded value 2) and may additionally have 'ArticleCode',
        'StyleCode' and 'ItemSize' when the corresponding pattern matched.
        An empty list is returned when any exception occurs while reading or
        parsing (the error message is printed, not raised).
    """
    # Tried in order for every line; the first pattern that matches supplies
    # ItemSize, and only while the current item has no ItemSize yet.
    size_patterns = [
        r'pendant\s+(\d+\.\d+)',  # pendant 0.25
        r'(\d+\.\d+)\s*ct',       # 0.25 ct
        r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
    ]
    data = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for a page without extractable
            # text; fall back to "" so one such page does not abort the whole
            # document with a TypeError.
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        print("Extracted text from PDF:")
        print("=" * 50)
        print(full_text)
        print("=" * 50)

        current_item = {}
        for line in full_text.split('\n'):
            line = line.strip()
            if not line:
                continue

            # A line starting with digits opens a new item (SrNo).
            sr_no_match = re.match(r'^\s*(\d+)\s*', line)
            if sr_no_match:
                # Flush the previous item; fragments collected before the
                # first SrNo line are dropped.
                if 'SrNo' in current_item:
                    data.append(current_item)
                current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}

            # Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7);
            # only the first one seen per item is kept.
            article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
            if article_match and 'ArticleCode' not in current_item:
                current_item['ArticleCode'] = article_match.group(1)

            # StyleCode is the token following RSBR2074-01, RSBR2074-03, etc.
            # A later match overwrites an earlier one.
            style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
            if style_match:
                current_item['StyleCode'] = style_match.group(2)

            # ItemSize from the description (numbers like 0.25, 0.07).
            if 'ItemSize' not in current_item:
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match:
                        current_item['ItemSize'] = size_match.group(1)
                        break

        # Flush the last item.
        if 'SrNo' in current_item:
            data.append(current_item)

    except Exception as e:
        # Best-effort reader: report the problem and hand back no rows.
        print(f"Error reading PDF file: {e}")
        return []

    return data

def create_excel_dataframe(data):
    """
    Build the output DataFrame with a fixed column layout.

    Args:
        data: List of item dicts as produced by the PDF extraction step.

    Returns:
        A DataFrame whose columns are exactly
        ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty'], in that order. Columns
        absent from every record are added and filled with "". Keys outside
        this list (e.g. 'ArticleCode') are dropped. When ``data`` is empty a
        message is printed and an empty frame with the same four columns is
        returned, so callers see one schema on both paths.
    """
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']

    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame(columns=columns_order)

    # reindex selects/reorders the requested columns and creates the ones that
    # are missing; fill_value only applies to those newly created columns.
    return pd.DataFrame(data).reindex(columns=columns_order, fill_value="")

# Script entry: read the purchase-order PDF and report what was extracted.
if __name__ == "__main__":
    # Location of the PDF to process; edit this to point at your own file.
    pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\FSA\Fredy Sadik (FSA)_PO.pdf'

    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)

    if not extracted_data:
        print("No data was extracted from the PDF file.")
    else:
        df = create_excel_dataframe(extracted_data)

        # Parsed rows, shown under a banner.
        print("\nExtracted Data:", "=" * 50, sep="\n")
        print(df)

        # Excel export is currently switched off; uncomment to write the frame.
        # output_file = 'extracted_data.xlsx'
        # df.to_excel(output_file, index=False)
        # print(f"\nData successfully saved to '{output_file}'")

        # Short summary of the extraction.
        print("\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")

    # To view the frame as a rich notebook output, end the cell with:
    # df

Reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\FSA\Fredy Sadik (FSA)_PO.pdf
Extracted text from PDF:
Shimayra Jewellery
Order 16047304
Plot No 62, SEEPZ - Andheri (E)
Mumbai, 400096
Order Date 4. September 2025 India
Payment Terms
Your Reference MANOR L'ATELIER GOLD 18 KARAT
Vendor Order No.
Requested Delivery Date 4. October 2025
We order the following goods. Quantities, gold kt, diamonds and brillants weight, coloured stone, weight have to be the same as the ones on the
present order forme. The ring sizes (FO) must be exact.
****Neckelace and bracelet length must be exact.****
Unit Total Unit
Supplier Open Calculati Metal metal price % Manuf
reference Description Item No. Size Qty qty on type Weight Weight Manuf Disc Amount
ER0000564A 10210233 STA 3 3 Piece 0.77 0.00 174.72 0 0.00
YG750 42brillants OR 18 CARA - 750
0.09ct H-SI1
TOTAL 10210233 3 3 0.00 0.00
PD0000453A 10210234 42 3 3 Piece 0.52 0.00 86.16 0 0.00
YG750 21 brillants OR 18 CARA - 750
0.05ct H-SI

In [None]:
import re
import pandas as pd
import pdfplumber

# Readable metal description for each single-letter tone code (used in remarks).
TONE_TO_DESCRIPTION = dict(
    Y='YELLOW GOLD',
    W='WHITE GOLD',
    R='ROSE GOLD',
)


def extract_full_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF, each page followed by a newline.

    A page for which ``extract_text()`` yields a falsy value (None or "")
    contributes just the newline.
    """
    with pdfplumber.open(pdf_path) as document:
        page_chunks = [(page.extract_text() or "") + "\n" for page in document.pages]
    return "".join(page_chunks)


def find_po_number(text: str) -> str:
    """Return the digits following the first "Order <digits>" in ``text``, or "" if absent."""
    found = re.search(r"Order\s+(\d+)", text)
    if found is None:
        return ""
    return found.group(1)


def detect_tone(lines_block: list[str]) -> str:
    """Return the tone letter ('W', 'R', 'Y') found in the block, or '' if none.

    The lines are joined with spaces and searched as one string. Explicit
    WG/RG/YG + 750 markers are checked first (in that order, case-sensitive);
    only if none is present are the words WHITE, YELLOW, ROSE checked
    (in that order, case-insensitive).
    """
    blob = " ".join(lines_block)

    # Explicit alloy markers take precedence over colour words.
    marker_rules = (
        ('W', r"\bWG\s*750|WG750"),
        ('R', r"\bRG\s*750|RG750"),
        ('Y', r"\bYG\s*750|YG750"),
    )
    for tone_code, marker in marker_rules:
        if re.search(marker, blob):
            return tone_code

    # Fallback: plain colour words, any letter case.
    word_rules = (
        ('W', r"WHITE"),
        ('Y', r"YELLOW"),
        ('R', r"ROSE"),
    )
    for tone_code, word in word_rules:
        if re.search(word, blob, re.IGNORECASE):
            return tone_code

    return ''


def detect_metal_fineness(lines_block: list[str]) -> str:
    """Return '750' when the block mentions "18 CAR"/"18 CARA" or a standalone 750, else ''.

    Only the 750 fineness is recognised; other fineness values would need
    additional mappings.
    """
    blob = " ".join(lines_block)
    has_carat_label = re.search(r"\b18\s*CARA?\b", blob, re.IGNORECASE) is not None
    has_standalone_750 = re.search(r"\b750\b", blob) is not None
    return '750' if (has_carat_label or has_standalone_750) else ''


def detect_diamond_quality(lines_block: list[str]) -> str:
    """Return the first diamond quality token found in the block, or '' if none.

    A token is one or two uppercase letters, an optional hyphen, and a
    clarity code: SI, VS or VVS with an optional digit, or I with a
    mandatory digit (e.g. "H-SI1", "H-SI", "G-VS", "F-VVS2"). Lines are
    scanned in order and the first match wins. Matching is case-sensitive.

    Args:
        lines_block: Text lines belonging to one item.

    Returns:
        The matched token exactly as it appears in the text, or ''.
    """
    # The digit after SI is optional, like for VS/VVS, so that a grade written
    # without a number ("H-SI") is not silently dropped.
    quality_pattern = re.compile(
        r"\b([A-Z]{1,2}-?SI\d?|[A-Z]{1,2}-?VS\d?|[A-Z]{1,2}-?VVS\d?|[A-Z]{1,2}-?I\d)\b"
    )
    for line in lines_block:
        m = quality_pattern.search(line)
        if m:
            return m.group(1)
    return ''


def detect_carat_line(lines_block: list[str]) -> str:
    """Return the first "18 CARA - 750"-style fragment in the block.

    The fragment is returned exactly as written in the text (matching is
    case-insensitive and tolerant of spacing). When no line contains one,
    the literal '18 CARA - 750' is returned.
    """
    carat_pattern = re.compile(r"\b18\s*CARA?\s*-\s*750\b", re.IGNORECASE)
    hits = (carat_pattern.search(line) for line in lines_block)
    return next((hit.group(0) for hit in hits if hit), '18 CARA - 750')


def is_item_header(line: str) -> bool:
    """Tell whether ``line`` is the first row of an item.

    Two layouts are accepted, both anchored at the start of the line and
    ending in the word "Piece":
      A: STYLE SKU(8 digits) SIZE-or-STA QTY QTY Piece
      B: STYLE SIZE-or-STA QTY QTY Piece   (SKU may appear in the TOTAL line)
    """
    header_layouts = (
        r"^[A-Z0-9\-]{3,}\s+\d{8}\s+(?:STA|\d+)\s+\d+\s+\d+\s+Piece\b",  # layout A
        r"^[A-Z0-9\-]{3,}\s+(?:STA|\d+)\s+\d+\s+\d+\s+Piece\b",          # layout B
    )
    return any(re.search(layout, line) for layout in header_layouts)


def parse_items(text: str) -> list[dict]:
    """Parse the order text into one dict per item (first-generation parser).

    The text is split into stripped, non-empty lines. Every line accepted by
    ``is_item_header`` starts an item; the lines that follow, up to the next
    header or the end of the text, form that item's block. Header fields are
    read from the header line, metal/diamond details from the whole block.

    For each item the user is prompted twice through ``input()`` (priority
    and stamp variable), so this function blocks on stdin once per item.

    Args:
        text: Full text of the order document.

    Returns:
        A list of dicts in document order. Each dict carries the full set of
        output columns; columns this parser does not derive are set to ''.
        'ItemPoNo' is left '' for the caller to fill in. 'OrderQty' is the
        first of the two quantity tokens, kept as a string.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    items: list[dict] = []

    i = 0
    while i < len(lines):
        line = lines[i]
        if not is_item_header(line):
            i += 1
            continue

        # Try Pattern A first (style, 8-digit SKU, size, qty, qty, "Piece")
        mA = re.search(r"^([A-Z0-9\-]{3,})\s+(\d{8})\s+(STA|\d+)\s+(\d+)\s+(\d+)\s+Piece\b", line)
        style_code = ""
        sku_no = ""
        size_token = ""
        order_qty = ""

        if mA:
            style_code = mA.group(1)
            sku_no = mA.group(2)
            size_token = mA.group(3)
            order_qty = mA.group(4)  # take first quantity
        else:
            # Pattern B: STYLE SIZE/STA QTY QTY Piece, find SKU from upcoming TOTAL line
            # NOTE(review): a header that begins with a bare 8-digit number also
            # fits Pattern B, in which case that number lands in style_code.
            mB = re.search(r"^([A-Z0-9\-]{3,})\s+(STA|\d+)\s+(\d+)\s+(\d+)\s+Piece\b", line)
            if mB:
                style_code = mB.group(1)
                size_token = mB.group(2)
                order_qty = mB.group(3)

        # Gather the following lines until the next header (or end of text)
        block_lines = [line]
        j = i + 1
        sku_from_total = ""
        while j < len(lines):
            nxt = lines[j]
            if is_item_header(nxt):
                break
            block_lines.append(nxt)
            # Remember the SKU printed on a "TOTAL <8 digits>" row, but only
            # when the header itself did not provide one.
            total_match = re.search(r"^TOTAL\s+(\d{8})\b", nxt)
            if total_match and not sku_no:
                sku_from_total = total_match.group(1)
            # The block is not cut at the TOTAL row; it runs to the next header.
            j += 1
        i = j  # advance to the next header (or past the end)

        if not sku_no:
            sku_no = sku_from_total

        # Tone, fineness, diamond quality and carat text come from the whole block
        tone = detect_tone(block_lines)
        fineness = detect_metal_fineness(block_lines)
        diamond_quality = detect_diamond_quality(block_lines)
        carat_line = detect_carat_line(block_lines)

        # Build fields: "STA" means no size; numeric sizes get an "EU" prefix.
        # Metal is "G" + fineness + tone letter, e.g. "G750Y".
        item_size = "" if size_token == 'STA' else (f"EU{size_token}" if size_token else "")
        metal = f"G{fineness}{tone}" if fineness and tone else (f"G{fineness}" if fineness else "")

        # Ask user inputs per item (blocks on stdin twice for every item)
        priority = input(f"Enter Priority for SKU {sku_no or style_code}: ").strip()
        stamp_var = input(f"Enter stamp variable for SKU {sku_no or style_code} ('lgd' or leave blank for natural): ").strip().lower()
        # Anything other than exactly 'lgd' is treated as blank.
        stamp_variable_text = 'lgd' if stamp_var == 'lgd' else ''

        # Special remarks: "SKUNo,750 YELLOW GOLD,DIA QLTY: H-SI1"
        # (each of the three parts is included only when available)
        tone_desc = TONE_TO_DESCRIPTION.get(tone, '')
        special_remarks_parts = []
        if sku_no:
            special_remarks_parts.append(sku_no)
        if fineness or tone_desc:
            text_part = " ".join([p for p in [fineness, tone_desc] if p]).strip()
            if text_part:
                special_remarks_parts.append(text_part)
        if diamond_quality:
            special_remarks_parts.append(f"DIA QLTY: {diamond_quality}")
        special_remarks = ",".join(special_remarks_parts)

        # CustomerProductionInstruction combines the carat line and the common sentence.
        # detect_carat_line falls back to a default string, so carat_line is
        # never empty here and the bare-sentence branch is not taken.
        common_sentence = "Polishing and setting must be very well done."
        customer_prod_instruction = f"{carat_line}, {common_sentence}" if carat_line else common_sentence

        # DesignProductionInstruction based on tone.
        # NOTE(review): "rodium"/"rodoium" look like misspellings of "rhodium";
        # confirm whether downstream consumers depend on these exact strings.
        design_prod_instruction = "white rodium" if tone == 'W' else "no rodoium"

        items.append({
            'Sr.No': len(items) + 1,
            'Stylecode': style_code,
            'ItemSize': item_size,
            'OrderQty': order_qty,
            'OrderItemPcs': '',
            'Metal': metal,
            'Tone': tone,
            'ItemPoNo': '',  # fill after we know PO
            'ItemRefNo': '',
            'StockType': '',
            'Priority': priority,
            'MakeType': '',
            'CustomerProductionInstruction': customer_prod_instruction,
            'SpecialRemarks': special_remarks,
            'DesignProductionInstruction': design_prod_instruction,
            # A trailing '+' is stripped when no stamp variable was entered.
            'StampInstruction': f"750+customer logo+{stamp_variable_text}".rstrip('+'),
            'OrderGroup': '',
            'Certificate': '',
            'SKUNo': sku_no,
            'Basestoneminwt': '',
            'Basestonemaxwt': '',
            'Basemetalminwt': '',
            'Basemetalmaxwt': '',
            'Productiondeliverydate': '',
            'Expecteddeliverydate': '',
            'SetPrice': '',
            'StoneQuality': '',
        })

    return items


# Entry point for the new mapping flow: read the PDF, parse the items, stamp
# the PO number on every row, print the table and save it next to the PDF.
import os  # needed only to build the output path below

try:
    # Reuse the path set by the first cell when it has been run; otherwise use
    # the default location. (Raw string: backslashes must NOT be doubled.)
    pdf_path = pdf_file_path if 'pdf_file_path' in globals() else r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\FSA\Fredy Sadik (FSA)_PO.pdf'
    text = extract_full_text_from_pdf(pdf_path)
    po_no = find_po_number(text)
    parsed_items = parse_items(text)

    # Fill PO number for all items
    for it in parsed_items:
        it['ItemPoNo'] = po_no

    # Output columns, in the order they must appear in the sheet
    requested_columns = [
        'Sr.No',
        'Stylecode',
        'ItemSize',
        'OrderQty',
        'OrderItemPcs',
        'Metal',
        'Tone',
        'ItemPoNo',
        'ItemRefNo',
        'StockType',
        'Priority',
        'MakeType',
        'CustomerProductionInstruction',
        'SpecialRemarks',
        'DesignProductionInstruction',
        'StampInstruction',
        'OrderGroup',
        'Certificate',
        'SKUNo',
        'Basestoneminwt',
        'Basestonemaxwt',
        'Basemetalminwt',
        'Basemetalmaxwt',
        'Productiondeliverydate',
        'Expecteddeliverydate',
        'SetPrice',
        'StoneQuality',
    ]

    # reindex orders the columns and creates any missing one filled with ''
    # (this also gives an empty result the full header row).
    df_items = pd.DataFrame(parsed_items).reindex(columns=requested_columns, fill_value='')

    print("Mapped Items Data:")
    print(df_items)

    # Save to Excel in the PDF's directory. os.path handles either path
    # separator and a bare file name, unlike splitting on '\\' by hand.
    out_path = os.path.join(os.path.dirname(pdf_path), 'FSA_mapped_items.xlsx')
    df_items.to_excel(out_path, index=False)
    print(f"Saved to: {out_path}")

except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error in mapping flow: {e}")


Mapped Items Data:
   Sr.No   Stylecode ItemSize OrderQty OrderItemPcs  Metal Tone  ItemPoNo  \
0      1  ER0000564A                 3               G750Y    Y  16047304   
1      2  PD0000453A     EU42        3               G750Y    Y  16047304   
2      3    10212596                25               G750Y    Y  16047304   
3      4    E-115356                 5               G750Y    Y  16047304   

  ItemRefNo StockType Priority MakeType  \
0                                         
1                                         
2                                         
3                                         

                       CustomerProductionInstruction  \
0  18 CARA - 750, Polishing and setting must be v...   
1  18 CARA - 750, Polishing and setting must be v...   
2  18 CARA - 750, Polishing and setting must be v...   
3  18 CARA - 750, Polishing and setting must be v...   

                             SpecialRemarks DesignProductionInstruction  \
0  10210233,750 YELLOW 

In [None]:
import re
import pandas as pd

# Overrides: more robust item header parsing and style detection.
# All header patterns are anchored at the start of the line and end at the
# word "Piece".

# Form A: <style> [tone marker + 750] <8 digits> <STA|size> <qty> <qty> Piece
# Groups: 1 = style token, 2 / 3 = optional tone marker (YG/WG/RG, two spellings),
#         4 = 8-digit number, 5 = "STA" or numeric size, 6 and 7 = the quantities.
HEADER_PAT_A = re.compile(r"^([A-Z0-9\-]{3,})\s+(?:([YWR]G)\s*?750|([YWR]G750))?\s*(\d{8})\s+(STA|\d+)\s+(\d+)\s+(\d+)\s+Piece\b")
# Form B: <style starting with a letter> <STA|size> <qty> <qty> Piece (no 8-digit number).
# Groups: 1 = style token, 2 = "STA" or numeric size, 3 and 4 = the quantities.
HEADER_PAT_B = re.compile(r"^([A-Z][A-Z0-9\-]{2,})\s+(STA|\d+)\s+(\d+)\s+(\d+)\s+Piece\b")
# Form C: <8 digits> <STA|size> <qty> <qty> Piece (line starts with the number, no style).
# Groups: 1 = 8-digit number, 2 = "STA" or numeric size, 3 and 4 = the quantities.
HEADER_PAT_C = re.compile(r"^(\d{8})\s+(STA|\d+)\s+(\d+)\s+(\d+)\s+Piece\b")
# "TOTAL <8 digits>" at the start of a line; group 1 = the 8-digit number.
TOTAL_SKU_PAT = re.compile(r"^TOTAL\s+(\d{8})\b")

# Candidate style token: an uppercase letter, then uppercase letters/hyphens,
# then at least three digits, delimited by word boundaries.
STYLE_TOKEN_PAT = re.compile(r"\b([A-Z][A-Z\-]*\d{3,})\b")
# Metal markers that must never be reported as a style code. The digit-less
# entries cannot be produced by STYLE_TOKEN_PAT, which requires digits.
EXCLUDE_STYLE_TOKENS = {"YG750", "WG750", "RG750", "YG", "WG", "RG"}


def is_item_header_v2(line: str) -> bool:
    """Tell whether ``line`` matches any of the header forms A, B or C."""
    for header_pat in (HEADER_PAT_A, HEADER_PAT_B, HEADER_PAT_C):
        if header_pat.search(line):
            return True
    return False


def find_style_in_block(block_lines: list[str]) -> str:
    """Return the first plausible style code found after the header line.

    Lines are scanned in order, starting with the second line of the block
    (the first is the header). Scanning stops at the first line that begins
    with "TOTAL " (case-insensitive). Within a line, every token matched by
    ``STYLE_TOKEN_PAT`` is considered, left to right, so an excluded metal
    marker such as "YG750" does not hide a style code later on the same line.

    Args:
        block_lines: Header line followed by the item's detail lines.

    Returns:
        The first token that is not in ``EXCLUDE_STYLE_TOKENS``, is not purely
        numeric and is at least five characters long; '' if there is none.
    """
    for ln in block_lines[1:]:  # skip the header itself
        # Nothing after the TOTAL row belongs to the item description.
        if ln.upper().startswith("TOTAL "):
            break
        for m in STYLE_TOKEN_PAT.finditer(ln):
            token = m.group(1)
            if token not in EXCLUDE_STYLE_TOKENS and not token.isdigit() and len(token) >= 5:
                return token
    return ""


def parse_items_v2(text: str) -> list[dict]:
    """Parse the order text into one dict per item (improved parser).

    The text is split into stripped, non-empty lines. Every line accepted by
    ``is_item_header_v2`` starts an item; the lines that follow, up to the
    next header or the end of the text, form that item's block (the TOTAL row
    is part of the block).

    Header forms:
      A - style and 8-digit SKU on the header line;
      B - style only, SKU taken from the block's "TOTAL <8 digits>" row;
      C - SKU only, style looked up in the block via ``find_style_in_block``.

    For each item the user is prompted twice through ``input()`` (priority
    and stamp variable), so this function blocks on stdin once per item.

    Args:
        text: Full text of the order document.

    Returns:
        A list of dicts in document order. Each dict carries the full set of
        output columns; columns this parser does not derive are set to ''.
        'ItemPoNo' is left '' for the caller to fill in. 'OrderQty' is the
        first of the two quantity tokens, kept as a string.
    """
    # Loop-invariant values, built once per call.
    tone_to_desc = {'Y': 'YELLOW GOLD', 'W': 'WHITE GOLD', 'R': 'ROSE GOLD'}
    common_sentence = "Polishing and setting must be very well done."
    # The digit after SI is optional, like for VS/VVS, so that a grade written
    # without a number ("H-SI") is picked up as well as "H-SI1".
    diamond_quality_pat = re.compile(
        r"\b([A-Z]{1,2}-?SI\d?|[A-Z]{1,2}-?VS\d?|[A-Z]{1,2}-?VVS\d?|[A-Z]{1,2}-?I\d)\b"
    )
    carat_line_pat = re.compile(r"\b18\s*CARA?\s*-\s*750\b", re.IGNORECASE)

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    items: list[dict] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        if not is_item_header_v2(line):
            i += 1
            continue

        style_code = ""
        sku_no = ""
        size_token = ""
        order_qty = ""

        # Forms are tried in priority order A, B, C.
        mA = HEADER_PAT_A.search(line)
        mB = HEADER_PAT_B.search(line) if not mA else None
        mC = HEADER_PAT_C.search(line) if not (mA or mB) else None

        if mA:
            style_code = mA.group(1)
            # Groups 2/3 (tone marker on the header) are not read here; tone
            # is detected from the whole block further down.
            sku_no = mA.group(4)
            size_token = mA.group(5)
            order_qty = mA.group(6)  # first qty
        elif mB:
            style_code = mB.group(1)
            size_token = mB.group(2)
            order_qty = mB.group(3)
            # SKU is taken from the TOTAL row of the block
        elif mC:
            # SKU-first header, style comes in following lines
            sku_no = mC.group(1)
            size_token = mC.group(2)
            order_qty = mC.group(3)
        else:
            # Defensive: header check and patterns disagree; skip the line.
            i += 1
            continue

        # Collect block lines until next header or end; include TOTAL line
        block_lines = [line]
        j = i + 1
        sku_from_total = ""
        while j < len(lines):
            nxt = lines[j]
            if is_item_header_v2(nxt):
                break
            block_lines.append(nxt)
            t = TOTAL_SKU_PAT.search(nxt)
            if t:
                sku_from_total = t.group(1)
            j += 1
        i = j  # continue scanning at the next header (or past the end)

        if not sku_no:
            sku_no = sku_from_total

        # If style not known (e.g., SKU-first case), try to find it within the block
        if not style_code:
            style_code = find_style_in_block(block_lines)

        # Tone detection: explicit WG/RG/YG + 750 markers first (in that
        # order), then the colour words WHITE, ROSE, YELLOW (case-insensitive).
        tone = ''
        joined = " ".join(block_lines)
        if re.search(r"\bWG\s*750|WG750", joined):
            tone = 'W'
        elif re.search(r"\bRG\s*750|RG750", joined):
            tone = 'R'
        elif re.search(r"\bYG\s*750|YG750", joined):
            tone = 'Y'
        else:
            if re.search(r"WHITE", joined, re.IGNORECASE):
                tone = 'W'
            elif re.search(r"ROSE", joined, re.IGNORECASE):
                tone = 'R'
            elif re.search(r"YELLOW", joined, re.IGNORECASE):
                tone = 'Y'

        # Fineness detection: "18 CAR"/"18 CARA" or a standalone 750
        fineness = '750' if (re.search(r"\b18\s*CARA?\b", joined, re.IGNORECASE) or re.search(r"\b750\b", joined)) else ''

        # Diamond quality: first matching token, scanning lines in order
        diamond_quality = ''
        for bl in block_lines:
            m = diamond_quality_pat.search(bl)
            if m:
                diamond_quality = m.group(1)
                break

        # Carat line: text as written in the block, else a default when the
        # fineness is known to be 750
        carat_line = ''
        for bl in block_lines:
            m = carat_line_pat.search(bl)
            if m:
                carat_line = m.group(0)
                break
        if not carat_line and fineness == '750':
            carat_line = '18 CARA - 750'

        # Build fields: "STA" means no size; numeric sizes get an "EU" prefix.
        # Metal is "G" + fineness + tone letter, e.g. "G750Y".
        item_size = "" if size_token == 'STA' else (f"EU{size_token}" if size_token else "")
        metal = f"G{fineness}{tone}" if fineness and tone else (f"G{fineness}" if fineness else "")

        # Inputs (blocks on stdin twice for every item)
        priority = input(f"Enter Priority for SKU {sku_no or style_code}: ").strip()
        stamp_var = input(f"Enter stamp variable for SKU {sku_no or style_code} ('lgd' or leave blank for natural): ").strip().lower()
        # Anything other than exactly 'lgd' is treated as blank.
        stamp_variable_text = 'lgd' if stamp_var == 'lgd' else ''

        # Special remarks: "<SKU>,<fineness> <tone description>,DIA QLTY: <grade>"
        # with each part included only when available
        tone_desc = tone_to_desc.get(tone, '')
        parts = []
        if sku_no:
            parts.append(sku_no)
        if fineness or tone_desc:
            txt = " ".join([p for p in [fineness, tone_desc] if p]).strip()
            if txt:
                parts.append(txt)
        if diamond_quality:
            parts.append(f"DIA QLTY: {diamond_quality}")
        special_remarks = ",".join(parts)

        customer_prod_instruction = f"{carat_line}, {common_sentence}" if carat_line else common_sentence
        # NOTE(review): "rodium"/"rodoium" look like misspellings of "rhodium";
        # kept unchanged in case downstream consumers match on these strings.
        design_prod_instruction = "white rodium" if tone == 'W' else "no rodoium"

        items.append({
            'Sr.No': len(items) + 1,
            'Stylecode': style_code,
            'ItemSize': item_size,
            'OrderQty': order_qty,
            'OrderItemPcs': '',
            'Metal': metal,
            'Tone': tone,
            'ItemPoNo': '',  # filled in by the caller once the PO is known
            'ItemRefNo': '',
            'StockType': '',
            'Priority': priority,
            'MakeType': '',
            'CustomerProductionInstruction': customer_prod_instruction,
            'SpecialRemarks': special_remarks,
            'DesignProductionInstruction': design_prod_instruction,
            # A trailing '+' is stripped when no stamp variable was entered.
            'StampInstruction': f"750+customer logo+{stamp_variable_text}".rstrip('+'),
            'OrderGroup': '',
            'Certificate': '',
            'SKUNo': sku_no,
            'Basestoneminwt': '',
            'Basestonemaxwt': '',
            'Basemetalminwt': '',
            'Basemetalmaxwt': '',
            'Productiondeliverydate': '',
            'Expecteddeliverydate': '',
            'SetPrice': '',
            'StoneQuality': '',
        })

    return items

# Re-run with the improved parser while keeping earlier utilities
# (extract_full_text_from_pdf and find_po_number come from the previous cell).
import os  # needed only to build the output path below

try:
    # Reuse the path set by the first cell when it has been run; otherwise use
    # the default location. (Raw string: backslashes must NOT be doubled.)
    pdf_path = pdf_file_path if 'pdf_file_path' in globals() else r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\FSA\Fredy Sadik (FSA)_PO.pdf'
    text = extract_full_text_from_pdf(pdf_path)
    po_no = find_po_number(text)
    parsed_items = parse_items_v2(text)

    # Every row of this order carries the same PO number.
    for it in parsed_items:
        it['ItemPoNo'] = po_no

    # Output columns, in the order they must appear in the sheet
    requested_columns = [
        'Sr.No', 'Stylecode', 'ItemSize', 'OrderQty', 'OrderItemPcs',
        'Metal', 'Tone', 'ItemPoNo', 'ItemRefNo', 'StockType',
        'Priority', 'MakeType', 'CustomerProductionInstruction',
        'SpecialRemarks', 'DesignProductionInstruction', 'StampInstruction',
        'OrderGroup', 'Certificate', 'SKUNo',
        'Basestoneminwt', 'Basestonemaxwt', 'Basemetalminwt', 'Basemetalmaxwt',
        'Productiondeliverydate', 'Expecteddeliverydate', 'SetPrice', 'StoneQuality',
    ]

    # reindex orders the columns and creates any missing one filled with ''
    # (this also gives an empty result the full header row).
    df_items = pd.DataFrame(parsed_items).reindex(columns=requested_columns, fill_value='')

    print("Mapped Items Data (improved parser):")
    print(df_items)

    # Save in the PDF's directory. os.path handles either path separator and
    # a bare file name, unlike splitting on '\\' by hand.
    out_path = os.path.join(os.path.dirname(pdf_path), 'FSA_mapped_items_1.xlsx')
    df_items.to_excel(out_path, index=False)
    print(f"Saved to: {out_path}")
except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    print(f"Error in improved mapping flow: {e}")


Mapped Items Data (improved parser):
   Sr.No   Stylecode ItemSize OrderQty OrderItemPcs  Metal Tone  ItemPoNo  \
0      1  ER0000564A                 3               G750Y    Y  16047304   
1      2  PD0000453A     EU42        3               G750Y    Y  16047304   
2      3    10212596                25                G750       16047304   
3      4     PD01953     EU42       25               G750Y    Y  16047304   
4      5    E-115356                 5               G750Y    Y  16047304   

  ItemRefNo StockType Priority MakeType  \
0                            2            
1                            2            
2                            3            
3                            4            
4                            5            

                       CustomerProductionInstruction  \
0  18 CARA - 750, Polishing and setting must be v...   
1  18 CARA - 750, Polishing and setting must be v...   
2  18 CARA - 750, Polishing and setting must be v...   
3  18 CARA - 750,