In [2]:
pip install pdfplumber pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd  # #PDF TO TEXT PO1
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            print("Extracted text from PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)
            
            # Process the extracted text
            lines = full_text.split('\n')
            current_item = {}
            
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                # Multiple patterns to catch different formats
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data

def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Only include columns that exist in the data
    final_columns = [col for col in columns_order if col in df.columns]
    
    # Add missing columns with empty values
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]

# Main execution
if __name__ == "__main__":
    # Specify your PDF file path here
   
    pdf_file_path = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\OBU\Offenbach (OBU)_PO.pdf'  # Change this to your actual PDF path
    
    # Extract data from PDF
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        # Create DataFrame
        df = create_excel_dataframe(extracted_data)
        
        # Display the results
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        
        # Save to Excel file
        output_file = 'extracted_data.xlsx'
        df.to_excel(output_file, index=False)
        print(f"\nData successfully saved to '{output_file}'")
        
        # Display basic statistics
        print(f"\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        
    else:
        print("No data was extracted from the PDF file.")
    
    # Display the DataFrame in notebook
    df

Reading PDF from: C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\OBU\Offenbach (OBU)_PO.pdf
Extracted text from PDF:
Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6458 /
Supplier : 1712557
Order date : September 22, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD106-YG-25 RSBR2074-01 BR0000279D
Description Description
1 2
Diamo 14YG cord bracelet with 4 prong Diamo 14YG 4 prong pendant 0.25, VVS+
pendant 0.25 and stamp 585 and green emerald and
Diamo logo
Article code Your reference
9-DD106-YG-7 RSBR2074-03 BR0000279AA
Description Description
2 4
Diamo 14YG cord bracelet with 4 prong Diamo 14YG 4 prong pendant 0.07, VVS+
pendant 0.07 and stamp 585 and green emerald and
Diamo logo
Purchase order Total Total quantit

In [15]:
import pandas as pd #PDF TO TEXT PO2
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            print("Extracted text from PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)
            
            # Process the extracted text
            lines = full_text.split('\n')
            current_item = {}
            
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                # Multiple patterns to catch different formats
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data

def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Only include columns that exist in the data
    final_columns = [col for col in columns_order if col in df.columns]
    
    # Add missing columns with empty values
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]

# Main execution
if __name__ == "__main__":
    # Specify your PDF file path here
   
    pdf_file_path = r'C:\Users\Admin\Desktop\UNEEK\Purchase order.pdf' # Change this to your actual PDF path
    
    # Extract data from PDF
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        # Create DataFrame
        df = create_excel_dataframe(extracted_data)
        
        # Display the results
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        
        # Save to Excel file
        output_file = 'extracted_data.xlsx'
        df.to_excel(output_file, index=False)
        print(f"\nData successfully saved to '{output_file}'")
        
        # Display basic statistics
        print(f"\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        
    else:
        print("No data was extracted from the PDF file.")
    
    # Display the DataFrame in notebook
    df

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Reading PDF from: C:\Users\Admin\Desktop\UNEEK\Purchase order.pdf
Extracted text from PDF:
Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6299 /
Supplier : 1712557
Order date : August 6, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD028-YG-30-58 RG047376-YG
Description Description
1 1
Diamo prong set ring 14KY 0,30ct Diamo prong set ring 14KY, 0,30ct and
stamp 585 and green emerald and Diamo
logo
Article code Your reference
9-DD035-YG-40-56 RG056063-YG
Description Description
2 1
Diamo open bezel setting solitair ring Diamo open bezel setting solitair ring
14KY 0,40ct 14KY, 0,40ct and stamp 585 and green
emerald and Diamo logo
Purchase order Total Total quantity 2
As contractually agreed all products must pass all standar

In [30]:
# Text to df (consider this) after pdf to textis done
import pandas as pd
import re
from IPython.display import display

# Paste your PO text here
PO_TEXT = """Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6299 /
Supplier : 1712557
Order date : August 6, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD028-YG-30-58 RG047376-YG
Description Description
1 1
Diamo prong set ring 14KY 0,30ct Diamo prong set ring 14KY, 0,30ct and
stamp 585 and green emerald and Diamo
logo
Article code Your reference
9-DD035-YG-40-56 RG056063-YG
Description Description
2 1
Diamo open bezel setting solitair ring Diamo open bezel setting solitair ring
14KY 0,40ct 14KY, 0,40ct and stamp 585 and green
emerald and Diamo logo
Purchase order Total Total quantity 2"""

# Extract and display line items
lines = PO_TEXT.strip().split('\n')
line_items = []
current_item = {}
quantity_pattern = re.compile(r'^(\d+)\s+(\d+)$')  # Pattern for "number number" lines
description_lines = []  # To collect multi-line description

for i, line in enumerate(lines):
    line = line.strip()
    
    # Article code pattern - start new item
    if re.match(r'^\d+-\w+-\w+-\d+-\d+', line):
        if current_item: 
            # Join all description lines before starting new item
            if description_lines:
                current_item['Description'] = ' '.join(description_lines)
                description_lines = []
            line_items.append(current_item)
        current_item = {'Article_Code': line}
        description_lines = []  # Reset description lines for new item
    
    # Your reference pattern
    elif re.match(r'^RG\d+-[A-Z]+', line):
        current_item['Your_Reference'] = line
    
    # Quantity pattern - look for lines with "number number" format
    elif quantity_pattern.match(line):
        match = quantity_pattern.match(line)
        current_item['Quantity'] = int(match.group(2))  # Second number is the quantity
    
    # Description pattern - collect all description lines
    elif 'Diamo' in line or 'stamp' in line.lower() or 'emerald' in line.lower() or 'logo' in line.lower():
        description_lines.append(line)

# Handle the last item
if current_item:
    if description_lines:
        current_item['Description'] = ' '.join(description_lines)
    line_items.append(current_item)

# Create and display DataFrame
if line_items:
    items_df = pd.DataFrame(line_items)
    items_df['Item_Number'] = range(1, len(items_df) + 1)
    
    # Reorder columns
    cols = ['Item_Number', 'Article_Code', 'Your_Reference', 'Description', 'Quantity']
    available_cols = [col for col in cols if col in items_df.columns]
    items_df = items_df[available_cols]
    
    print("🛒 LINE ITEMS:")
    display(items_df)
else:
    print("No line items found.")

items_df.to_csv(r'C:\Users\Admin\Desktop\OBU_JUP2.csv', index=False)

🛒 LINE ITEMS:


Unnamed: 0,Item_Number,Article_Code,Description,Quantity
0,1,9-DD028-YG-30-58 RG047376-YG,"Diamo prong set ring 14KY 0,30ct Diamo prong s...",1
1,2,9-DD035-YG-40-56 RG056063-YG,Diamo open bezel setting solitair ring Diamo o...,1


In [None]:
# Parser to structure items and build requested DataFrame
import re
import pandas as pd
from IPython.display import display

# Choose source: reuse text from the second cell by re-reading the same PDF path for reproducibility
PDF_PATH = r'C:\Users\Pratik Mali\Desktop\tools\OrderProcessingTool\OBU\Offenbach (OBU)_PO.pdf'

try:
    import pdfplumber
    with pdfplumber.open(PDF_PATH) as pdf:
        text = "\n".join([page.extract_text() or '' for page in pdf.pages])
except Exception as e:
    raise RuntimeError(f'Could not read PDF at {PDF_PATH}: {e}')

# Helper functions
code_token_re = re.compile(r'[A-Z0-9][A-Z0-9\-]*[A-Z0-9]')
sku_first_code_re = re.compile(r'^(?P<sku>\d+-[A-Z]{2}\d{3})')
po_re = re.compile(r'PO#\s*:\s*(\d+)')
article_header_re = re.compile(r'^Article code', re.IGNORECASE)
quantity_line_re = re.compile(r'^(\d+)\s+(\d+)$')  # SrNo and Qty line after Description
style_tone_re = re.compile(r'-([A-Z])$')  # last -<tone>

# Extract the PO number once
po_match = po_re.search(text)
item_po_no = po_match.group(1) if po_match else ''

# Split the text into item blocks starting at "Article code" up to before next "Article code"
lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
blocks = []
current = []
for ln in lines:
    if article_header_re.match(ln):
        if current:
            blocks.append(current)
            current = []
        # include header line in the block to keep relative context
        current.append(ln)
    else:
        if current:
            current.append(ln)
# last block
if current:
    blocks.append(current)

items = []
for b_index, block in enumerate(blocks):
    is_last_block = (b_index == len(blocks) - 1)
    btxt = "\n".join(block)
    # Collect codes appearing soon after the header
    codes = []
    # Find first line after header that contains codes (usually the next 1-2 lines)
    for idx, ln in enumerate(block[1:4], start=1):
        tokens = code_token_re.findall(ln)
        if tokens:
            codes.extend(tokens)
        # Stop early if we already captured at least one dash-containing token
        if len(codes) >= 2:
            break

    # Normalize codes: often appears like '9-DD028-YG-30-58 RG047376-YG' or '9-DD106-YG-25 RSBR2074-01 BR0000279D'
    # Keep only alnum and dashes inside tokens
    codes = [re.sub(r'[^A-Z0-9\-]', '', c) for c in codes]

    # Identify SKU (first code), ItemRefNo (second code when three codes present), StyleCode (last of 2 or 3)
    sku_full = codes[0] if len(codes) >= 1 else ''
    item_ref_no = codes[1] if len(codes) == 3 else ''
    style_code = codes[2] if len(codes) == 3 else (codes[1] if len(codes) == 2 else '')

    # ItemSize: from SKU when it contains YG-...-<size> (second number after YG)
    item_size = ''
    if 'YG' in sku_full:
        # capture the segment after YG-
        m = re.search(r'YG-([0-9]+)(?:-[0-9]+)?(?:-[0-9]+)?', sku_full)
        if m:
            parts = re.split(r'-', sku_full)
            # After 'YG', the next numeric parts are item sizes; we want the second numeric after YG
            after_yg = sku_full.split('YG', 1)[1].lstrip('-')
            nums = re.findall(r'\d+', after_yg)
            if len(nums) >= 2:
                item_size = nums[1]

    # Tone: last '-<letter>' in style_code (keep StyleCode as-is including -YG if present)
    tone = ''
    if style_code:
        mt = style_tone_re.search(style_code)
        tone = mt.group(1) if mt else ''

    # Find SrNo and OrderQty from the line that has two integers after 'Description' section marker
    sr_no = ''
    order_qty = ''
    order_item_pcs = ''

    # Locate the 'Description' marker within the block and then search following few lines for the pattern
    try:
        desc_idx = next(i for i, ln in enumerate(block) if ln.lower().startswith('description'))
    except StopIteration:
        desc_idx = None

    if desc_idx is not None:
        for ln in block[desc_idx:desc_idx+5]:
            qm = quantity_line_re.match(ln)
            if qm:
                sr_no = qm.group(1)
                order_qty = qm.group(2)
                order_item_pcs = order_qty
                break

    # CustomerProductionInstruction and StampInstruction split
    # Heuristic: split at the first occurrence of 'stamp' (case-insensitive) or when a new sentence begins mentioning 'stamp 585' etc.
    customer_instr = ''
    stamp_instr = ''
    # Collect lines likely belonging to description (exclude header and code lines and the quantity line)
    desc_lines = []
    for ln in block:
        if article_header_re.match(ln):
            continue
        if quantity_line_re.match(ln):
            continue
        # skip lines that are just the labels 'Description Description'
        if ln.lower().startswith('description'):
            continue
        # skip lines that are just codes
        if code_token_re.fullmatch(ln.replace(' ', '')):
            continue
        desc_lines.append(ln)
    full_desc = ' '.join(desc_lines)

    # If this is the last item, trim anything from 'Purchase order Total' onward from the description text
    if is_last_block:
        pot_idx = re.search(r'Purchase order Total', full_desc, flags=re.IGNORECASE)
        if pot_idx:
            full_desc = full_desc[:pot_idx.start()].strip()

    split_match = re.search(r'\b(stamp\b.*)', full_desc, flags=re.IGNORECASE)
    if split_match:
        before = full_desc[:split_match.start()].strip()
        after = full_desc[split_match.start():].strip()
        customer_instr = before
        stamp_instr = after
    else:
        customer_instr = full_desc
        stamp_instr = ''

    # Remove trailing 'and' at the end of CustomerProductionInstruction
    customer_instr = re.sub(r'\s*\band\b\s*$', '', customer_instr, flags=re.IGNORECASE).strip()

    # Certificate: add 'IGI Certified' if the last numeric chunk equals 100, else blank
    certificate = ''
    if sku_full:
        nums = re.findall(r'\d+', sku_full)
        if nums and nums[-1] == '100':
            certificate = 'IGI Certified'

    # SKUNo: take '9-DD106' from '9-DD106-YG-25' (first code up to second dash group)
    sku_no = ''
    if sku_full:
        m = re.match(r'^(\d+-[A-Z]{2}\d{3})', sku_full)
        if m:
            sku_no = m.group(1)
        else:
            # fallback: first two dash-separated chunks
            parts = sku_full.split('-')
            if len(parts) >= 2:
                sku_no = parts[0] + '-' + parts[1]

    items.append({
        'SrNo': sr_no,
        'StyleCode': style_code,
        'ItemSize': item_size,
        'OrderQty': order_qty,
        'OrderItemPcs': order_item_pcs,
        'Metal': '',
        'Tone': tone,
        'ItemPoNo': item_po_no,
        'ItemRefNo': item_ref_no,
        'StockType': '',
        'MakeType': '',
        'CustomerProductionInstruction': customer_instr,
        'SpecialRemarks': '',
        'DesignProductionInstruction': '',
        'StampInstruction': stamp_instr,
        'OrderGroup': '',
        'Certificate': certificate,
        'SKUNo': sku_no,
        'Basestoneminwt': '',
        'Basestonemaxwt': '',
        'Basemetalminwt': '',
        'Basemetalmaxwt': '',
        'Productiondeliverydate': '',
        'Expecteddeliverydate': '',
        '': '',  # blank column
        'SetPrice': '',
        'StoneQuality': 'VVS+' if re.search(r'\bVVS\+\b', btxt) else ''
    })

# Build DataFrame with exact column order
columns_order = [
    'SrNo','StyleCode','ItemSize','OrderQty','OrderItemPcs','Metal','Tone','ItemPoNo','ItemRefNo',
    'StockType','MakeType','CustomerProductionInstruction','SpecialRemarks','DesignProductionInstruction',
    'StampInstruction','OrderGroup','Certificate','SKUNo','Basestoneminwt','Basestonemaxwt','Basemetalminwt',
    'Basemetalmaxwt','Productiondeliverydate','Expecteddeliverydate','', 'SetPrice','StoneQuality'
]
result_df = pd.DataFrame(items, columns=columns_order)

# Save and display
output_file = 'structured_items_1.xlsx'
result_df.to_excel(output_file, index=False)
print('Structured items saved to', output_file)
display(result_df)


Structured items saved to structured_items_1.xlsx


Unnamed: 0,SrNo,StyleCode,ItemSize,OrderQty,OrderItemPcs,Metal,Tone,ItemPoNo,ItemRefNo,StockType,...,SKUNo,Basestoneminwt,Basestonemaxwt,Basemetalminwt,Basemetalmaxwt,Productiondeliverydate,Expecteddeliverydate,Unnamed: 19,SetPrice,StoneQuality
0,1,BR0000279D,,2,2,,,6458,RSBR2074-01,,...,9-DD106,,,,,,,,,
1,2,BR0000279AA,,4,4,,,6458,RSBR2074-03,,...,9-DD106,,,,,,,,,
