In [13]:
!pip install pdfplumber pandas openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [14]:
import pandas as pd  # #PDF TO TEXT PO1
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            print("Extracted text from PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)
            
            # Process the extracted text
            lines = full_text.split('\n')
            current_item = {}
            
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                # Multiple patterns to catch different formats
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data

def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Only include columns that exist in the data
    final_columns = [col for col in columns_order if col in df.columns]
    
    # Add missing columns with empty values
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]

# Main execution
if __name__ == "__main__":
    # Specify your PDF file path here
   
    pdf_file_path = r'C:\Users\Admin\Desktop\UNEEK\Offenbach (OBU)_PO.pdf'  # Change this to your actual PDF path
    
    # Extract data from PDF
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        # Create DataFrame
        df = create_excel_dataframe(extracted_data)
        
        # Display the results
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        
        # Save to Excel file
        output_file = 'extracted_data.xlsx'
        df.to_excel(output_file, index=False)
        print(f"\nData successfully saved to '{output_file}'")
        
        # Display basic statistics
        print(f"\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        
    else:
        print("No data was extracted from the PDF file.")
    
    # Display the DataFrame in notebook
    df

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Reading PDF from: C:\Users\Admin\Desktop\UNEEK\Offenbach (OBU)_PO.pdf
Extracted text from PDF:
Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6458 /
Supplier : 1712557
Order date : September 22, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD106-YG-25 RSBR2074-01 BR0000279D
Description Description
1 2
Diamo 14YG cord bracelet with 4 prong Diamo 14YG 4 prong pendant 0.25, VVS+
pendant 0.25 and stamp 585 and green emerald and
Diamo logo
Article code Your reference
9-DD106-YG-7 RSBR2074-03 BR0000279AA
Description Description
2 4
Diamo 14YG cord bracelet with 4 prong Diamo 14YG 4 prong pendant 0.07, VVS+
pendant 0.07 and stamp 585 and green emerald and
Diamo logo
Purchase order Total Total quantity 6
As contractually agreed al

In [15]:
import pandas as pd #PDF TO TEXT PO2
import re
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """
    Extract data from PDF file and return structured data
    """
    data = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract text from all pages
            full_text = ""
            for page in pdf.pages:
                full_text += page.extract_text() + "\n"
            
            print("Extracted text from PDF:")
            print("=" * 50)
            print(full_text)
            print("=" * 50)
            
            # Process the extracted text
            lines = full_text.split('\n')
            current_item = {}
            
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                    
                # Extract SrNo (looking for lines starting with numbers)
                sr_no_match = re.match(r'^\s*(\d+)\s*', line)
                if sr_no_match:
                    if current_item and 'SrNo' in current_item:  # Save previous item if exists
                        data.append(current_item)
                    current_item = {'SrNo': sr_no_match.group(1), 'OrderQty': 2}
                    
                # Extract Article code (patterns like 9-DD106-YG-25, 9-DD106-YG-7)
                article_match = re.search(r'([A-Z0-9\-]+YG\-?\d*\.?\d+)', line)
                if article_match and 'ArticleCode' not in current_item:
                    current_item['ArticleCode'] = article_match.group(1)
                    
                # Extract StyleCode from Your reference (text after RSBR2074-01, RSBR2074-03, etc.)
                style_match = re.search(r'RSBR2074\-(\d+)\s+([A-Z0-9]+)', line)
                if style_match:
                    current_item['StyleCode'] = style_match.group(2)
                    
                # Extract ItemSize from description (looking for numbers like 0.25, 0.07)
                # Multiple patterns to catch different formats
                size_patterns = [
                    r'pendant\s+(\d+\.\d+)',  # pendant 0.25
                    r'(\d+\.\d+)\s*ct',       # 0.25 ct
                    r'YG[-\s]*(\d+\.\d+)',    # YG-0.25 or YG 0.25
                ]
                
                for pattern in size_patterns:
                    size_match = re.search(pattern, line, re.IGNORECASE)
                    if size_match and 'ItemSize' not in current_item:
                        current_item['ItemSize'] = size_match.group(1)
                        break
            
            # Add the last item
            if current_item and 'SrNo' in current_item:
                data.append(current_item)
                
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return []
    
    return data

def create_excel_dataframe(data):
    """
    Create DataFrame with specified columns
    """
    if not data:
        print("No data extracted from PDF")
        return pd.DataFrame()
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    # Reorder columns as requested
    columns_order = ['SrNo', 'StyleCode', 'ItemSize', 'OrderQty']
    
    # Only include columns that exist in the data
    final_columns = [col for col in columns_order if col in df.columns]
    
    # Add missing columns with empty values
    for col in columns_order:
        if col not in df.columns:
            df[col] = ""
    
    return df[columns_order]

# Main execution
if __name__ == "__main__":
    # Specify your PDF file path here
   
    pdf_file_path = r'C:\Users\Admin\Desktop\UNEEK\Purchase order.pdf' # Change this to your actual PDF path
    
    # Extract data from PDF
    print(f"Reading PDF from: {pdf_file_path}")
    extracted_data = extract_data_from_pdf(pdf_file_path)
    
    if extracted_data:
        # Create DataFrame
        df = create_excel_dataframe(extracted_data)
        
        # Display the results
        print("\nExtracted Data:")
        print("=" * 50)
        print(df)
        
        # Save to Excel file
        output_file = 'extracted_data.xlsx'
        df.to_excel(output_file, index=False)
        print(f"\nData successfully saved to '{output_file}'")
        
        # Display basic statistics
        print(f"\nExtraction Summary:")
        print(f"Total records extracted: {len(df)}")
        print(f"Columns: {list(df.columns)}")
        
    else:
        print("No data was extracted from the PDF file.")
    
    # Display the DataFrame in notebook
    df

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Reading PDF from: C:\Users\Admin\Desktop\UNEEK\Purchase order.pdf
Extracted text from PDF:
Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6299 /
Supplier : 1712557
Order date : August 6, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD028-YG-30-58 RG047376-YG
Description Description
1 1
Diamo prong set ring 14KY 0,30ct Diamo prong set ring 14KY, 0,30ct and
stamp 585 and green emerald and Diamo
logo
Article code Your reference
9-DD035-YG-40-56 RG056063-YG
Description Description
2 1
Diamo open bezel setting solitair ring Diamo open bezel setting solitair ring
14KY 0,40ct 14KY, 0,40ct and stamp 585 and green
emerald and Diamo logo
Purchase order Total Total quantity 2
As contractually agreed all products must pass all standar

In [21]:
# Hello Pratik ignore this code
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display, clear_output
import warnings
warnings.filterwarnings('ignore')

def extract_po_data_from_text(text):
    """
    Extract purchase order data from pasted text and create structured DataFrames
    """
    
    # Initialize dictionaries to store data
    po_header = {}
    line_items = []
    current_item = {}
    
    # Split text into lines
    lines = text.strip().split('\n')
    
    # Extract header information
    for i, line in enumerate(lines):
        line = line.strip()
        
        # Extract PO number
        if line.startswith('PO# :'):
            po_header['PO_Number'] = line.split('PO# :')[-1].strip()
        
        # Extract Supplier
        elif line.startswith('Supplier :'):
            po_header['Supplier_Code'] = line.split('Supplier :')[-1].strip()
        
        # Extract Order date
        elif line.startswith('Order date :'):
            po_header['Order_Date'] = line.split('Order date :')[-1].strip()
        
        # Extract Handled by
        elif line.startswith('Handled by :'):
            po_header['Handled_By'] = line.split('Handled by :')[-1].strip()
        
        # Extract VAT number
        elif line.startswith('Our VAT no. :'):
            po_header['VAT_Number'] = line.split('Our VAT no. :')[-1].strip()
        
        # Extract company addresses
        elif 'Ron Offenbach B.V.' in line:
            po_header['Buyer_Company'] = 'Ron Offenbach B.V.'
            address_lines = []
            for j in range(i+1, min(i+3, len(lines))):
                if lines[j].strip() and not any(x in lines[j] for x in ['Telephone', 'E-mail', 'Internet']):
                    address_lines.append(lines[j].strip())
            po_header['Buyer_Address'] = ', '.join(address_lines)
        
        elif 'Shimayra Jewellery' in line:
            po_header['Supplier_Company'] = 'Shimayra Jewellery'
            address_lines = []
            for j in range(i+1, min(i+4, len(lines))):
                addr_line = lines[j].strip()
                if addr_line and not any(x in addr_line for x in ['Telephone', 'E-mail', 'Internet', 'PO#']):
                    address_lines.append(addr_line)
            po_header['Supplier_Address'] = ', '.join(address_lines)
        
        # Extract contact info
        elif 'Telephone' in line and '+' in line:
            po_header['Buyer_Telephone'] = line.replace('Telephone', '').strip()
        elif 'E-mail' in line and '@' in line:
            po_header['Supplier_Email'] = line.replace('E-mail', '').strip()
        elif 'Internet' in line and 'www.' in line:
            po_header['Supplier_Website'] = line.replace('Internet', '').strip()
        
        # Extract line items
        elif re.match(r'^\d+-\w+-\w+-\d+-\d+', line):
            if current_item and 'Article_Code' in current_item:
                line_items.append(current_item)
                current_item = {}
            current_item['Article_Code'] = line.strip()
        
        elif re.match(r'^RG\d+-[A-Z]+', line):
            current_item['Your_Reference'] = line.strip()
        
        elif 'Diamo' in line and 'ring' in line.lower():
            if 'Description' not in current_item:
                current_item['Description'] = line.strip()
            else:
                current_item['Your_Description'] = line.strip()
        
        elif line.strip().isdigit() and len(line.strip()) == 1:
            current_item['Quantity'] = int(line.strip())
            current_item['Item_Number'] = len(line_items) + 1
    
    # Add the last item if exists
    if current_item and 'Article_Code' in current_item:
        line_items.append(current_item)
    
    # Extract compliance information
    compliance_match = re.search(r'EU Reach Regulation.*?EN 1811:\d{4}', text, re.DOTALL)
    if compliance_match:
        po_header['Compliance_Standards'] = compliance_match.group().strip()
    else:
        po_header['Compliance_Standards'] = 'EU Reach Regulation 1907/2006 and tests EN 1811:2011'
    
    # Extract total quantity
    total_match = re.search(r'Total quantity\s+(\d+)', text)
    if total_match:
        po_header['Total_Quantity'] = int(total_match.group(1))
    else:
        po_header['Total_Quantity'] = sum(item.get('Quantity', 0) for item in line_items)
    
    return po_header, line_items

def create_dataframes(po_header, line_items):
    """
    Create pandas DataFrames from extracted data
    """
    # Create PO header DataFrame
    po_df = pd.DataFrame([po_header])
    
    # Create line items DataFrame
    if line_items:
        items_df = pd.DataFrame(line_items)
        # Ensure Item_Number is set properly
        if 'Item_Number' not in items_df.columns:
            items_df['Item_Number'] = range(1, len(items_df) + 1)
    else:
        items_df = pd.DataFrame(columns=['Item_Number', 'Article_Code', 'Your_Reference', 'Description', 'Quantity'])
    
    return po_df, items_df

# Sample data for testing
SAMPLE_TEXT = """Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6299 /
Supplier : 1712557
Order date : August 6, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD028-YG-30-58 RG047376-YG
Description Description
1 1
Diamo prong set ring 14KY 0,30ct Diamo prong set ring 14KY, 0,30ct and
stamp 585 and green emerald and Diamo
logo
Article code Your reference
9-DD035-YG-40-56 RG056063-YG
Description Description
2 1
Diamo open bezel setting solitair ring Diamo open bezel setting solitair ring
14KY 0,40ct 14KY, 0,40ct and stamp 585 and green
emerald and Diamo logo
Purchase order Total Total quantity 2
As contractually agreed all products must pass all standards (Lead, Cadmium, Nickel,...) as stipulated in the
EU Reach Regulation 1907/2006 and tests EN 1811:2011"""

# Create widgets
text_area = widgets.Textarea(
    value=SAMPLE_TEXT,
    placeholder='Paste your purchase order text here...',
    description='PO Text:',
    layout=widgets.Layout(width='95%', height='200px'),
    style={'description_width': 'initial'}
)

process_btn = widgets.Button(
    description="Process Purchase Order",
    button_style='success',
    tooltip='Click to extract data',
    icon='cog'
)

clear_btn = widgets.Button(
    description="Clear",
    button_style='warning',
    tooltip='Clear all data'
)

download_btn = widgets.Button(
    description="Download Excel",
    button_style='info',
    tooltip='Download as Excel file'
)

output = widgets.Output()

def process_po(b):
    with output:
        clear_output()
        text = text_area.value
        
        if not text.strip():
            print("❌ Please paste purchase order text first!")
            return
        
        try:
            # Extract data
            po_header, line_items = extract_po_data_from_text(text)
            
            # Create DataFrames
            po_df, items_df = create_dataframes(po_header, line_items)
            
            # Display results
            print("✅ PURCHASE ORDER DATA EXTRACTED SUCCESSFULLY!")
            print("="*60)
            
            print("\n📋 PURCHASE ORDER HEADER:")
            print("-" * 40)
            header_display = pd.DataFrame([{k: v for k, v in po_header.items() if k != 'Compliance_Standards'}])
            display(header_display.T.rename(columns={0: 'Value'}))
            
            print(f"\n📊 TOTAL QUANTITY: {po_header.get('Total_Quantity', 0)}")
            
            print("\n🛒 LINE ITEMS:")
            print("-" * 40)
            if not items_df.empty:
                display(items_df)
                
                # Show sorting options
                print(f"\n🎯 SORT LINE ITEMS BY:")
                sort_columns = [col for col in items_df.columns if col != 'Your_Description']
                
                for col in sort_columns:
                    sorted_df = items_df.sort_values(by=col)
                    print(f"\nSorted by {col}:")
                    display(sorted_df[['Item_Number', 'Article_Code', 'Description', 'Quantity']])
            else:
                print("No line items found in the text.")
            
            print("\n⚖️ COMPLIANCE STANDARDS:")
            print("-" * 40)
            print(po_header.get('Compliance_Standards', 'Not specified'))
            
            # Store DataFrames for download
            process_btn.po_df = po_df
            process_btn.items_df = items_df
            process_btn.po_header = po_header
            
            print(f"\n💾 DataFrames created: PO Header ({po_df.shape}), Line Items ({items_df.shape})")
            
        except Exception as e:
            print(f"❌ Error processing text: {e}")
            print("Please check the text format and try again.")

def clear_all(b):
    text_area.value = ""
    with output:
        clear_output()
    print("✅ Text area cleared!")

def download_excel(b):
    if not hasattr(process_btn, 'po_df'):
        with output:
            print("❌ Please process a purchase order first!")
        return
    
    try:
        filename = f"purchase_order_{process_btn.po_header.get('PO_Number', 'unknown')}.xlsx"
        
        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            process_btn.po_df.to_excel(writer, sheet_name='PO_Header', index=False)
            process_btn.items_df.to_excel(writer, sheet_name='Line_Items', index=False)
            
            # Create combined sheet
            combined_data = []
            for idx, item in process_btn.items_df.iterrows():
                combined_row = {**process_btn.po_header, **item.to_dict()}
                combined_data.append(combined_row)
            
            combined_df = pd.DataFrame(combined_data)
            combined_df.to_excel(writer, sheet_name='Combined_Data', index=False)
        
        with output:
            print(f"✅ Excel file '{filename}' downloaded successfully!")
            
    except Exception as e:
        with output:
            print(f"❌ Error downloading Excel file: {e}")

# Set button click events
process_btn.on_click(process_po)
clear_btn.on_click(clear_all)
download_btn.on_click(download_excel)

# Create button layout
button_layout = widgets.HBox([process_btn, clear_btn, download_btn])

# Display the interface
print("🏢 PURCHASE ORDER DATA EXTRACTOR")
print("="*50)
print("Paste your purchase order text below and click 'Process Purchase Order'")

display(text_area)
display(button_layout)
display(output)

# Additional utility functions for manual use
def quick_process(text):
    """
    Quick process function for manual text processing
    """
    po_header, line_items = extract_po_data_from_text(text)
    po_df, items_df = create_dataframes(po_header, line_items)
    
    print("Quick Processing Results:")
    print("PO Header:", po_df.shape)
    print("Line Items:", items_df.shape)
    
    return po_df, items_df

def show_dataframe_info():
    """
    Show available columns and data types
    """
    if hasattr(process_btn, 'po_df'):
        print("📊 DATAFRAME INFORMATION:")
        print("PO Header Columns:", list(process_btn.po_df.columns))
        print("Line Items Columns:", list(process_btn.items_df.columns))
    else:
        print("No data processed yet. Please process a purchase order first.")

# Usage examples cell (run this in a separate cell if needed)
"""
# Example usage for manual processing:
# po_df, items_df = quick_process(your_text_here)

# Example sorting:
# sorted_items = items_df.sort_values('Quantity', ascending=False)
# sorted_items = items_df.sort_values('Article_Code')

# Example filtering:
# high_quantity = items_df[items_df['Quantity'] > 1]
# specific_article = items_df[items_df['Article_Code'].str.contains('YG-30')]
"""


🏢 PURCHASE ORDER DATA EXTRACTOR
Paste your purchase order text below and click 'Process Purchase Order'


Textarea(value='Purchase order\nRon Offenbach B.V.\nDe Wetering 101\n4906 CT Oosterhout\nTelephone +3188664410…

HBox(children=(Button(button_style='success', description='Process Purchase Order', icon='cog', style=ButtonSt…

Output()

NameError: name 'items_df' is not defined

In [30]:
# Text to df (consider this) after pdf to textis done
import pandas as pd
import re
from IPython.display import display

# Paste your PO text here
PO_TEXT = """Purchase order
Ron Offenbach B.V.
De Wetering 101
4906 CT Oosterhout
Telephone +31886644100
Shimayra Jewellery E-mail info@offenbachgroup.com
Internet www.offenbachgroup.com
Plot No.62, SEEPZ-SEZ
IN 400096 Andheri (E), Mumbai
India
PO# : 6299 /
Supplier : 1712557
Order date : August 6, 2025
Handled by : Majenka
Our VAT no. : NL81 12.08.072.B01
# Picture Description Quantity
r
Article code Your reference
9-DD028-YG-30-58 RG047376-YG
Description Description
1 1
Diamo prong set ring 14KY 0,30ct Diamo prong set ring 14KY, 0,30ct and
stamp 585 and green emerald and Diamo
logo
Article code Your reference
9-DD035-YG-40-56 RG056063-YG
Description Description
2 1
Diamo open bezel setting solitair ring Diamo open bezel setting solitair ring
14KY 0,40ct 14KY, 0,40ct and stamp 585 and green
emerald and Diamo logo
Purchase order Total Total quantity 2"""

# Extract and display line items
lines = PO_TEXT.strip().split('\n')
line_items = []
current_item = {}
quantity_pattern = re.compile(r'^(\d+)\s+(\d+)$')  # Pattern for "number number" lines
description_lines = []  # To collect multi-line description

for i, line in enumerate(lines):
    line = line.strip()
    
    # Article code pattern - start new item
    if re.match(r'^\d+-\w+-\w+-\d+-\d+', line):
        if current_item: 
            # Join all description lines before starting new item
            if description_lines:
                current_item['Description'] = ' '.join(description_lines)
                description_lines = []
            line_items.append(current_item)
        current_item = {'Article_Code': line}
        description_lines = []  # Reset description lines for new item
    
    # Your reference pattern
    elif re.match(r'^RG\d+-[A-Z]+', line):
        current_item['Your_Reference'] = line
    
    # Quantity pattern - look for lines with "number number" format
    elif quantity_pattern.match(line):
        match = quantity_pattern.match(line)
        current_item['Quantity'] = int(match.group(2))  # Second number is the quantity
    
    # Description pattern - collect all description lines
    elif 'Diamo' in line or 'stamp' in line.lower() or 'emerald' in line.lower() or 'logo' in line.lower():
        description_lines.append(line)

# Handle the last item
if current_item:
    if description_lines:
        current_item['Description'] = ' '.join(description_lines)
    line_items.append(current_item)

# Create and display DataFrame
if line_items:
    items_df = pd.DataFrame(line_items)
    items_df['Item_Number'] = range(1, len(items_df) + 1)
    
    # Reorder columns
    cols = ['Item_Number', 'Article_Code', 'Your_Reference', 'Description', 'Quantity']
    available_cols = [col for col in cols if col in items_df.columns]
    items_df = items_df[available_cols]
    
    print("🛒 LINE ITEMS:")
    display(items_df)
else:
    print("No line items found.")

items_df.to_csv(r'C:\Users\Admin\Desktop\OBU_JUP2.csv', index=False)

🛒 LINE ITEMS:


Unnamed: 0,Item_Number,Article_Code,Description,Quantity
0,1,9-DD028-YG-30-58 RG047376-YG,"Diamo prong set ring 14KY 0,30ct Diamo prong s...",1
1,2,9-DD035-YG-40-56 RG056063-YG,Diamo open bezel setting solitair ring Diamo o...,1
