# In [18]:
import re
import pandas as pd

# Read the invoice comments workbook into a DataFrame (change the file name here to process a different file).
# The rest of the script reads its text from a column named "comments".
df = pd.read_excel('invoice_comments.xlsx')

# Invoice-number patterns, tried in this order; the first pattern that matches
# anywhere in the text wins. All are case-insensitive and capture the digits
# in group 1. Compiled once here instead of on every call.
_INVOICE_NUMBER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"icp\s*(\d+)",              # "icp" followed by a number
        r"invoice\s*(\d+)",        # "invoice" followed by a number
        r"م\s*(\d+)",               # the letter "م" followed by a number
        r"مستخلص\s*(\d+)",     # "مستخلص" followed by a number
        r"INVOICE\s*\(?(\d+)\)?",  # "invoice" followed by a number, optionally in parentheses
        r"inv\s*(?:no)?\s*#?\s*(\d+)",  # "inv", optional "no" and/or "#", then a number
        r"[Ii][Nn][Vv]\s*(?:no)?\s*.?\s*(\d+)",  # "inv", optional "no", at most ONE arbitrary character, then a number
        r"[Ii][Pp][Cc]\s*\(?(\d+)\)?",  # "ipc" followed by a number, optionally in parentheses
        r"-\s*\(?(\d+)\)?-?",  # a hyphen followed by a number, optionally in parentheses
    )
)


def extract_invoice_number(text):
    """Return the first invoice number found in *text*, or None.

    The patterns in ``_INVOICE_NUMBER_PATTERNS`` are tried in order and the
    digits captured by the first one that matches are returned.

    Args:
        text: A comment cell. Anything that is not a ``str`` (None, NaN,
            numbers, dates, ...) is treated as having no invoice number.

    Returns:
        The invoice number as a string of ASCII digits (leading zeros are
        kept), or None when no pattern matches.
    """
    # Missing values (None/NaN) and non-text cells cannot be searched;
    # re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None

    for pattern in _INVOICE_NUMBER_PATTERNS:
        match = pattern.search(text)
        if match:
            # \d also matches non-ASCII decimal digits (e.g. Arabic-Indic
            # "٣"). Map every digit to its ASCII form so that a later numeric
            # conversion does not depend on the parser accepting them.
            return "".join(str(int(digit)) for digit in match.group(1))

    return None


# Define separate functions to extract each invoice status
def extract_advance_payment(text):
    """Return "Advance Payment" if *text* mentions an advance payment.

    Matching is case-insensitive and accepts the English phrase
    "Advance payment" as well as several Arabic spellings; the whitespace
    between the two words is optional.

    Args:
        text: A comment cell. Anything that is not a ``str`` (None, NaN,
            numbers, ...) is treated as not mentioning an advance payment.

    Returns:
        The label "Advance Payment", or None when nothing matches.
    """
    # Missing values and non-text cells cannot be searched; re.search would
    # raise TypeError on them.
    if not isinstance(text, str):
        return None
    advance_payment_pattern = r"دفعة\s*مقدمة|الدفعة\s*المقدمة|الدفعه\s*المقدمة|Advance\s*payment"
    if re.search(advance_payment_pattern, text, re.IGNORECASE):
        return "Advance Payment"
    return None

def extract_part_of_invoice(text):
    """Return "Part of Invoice" if *text* mentions a partial payment.

    Matching is case-insensitive and looks for "part" or one of the Arabic
    words "جزء", "باقي", "باقى".

    Args:
        text: A comment cell. Anything that is not a ``str`` (None, NaN,
            numbers, ...) is treated as not mentioning a partial payment.

    Returns:
        The label "Part of Invoice", or None when nothing matches.
    """
    # Missing values and non-text cells cannot be searched; re.search would
    # raise TypeError on them.
    if not isinstance(text, str):
        return None
    # NOTE: this is a plain substring search, so "part" also matches inside
    # longer words such as "department".
    part_of_invoice_pattern = r"جزء|باقي|باقى|part"
    if re.search(part_of_invoice_pattern, text, re.IGNORECASE):
        return "Part of Invoice"
    return None

def extract_final_invoice(text):
    """Return "Final" if *text* mentions a final invoice.

    Matching is case-insensitive and looks for "final" or one of the Arabic
    spellings "ختامي", "ختامى".

    Args:
        text: A comment cell. Anything that is not a ``str`` (None, NaN,
            numbers, ...) is treated as not mentioning a final invoice.

    Returns:
        The label "Final", or None when nothing matches.
    """
    # Missing values and non-text cells cannot be searched; re.search would
    # raise TypeError on them.
    if not isinstance(text, str):
        return None
    final_invoice_pattern = r"final|ختامي|ختامى"
    if re.search(final_invoice_pattern, text, re.IGNORECASE):
        return "Final"
    return None

# Run every extractor over the "comments" column and store each result in a
# column of its own. The columns are created in the order listed here:
# the invoice number first, then one column per invoice status.
for _column, _extractor in (
    ('invoice_number', extract_invoice_number),
    ('advance_payment', extract_advance_payment),
    ('part_of_invoice', extract_part_of_invoice),
    ('final_invoice', extract_final_invoice),
):
    df[_column] = df['comments'].apply(_extractor)

# Combine the individual status columns into a single "Invoice Status" column
def combine_statuses(row):
    """Join the status labels present in *row* into a single string.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) with the keys
            'advance_payment', 'part_of_invoice' and 'final_invoice'.

    Returns:
        The labels that are present, in that column order, separated by
        " - ". An empty string when the row has no status at all.
    """
    status_columns = ('advance_payment', 'part_of_invoice', 'final_invoice')
    statuses = []
    for column in status_columns:
        value = row[column]
        # Only non-empty strings are labels. A missing status may show up as
        # NaN rather than None; NaN is truthy, so a plain truthiness test
        # would pass it on to str.join, which raises TypeError.
        if isinstance(value, str) and value:
            statuses.append(value)
    return ' - '.join(statuses)

# Build the combined status text for every row (axis=1 passes one row at a time to combine_statuses)
df['invoice_status'] = df.apply(combine_statuses, axis=1)

# Convert `invoice_number` to numeric; errors='coerce' turns values that cannot be parsed into NaN instead of raising
df['invoice_number'] = pd.to_numeric(df['invoice_number'], errors='coerce')

# Save the DataFrame to a new Excel file, leaving out the index column
df.to_excel("invoice_comments_with_numbers.xlsx", index=False)