In [None]:
import re
import pandas as pd

# Load the exported comments sheet (CSV) into a DataFrame
df = pd.read_csv(filepath_or_buffer='comments_field.xlsx - Collection.csv')

# Invoice-number patterns, compiled once. They are tried in this order and the
# first pattern that matches anywhere in the text wins, so the order is part of
# the behavior. All patterns are applied case-insensitively.
_INVOICE_NUMBER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"icp\s*(\d+)",              # "icp" followed by a number
        r"invoice\s*(\d+)",        # "invoice" followed by a number
        r"م\s*(\d+)",               # "م" followed by a number
        r"مستخلص\s*(\d+)",     # "مستخلص" followed by a number
        r"INVOICE\s*\(?(\d+)\)?",  # "invoice" with the number optionally in parentheses
    )
)


def extract_invoice_number(text):
    """Extract the first invoice number found in a free-text comment.

    Args:
        text: The comment value. Missing values (``None``, ``NaN``, ``pd.NA``)
            yield ``None``. Non-string values are converted with ``str()``
            before matching instead of raising ``TypeError``.

    Returns:
        The digits of the invoice number as a string of ASCII digits (leading
        zeros are preserved), or ``None`` when no pattern matches.
    """
    if pd.isna(text):
        return None

    # Cells read from a mixed-type column may be ints/floats; the regex
    # functions only accept strings.
    if not isinstance(text, str):
        text = str(text)

    for pattern in _INVOICE_NUMBER_PATTERNS:
        match = pattern.search(text)
        if match:
            # `\d` matches any Unicode decimal digit (e.g. Arabic-Indic digits).
            # Map each digit to its ASCII form so the result can be parsed as a
            # number downstream; converting digit by digit keeps leading zeros.
            return "".join(str(int(digit)) for digit in match.group(1))

    return None

def extract_invoice_status(text):
    """Report whether a free-text comment marks the invoice as final.

    Args:
        text: The comment value. Missing values (``None``, ``NaN``, ``pd.NA``)
            yield ``None``. Non-string values are converted with ``str()``
            before matching instead of raising ``TypeError``.

    Returns:
        ``"Final"`` when the text contains "final" (case-insensitive),
        "ختامي" or "ختامى"; otherwise ``None``.
    """
    if pd.isna(text):
        return None

    # Cells read from a mixed-type column may be ints/floats; the regex
    # functions only accept strings.
    if not isinstance(text, str):
        text = str(text)

    # Both Arabic spellings are listed because the final letter varies.
    if re.search(r"final|ختامي|ختامى", text, re.IGNORECASE):
        return "Final"

    return None

# Derive the invoice number and invoice status from the free-text `COMMENTS` column
df['invoice_number'] = df['COMMENTS'].apply(extract_invoice_number)
df['invoice_status'] = df['COMMENTS'].apply(extract_invoice_status)

# Keep only the rows where an invoice number was found. The result is copied
# explicitly so that the column assignment below operates on an independent
# DataFrame and does not trigger pandas' SettingWithCopyWarning.
df_filtered = df.dropna(subset=['invoice_number']).copy()

# Convert the extracted digit strings in `invoice_number` to a numeric dtype
df_filtered['invoice_number'] = pd.to_numeric(df_filtered['invoice_number'])

# Save the filtered rows to an Excel file, without the index column
df_filtered.to_excel("comments_with_invoice_details.xlsx", index=False)