# In [1]:
import re
import pandas as pd

def extract_invoice_number(text):
    """Return the first invoice number found in a free-text comment.

    The comment is tested against a list of regular expressions in priority
    order, case-insensitively; the first capture group of the first pattern
    that matches is returned.

    Args:
        text: The comment value. Missing values (``None``/``NaN``) yield
            ``None``. Any other non-string scalar (int, float, NumPy
            number, ...) is converted with ``str()`` before matching.

    Returns:
        The captured text as a string, or ``None`` when no pattern matches.
        The last (fallback) pattern captures whatever sits inside the first
        pair of parentheses, so the result is not guaranteed to be numeric.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    # Order matters: the first pattern that matches wins.
    patterns = [
        r"icp\s*(\d+)",
        r"invoice\s*(\d+)",
        r"م\s*(\d+)",
        r"مستخلص\s*(\d+)",
        r"INVOICE\s*\(?(\d+)\)?",
        r"inv\s*(?:no)?\s*#?\s*(\d+)",
        r"[Ii][Nn][Vv]\s*(?:no)?\s*.?\s*(\d+)",
        r"[Ii][Pp][Cc]\s*\(?(\d+)\)?",
        r"-\s*\(?(\d+)\)?-?",
        r"[Ii]nv\.\s*[Nn]o\.\s*(\d+)",
        r"IPC#\s*(\d+)",
        r"IPC_(\d+)_LRT",
        r"[Aa]pachi [Ii]nv [Nn]o [Pp]\.[Cc]#(\d+)",
        r"\((.*?)\)",  # Fallback: any text inside the first pair of brackets
    ]

    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)

    return None

# Define separate functions to extract each invoice status
def extract_advance_payment(text):
    """Flag comments that mention an advance payment.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Advance Payment"`` when the comment contains one of the Arabic
        phrases in the pattern or the English "advance payment" (matched
        case-insensitively, whitespace between the words optional),
        otherwise ``None``. Missing values yield ``None``.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    advance_payment_pattern = r"دفعة\s*مقدمة|الدفعة\s*المقدمة|الدفعه\s*المقدمة|Advance\s*payment"
    if re.search(advance_payment_pattern, text, re.IGNORECASE):
        return "Advance Payment"
    return None

def extract_part_of_invoice(text):
    """Flag comments that refer to part (or the remainder) of an invoice.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Part of Invoice"`` when the comment contains one of the Arabic
        keywords in the pattern or the substring "part"
        (case-insensitive), otherwise ``None``. Missing values yield
        ``None``. The match is a plain substring search, so "part" inside
        a longer word also counts.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    part_of_invoice_pattern = r"جزء|باقي|باقى|part"
    if re.search(part_of_invoice_pattern, text, re.IGNORECASE):
        return "Part of Invoice"
    return None

def extract_final_invoice(text):
    """Flag comments that mark a final invoice.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Final"`` when the comment contains "final" (case-insensitive)
        or one of the Arabic spellings in the pattern, otherwise ``None``.
        Missing values yield ``None``.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    final_invoice_pattern = r"final|ختامي|ختامى"
    if re.search(final_invoice_pattern, text, re.IGNORECASE):
        return "Final"
    return None

def extract_on_account(text):
    """Flag comments that describe an on-account payment.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"On Account"`` when the comment contains one of the Arabic
        keywords in the pattern or "on acc"/"on account"
        (case-insensitive, whitespace optional), otherwise ``None``.
        Missing values yield ``None``.

    Note:
        The bare Arabic keyword here is also the first word of the phrases
        matched by ``extract_advance_payment``, so those comments are
        flagged by both functions.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    # "on\s*account" is already covered by "on\s*acc"; kept as written.
    on_account_pattern = r"دفعة|دفعه|on\s*acc|on\s*account"
    if re.search(on_account_pattern, text, re.IGNORECASE):
        return "On Account"
    return None

def extract_return(text):
    """Flag comments that contain the Arabic keyword used for a return.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Return"`` when the keyword occurs anywhere in the comment,
        otherwise ``None``. Missing values yield ``None``. The keyword is
        matched as a plain substring, so it also matches inside longer
        words.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    return_pattern = r"رد"
    if re.search(return_pattern, text, re.IGNORECASE):
        return "Return"
    return None

def extract_deduction(text):
    """Flag comments that mention a deduction.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Deduction"`` when the comment contains one of the Arabic
        keywords in the pattern, otherwise ``None``. Missing values yield
        ``None``.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    deduction_pattern = r"خصم|خصومات"
    if re.search(deduction_pattern, text, re.IGNORECASE):
        return "Deduction"
    return None

def extract_retention(text):
    """Flag comments that mention retention.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Retention"`` when the comment contains "retention"
        (case-insensitive) or the Arabic phrase in the pattern (with
        exactly one space between its two words), otherwise ``None``.
        Missing values yield ``None``.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    retention_pattern = r"تامين نهائى|retention"
    if re.search(retention_pattern, text, re.IGNORECASE):
        return "Retention"
    return None

def extract_escalation(text):
    """Flag comments that mention escalation.

    Args:
        text: The comment value; a string, a missing value, or any other
            scalar (which is converted with ``str()`` before matching).

    Returns:
        ``"Escalation"`` when the comment contains the abbreviation
        "esc." (case-insensitive, the dot is required) or the Arabic
        keyword in the pattern, otherwise ``None``. Missing values yield
        ``None``.
    """
    if pd.isna(text):
        return None

    # re.search raises TypeError on anything that is not str/bytes, so
    # stringify every non-string scalar, not only built-in int.
    if not isinstance(text, str):
        text = str(text)

    # The escaped dot means the spelled-out word "escalation" does not match.
    escalation_pattern = r"esc\.|فروق"
    if re.search(escalation_pattern, text, re.IGNORECASE):
        return "Escalation"
    return None

# Load the invoice comments workbook into a DataFrame
df = pd.read_excel('invoice_comments.xlsx')  # Replace 'invoice_comments.xlsx' with your actual file name

# (derived column, extractor) pairs; each extractor is applied to every cell
# of the "comments" column. The tuple order is the order in which the columns
# are added: the invoice number first, then one column per invoice status.
column_extractors = (
    ('invoice_number', extract_invoice_number),
    ('advance_payment', extract_advance_payment),
    ('part_of_invoice', extract_part_of_invoice),
    ('final_invoice', extract_final_invoice),
    ('on_account', extract_on_account),
    ('return', extract_return),
    ('deduction', extract_deduction),
    ('retention', extract_retention),
    ('escalation', extract_escalation),
)
for column_name, extractor in column_extractors:
    df[column_name] = df['comments'].apply(extractor)

# Combine the individual status columns into a single "Invoice Status" column
def combine_statuses(row):
    """Join a row's individual status labels into one display string.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) indexable by the
            eight status column names listed below.

    Returns:
        The non-empty string labels joined with ``' - '`` in column order,
        or an empty string when the row has no status.
    """
    status_columns = (
        'advance_payment',
        'part_of_invoice',
        'final_invoice',
        'on_account',
        'return',
        'deduction',
        'retention',
        'escalation',
    )
    # Keep only real, non-empty labels. A missing value stored as NaN is
    # truthy, so a plain truthiness test would pass it on to str.join,
    # which raises TypeError on non-string items.
    statuses = [
        row[column]
        for column in status_columns
        if isinstance(row[column], str) and row[column]
    ]
    return ' - '.join(statuses)

# Build the combined status label for every row and store it
status_labels = df.apply(combine_statuses, axis=1)
df['invoice_status'] = status_labels

# Make `invoice_number` numeric; values that cannot be parsed become NaN
numeric_invoice_numbers = pd.to_numeric(df['invoice_number'], errors='coerce')
df['invoice_number'] = numeric_invoice_numbers

# Write the enriched table to a new Excel file, without the index column
output_path = "invoice_comments_with_numbers.xlsx"
df.to_excel(output_path, index=False)