## Pdf_WO VS. Open sales order.xlsx

In [2]:
mappings = {
    'M.280-SSD-256GB-PCIe44-TLC5WT-T': 'M.280-SSD-256GB-PCIe44-TLC5WT-TD',
    'M.280-SSD-512GB-PCIe44-TLC5WT-T': 'M.280-SSD-512GB-PCIe44-TLC5WT-TD',
    'M.242-SSD-256GB-PCIe34-TLC5WT-T': 'M.242-SSD-256GB-PCIe34-TLC5WT-TD',
    'M.242-SSD-512GB-PCIe34-TLC5WT-T': 'M.242-SSD-512GB-PCIe34-TLC5WT-TD',
    'M.242-SSD-128GB-PCIe34-TLC5WT-T': 'M.242-SSD-128GB-PCIe34-TLC5WT-TD',
    'Cblkit-FP-NRU-230V-AWP_NRU-240S': 'Cblkit-FP-NRU-230V-AWP_NRU-240S-AWP',
}

In [3]:
import os
import pandas as pd
from sqlalchemy import create_engine

# 1. Build engine
DATABASE_DSN = (
    "postgresql://postgres.avcznjglmqhmzqtsrlfg:Czheyuan0227@"
    "aws-0-us-east-2.pooler.supabase.com:6543/postgres?sslmode=require"
)
engine = create_engine(DATABASE_DSN, pool_pre_ping=True)


def fetch_pdf_orders_df_from_supabase(dsn: str) -> pd.DataFrame:
    """
    Reads order_id + extracted_data from public.pdf_file_log and returns a
    two-column DataFrame with ['WO','Product Number'] rows, one per item in JSON.
    """
    eng = create_engine(dsn, pool_pre_ping=True)
    rows = pd.read_sql('SELECT order_id, extracted_data FROM public.pdf_file_log', eng)

    def rows_from_json(extracted_data, order_id=""):
        # extracted_data may be JSON string or dict
        if isinstance(extracted_data, str):
            try:
                extracted_data = json.loads(extracted_data)
            except Exception:
                extracted_data = {}
        data = extracted_data or {}
        wo = data.get("wo", order_id)
        items = data.get("items") or []

        # one row per item; if none, keep a placeholder
        if not items:
            return [{"WO": wo, "Product Number": ""}]
        out = []
        for it in items:
            pn = (
                it.get("product_number")
                or it.get("part_number")
                or it.get("product")
                or it.get("part")
                or ""
            )
            out.append({"WO": wo, "Product Number": pn})
        return out

    all_rows = []
    for _, r in rows.iterrows():
        all_rows.extend(rows_from_json(r.get("extracted_data"), r.get("order_id")))

    return pd.DataFrame(all_rows, columns=["WO", "Product Number"])

pdf_orders_df = fetch_pdf_orders_df_from_supabase(DATABASE_DSN)

pdf_orders_df[pdf_orders_df['WO'] == 'SO-20251329']

Unnamed: 0,WO,Product Number
14587,SO-20251329,POC-410
14588,SO-20251329,DDR4-16GB-32-SM
14589,SO-20251329,M.280-SSD-256GB-SATA-TLC5WT-TD
14590,SO-20251329,Win11IoT24-Entry
14591,SO-20251329,SSD-512GB-TLC5ET-PN


In [4]:
ref = pdf_orders_df.copy()
ref['__pos_out'] = ref.groupby('WO').cumcount()                 # position within WO
ref['__occ'] = ref.groupby(['WO','Product Number']).cumcount()  # occurrence index for duplicates
ref_key = ref[['WO','Product Number','__occ','__pos_out']]
ref[ref['WO'] == 'SO-20251329']

Unnamed: 0,WO,Product Number,__pos_out,__occ
14587,SO-20251329,POC-410,0,0
14588,SO-20251329,DDR4-16GB-32-SM,1,0
14589,SO-20251329,M.280-SSD-256GB-SATA-TLC5WT-TD,2,0
14590,SO-20251329,Win11IoT24-Entry,3,0
14591,SO-20251329,SSD-512GB-TLC5ET-PN,4,0


In [5]:
import json, re, numpy as np, pandas as pd
df_sales_order = pd.read_sql_table("open_sales_orders", con=engine, schema="public")
# df_sales_order = df_sales_order = pd.read_csv(r"C:\Users\Admin\OneDrive - neousys-tech\Share NTA Warehouse\Daily Update\Open Sales Order 9_22_2025.CSV", encoding="ISO-8859-1")

def transform_sales_order(df_sales_order: pd.DataFrame) -> pd.DataFrame:
    df = df_sales_order.copy()
    df = df.rename(columns={"Unnamed: 0": "Component", "Num": "WO_Number", "Backordered": "Qty"})
    df["Component"] = df["Component"].ffill().astype(str).str.strip()
    df = df[~df["Component"].str.startswith("total", na=False)]
    df = df[~df["Component"].str.lower().isin(["forwarding charge", "tariff (estimation)"])]
    if "Inventory Site" in df.columns:
        df = df[df["Inventory Site"] == "WH01S-NTA"]
    df['Component'] = df['Component'].replace(mappings)
    return df

df_sales_order = transform_sales_order(df_sales_order)

# Build df_out from Sales Order 
needed_cols = {
    "Name": "Customer",
    "P. O. #": "Customer PO",
    "WO_Number": "WO",
    "Component": "Product Number",
    "Backordered": "Qty",
    "Ship Date": "Lead Time"
}
for c in ["Customer","PO"]:
    if c not in df_sales_order.columns:
        df_sales_order[c] = ""

df_out = df_sales_order.rename(columns=needed_cols)[list(needed_cols.values())]

# Sort to group visually by WO, then by Product Number
df_out = df_out.sort_values(['WO', 'Product Number']).reset_index(drop=True)

tgt = df_out.copy()
tgt['__occ'] = tgt.groupby(['WO','Product Number']).cumcount()

# 3) Merge positions from output_df to df_out rows (match by WO + Product + occurrence)
merged = tgt.merge(ref_key, on=['WO','Product Number','__occ'], how='left')

# 4) For rows not present in output_df, keep their original within-WO order but push them after the matched ones
merged['__fallback'] = merged.groupby('WO').cumcount()
merged['__pos_out'] = merged['__pos_out'].fillna(np.inf)

df_sales_order[df_sales_order['WO_Number'] == 'SO-20251329']

Unnamed: 0,Component,Type,Date,Ship Date,Deliv Date,Terms,Due Date,WO_Number,P. O. #,Name,Invoiced,Qty,Amount,Item,Rep,Open Balance,Inventory Site,Customer,PO
440,DDR4-16GB-32-SM,Sales Order,09/18/2025,10/03/2025,,Net 30,10/18/2025,SO-20251329,00505698(2),LASERAX INC,0.0,3.0,306.0,Memory Module:DDR4-16GB-32-SM,A702,306.0,WH01S-NTA,,
665,POC-410,Sales Order,09/18/2025,10/03/2025,,Net 30,10/18/2025,SO-20251329,00505698(2),LASERAX INC,0.0,3.0,1536.0,POC-400 Series:POC-410,A702,1536.0,WH01S-NTA,,
829,M.280-SSD-256GB-SATA-TLC5WT-TD,Sales Order,09/18/2025,10/03/2025,,Net 30,10/18/2025,SO-20251329,00505698(2),LASERAX INC,0.0,3.0,183.0,Storage:M.280-SSD-256GB-SATA-TLC5WT-TD,A702,183.0,WH01S-NTA,,
909,SSD-512GB-TLC5ET-PN,Sales Order,09/18/2025,10/03/2025,,Net 30,10/18/2025,SO-20251329,00505698(2),LASERAX INC,0.0,1.0,84.0,Storage:SSD-512GB-TLC5ET-PN,A702,84.0,WH01S-NTA,,
935,Win11IoT24-Entry,Sales Order,09/18/2025,10/03/2025,,Net 30,10/18/2025,SO-20251329,00505698(2),LASERAX INC,0.0,3.0,153.0,Windows OS:Win11IoT24-Entry,A702,153.0,WH01S-NTA,,


In [6]:
merged[merged['WO'] == 'SO-20251329']

Unnamed: 0,Customer,Customer.1,Customer PO,WO,Product Number,Qty,Lead Time,__occ,__pos_out,__fallback
917,LASERAX INC,,00505698(2),SO-20251329,DDR4-16GB-32-SM,3.0,10/03/2025,0,1.0,0
918,LASERAX INC,,00505698(2),SO-20251329,M.280-SSD-256GB-SATA-TLC5WT-TD,3.0,10/03/2025,0,2.0,1
919,LASERAX INC,,00505698(2),SO-20251329,POC-410,3.0,10/03/2025,0,0.0,2
920,LASERAX INC,,00505698(2),SO-20251329,SSD-512GB-TLC5ET-PN,1.0,10/03/2025,0,4.0,3
921,LASERAX INC,,00505698(2),SO-20251329,Win11IoT24-Entry,3.0,10/03/2025,0,3.0,4


In [7]:
inf =merged[merged['__pos_out'] == np.inf]
inf.to_excel(r"C:\Users\Admin\OneDrive - neousys-tech\Desktop\Output.xlsx", sheet_name="Sheet3", index=False)

In [8]:
def reorder_df_out_by_output(output_df: pd.DataFrame, df_out: pd.DataFrame) -> pd.DataFrame:
    # 1) output_df is from PDFs, df_out is from Excel open sales order
    ref = output_df.copy()
    ref['__pos_out'] = ref.groupby('WO').cumcount()                 # position within WO
    ref['__occ'] = ref.groupby(['WO','Product Number']).cumcount()  # occurrence index for duplicates
    ref_key = ref[['WO','Product Number','__occ','__pos_out']]

    # 2) On df_out, tag each duplicate with its own occurrence index
    tgt = df_out.copy()
    tgt['__occ'] = tgt.groupby(['WO','Product Number']).cumcount()

    # 3) Merge positions from output_df to df_out rows (match by WO + Product + occurrence)
    merged = tgt.merge(ref_key, on=['WO','Product Number','__occ'], how='left')

    # 4) For rows not present in output_df, keep their original within-WO order but push them after the matched ones
    merged['__fallback'] = merged.groupby('WO').cumcount()
    merged['__pos_out'] = merged['__pos_out'].fillna(np.inf)

    # 5) Final order: by WO, then by output_df position; if missing, by original order
    ordered = (merged
               .sort_values(['WO','__pos_out','__fallback'])
               .drop(columns=['__occ','__pos_out','__fallback'])
               .reset_index(drop=True))
    return ordered


final_sales_order = reorder_df_out_by_output(pdf_orders_df, df_out)

final_sales_order['Product Number'] = final_sales_order['Product Number'].replace(mappings)

final_sales_order = final_sales_order.loc[:, ~final_sales_order.columns.duplicated()]


In [9]:
final_sales_order[final_sales_order['WO'] == 'SO-20251329']

Unnamed: 0,Customer,Customer PO,WO,Product Number,Qty,Lead Time
917,LASERAX INC,00505698(2),SO-20251329,POC-410,3.0,10/03/2025
918,LASERAX INC,00505698(2),SO-20251329,DDR4-16GB-32-SM,3.0,10/03/2025
919,LASERAX INC,00505698(2),SO-20251329,M.280-SSD-256GB-SATA-TLC5WT-TD,3.0,10/03/2025
920,LASERAX INC,00505698(2),SO-20251329,Win11IoT24-Entry,3.0,10/03/2025
921,LASERAX INC,00505698(2),SO-20251329,SSD-512GB-TLC5ET-PN,1.0,10/03/2025


In [10]:
wo = "SO-20251329"
pdf_set = set(ref.loc[ref['WO'] == wo, 'Product Number'])
xls_set = set(tgt.loc[tgt['WO'] == wo, 'Product Number'])
print("Only in Excel:", xls_set - pdf_set)
print("Only in PDF:",  pdf_set - xls_set)


Only in Excel: set()
Only in PDF: set()
