In [63]:
import os
import re
import pandas as pd
from tqdm import tqdm

# === Inputs / outputs ===
CLUSTER_BASE = r"../data/crops_face/20251105/4_cafe_pos_cluster_faces"
OUTPUT_CSV   = r"../data/cluster_accid_diversity_20251105.csv"

# === AccoID pattern: the digits that follow "AccoID_" in a filename ===
re_accoid = re.compile(r"AccoID_(\d+)")

records = []

# === Build one summary row per cluster directory ===
for cluster in tqdm(sorted(os.listdir(CLUSTER_BASE)), desc="Scanning clusters"):
    cluster_path = os.path.join(CLUSTER_BASE, cluster)
    if os.path.isdir(cluster_path):
        # Only .jpg / .png entries contribute IDs (extension check is case-insensitive).
        image_names = [
            name for name in os.listdir(cluster_path)
            if name.lower().endswith((".jpg", ".png"))
        ]
        hits = [re_accoid.search(name) for name in image_names]

        # Distinct order IDs seen in this cluster; names without an ID are ignored.
        accoids = {int(hit.group(1)) for hit in hits if hit}

        records.append({
            "cluster": cluster,
            "num_unique_orders": len(accoids),
            "accoids": sorted(accoids),
        })

# === Tabulate, most diverse clusters first ===
df = pd.DataFrame(records).sort_values("num_unique_orders", ascending=False)

# === Persist and preview ===
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nâœ… Clusterâ€“AccoID diversity summary saved â†’ {OUTPUT_CSV}")
print(df.head(10))


Scanning clusters: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 287/287 [00:00<00:00, 2249.01it/s]


âœ… Clusterâ€“AccoID diversity summary saved â†’ ../data/cluster_accid_diversity_20251105.csv
           cluster  num_unique_orders  \
286  cluster_noise                148   
160    cluster_173                 13   
274    cluster_295                 12   
273    cluster_294                 11   
275    cluster_296                  9   
277    cluster_298                  9   
285    cluster_306                  8   
124    cluster_131                  7   
206    cluster_222                  7   
281    cluster_302                  7   

                                               accoids  
286  [1168504, 1168525, 1168530, 1168537, 1168544, ...  
160  [1168755, 1168760, 1169052, 1169098, 1169105, ...  
274  [1168537, 1168544, 1168755, 1168760, 1168869, ...  
273  [1168525, 1168552, 1168559, 1168564, 1168788, ...  
275  [1168626, 1168633, 1168715, 1168722, 1168748, ...  
277  [1168811, 1169239, 1169272, 1169279, 1169286, ...  
285  [1168602, 1168833, 1168840, 1168902, 1168907, ...




In [65]:
import os
import re
import shutil
from collections import Counter
from tqdm import tqdm
import pandas as pd

# === PATHS ===
# Destination root for the filtered copies of each cluster.
OUT_BASE = r"../data/crops_face/20251105/4_cafe_pos_cluster_faces_filtered"
# Source root holding the original cluster_* folders.
SRC_BASE = r"../data/crops_face/20251105/4_cafe_pos_cluster_faces"

# Create the destination root up front; a no-op when it already exists.
os.makedirs(OUT_BASE, exist_ok=True)

# === Helper: Extract AccoID from filename ===
def extract_accoid(filename):
    """
    Return the AccoID embedded in a filename, as a string of digits.

    The ID is the first run of digits that directly follows "AccoID_",
    e.g. "AccoID_1150642_20251021_110849.jpg" -> "1150642".
    Returns None when the filename contains no such marker.
    """
    found = re.search(r"AccoID_(\d+)", filename)
    if found is None:
        return None
    return found.group(1)


# === STEP 1: FILTER CLUSTERS TO KEEP MOST FREQUENT ACCOID ===
# Listings are sorted because os.listdir() returns entries in arbitrary order;
# sorting makes the processing order (and tie-breaking below) reproducible.
clusters = sorted(
    d for d in os.listdir(SRC_BASE)
    if d.startswith("cluster_") and os.path.isdir(os.path.join(SRC_BASE, d))
)
print(f"ðŸ§  Found {len(clusters)} clusters to process.\n")

filtered_rows = []  # one row per copied file; feeds the summary CSV below

for cluster in tqdm(clusters, desc="Filtering clusters by most common AccoID"):
    cluster_path = os.path.join(SRC_BASE, cluster)
    out_cluster = os.path.join(OUT_BASE, cluster)
    os.makedirs(out_cluster, exist_ok=True)

    files = sorted(
        f for f in os.listdir(cluster_path) if f.lower().endswith((".jpg", ".png"))
    )

    # Parse every filename exactly once; files without an AccoID are dropped here.
    accoid_by_file = {}
    for f in files:
        accoid = extract_accoid(f)
        if accoid is not None:
            accoid_by_file[f] = accoid
    if not accoid_by_file:
        continue

    # Most frequent AccoID in this cluster. On a tie, Counter keeps the ID that
    # was encountered first, i.e. the one from the alphabetically first filename.
    most_common_accoid = Counter(accoid_by_file.values()).most_common(1)[0][0]

    # Copy only files whose *parsed* AccoID equals the winner. A substring test
    # on the filename would also match a different ID (or any other part of the
    # name) that merely contains the same digits.
    for f, accoid in accoid_by_file.items():
        if accoid != most_common_accoid:
            continue
        shutil.copy2(os.path.join(cluster_path, f), os.path.join(out_cluster, f))
        filtered_rows.append({
            "cluster": cluster,
            "file": f,
            "AccoID": most_common_accoid
        })

print("\nâœ… Filtering complete!")
print(f"ðŸ“‚ Filtered clusters saved to: {OUT_BASE}")

# === STEP 2: UNIQUE ORDER COUNT ===
# Explicit columns keep the "AccoID" lookup valid even when nothing was copied.
filtered_df = pd.DataFrame(filtered_rows, columns=["cluster", "file", "AccoID"])
unique_orders = filtered_df["AccoID"].nunique()

# Write the mapping first so the "saved" message below is only printed on success.
filtered_df.to_csv(os.path.join(OUT_BASE, "filtered_orders_summary.csv"), index=False)

print(f"\nðŸ§¾ Unique AccoID count after filtering: {unique_orders}")
print(f"ðŸ’¾ Detailed mapping saved to filtered_orders_summary.csv")


ðŸ§  Found 287 clusters to process.



Filtering clusters by most common AccoID:   0%|          | 0/287 [00:00<?, ?it/s]

Filtering clusters by most common AccoID: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 287/287 [00:22<00:00, 12.91it/s]


âœ… Filtering complete!
ðŸ“‚ Filtered clusters saved to: ../data/crops_face/20251105/4_cafe_pos_cluster_faces_filtered

ðŸ§¾ Unique AccoID count after filtering: 136
ðŸ’¾ Detailed mapping saved to filtered_orders_summary.csv





In [66]:
import os
import re
import pandas as pd
from tqdm import tqdm

# === Inputs / outputs ===
CLUSTER_BASE = r"../data/crops_face/20251105/4_cafe_pos_cluster_faces_filtered"
OUTPUT_CSV   = r"../data/cluster_accid_diversity_20251107_2.csv"

# === AccoID pattern: the digits that follow "AccoID_" in a filename ===
re_accoid = re.compile(r"AccoID_(\d+)")


def _sorted_accoids_in(dir_path):
    """Return the sorted distinct AccoIDs (as ints) found in .jpg/.png names under dir_path."""
    seen = set()
    for entry in os.listdir(dir_path):
        if entry.lower().endswith((".jpg", ".png")):
            hit = re_accoid.search(entry)
            if hit:
                seen.add(int(hit.group(1)))
    return sorted(seen)


records = []

# === One summary row per sub-directory of CLUSTER_BASE (plain files are skipped) ===
for cluster in tqdm(sorted(os.listdir(CLUSTER_BASE)), desc="Scanning clusters"):
    cluster_path = os.path.join(CLUSTER_BASE, cluster)
    if os.path.isdir(cluster_path):
        order_ids = _sorted_accoids_in(cluster_path)
        records.append({
            "cluster": cluster,
            "num_unique_orders": len(order_ids),
            "accoids": order_ids,
        })

# === Tabulate, most diverse clusters first ===
df = pd.DataFrame(records)
df = df.sort_values("num_unique_orders", ascending=False)

# === Persist and preview ===
df.to_csv(OUTPUT_CSV, index=False)
print(f"\nâœ… Clusterâ€“AccoID diversity summary saved â†’ {OUTPUT_CSV}")
print(df.head(10))


Scanning clusters: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 288/288 [00:00<00:00, 3950.02it/s]


âœ… Clusterâ€“AccoID diversity summary saved â†’ ../data/cluster_accid_diversity_20251107_2.csv
         cluster  num_unique_orders    accoids
0    cluster_000                  1  [1169434]
189  cluster_205                  1  [1169038]
195  cluster_211                  1  [1169514]
194  cluster_210                  1  [1169623]
193  cluster_209                  1  [1169057]
192  cluster_208                  1  [1169182]
191  cluster_207                  1  [1168576]
190  cluster_206                  1  [1168590]
188  cluster_203                  1  [1169136]
197  cluster_213                  1  [1169019]





In [70]:
import os
import cv2
import shutil
import numpy as np
from tqdm import tqdm

# === PATHS ===
SRC_BASE = r"../data/crops_face/20251105/4_cafe_pos_cluster_faces_filtered"
OUT_BASE = r"../data/crops_face/20251105/5_cafe_pos_faces_unique"
os.makedirs(OUT_BASE, exist_ok=True)

# === Load Haar Cascade (for frontal face detection) ===
CASCADE_PATH = "../models/haarcascade_frontalface_default.xml"
face_cascade = cv2.CascadeClassifier(CASCADE_PATH)

# cv2.CascadeClassifier does not raise when the XML is missing or unreadable;
# it yields an empty classifier that only fails later inside detectMultiScale.
# Fail here instead, with a message that names the offending path.
if face_cascade.empty():
    raise FileNotFoundError(f"Could not load Haar cascade from {CASCADE_PATH!r}")

def frontal_score(img):
    """
    Heuristic "frontalness" score for a BGR face crop; higher is better.

    Runs the module-level Haar cascade on the grayscale image and, for the
    largest detection, combines two terms:
      * aspect term  (weight 0.9): 1 - |1 - w/h| of the detected box
      * center term  (weight 0.1): 1 - horizontal offset of the box center
        from the image center, relative to half the image width
    Returns 0.0 when the cascade detects no face.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    detections = face_cascade.detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=4, minSize=(40, 40)
    )
    if len(detections) == 0:
        # Nothing detected: treat as a poor candidate.
        return 0.0

    # Keep the detection with the largest area (w * h).
    x, y, w, h = max(detections, key=lambda box: box[2] * box[3])

    # Horizontal centering: 0 offset means the box is centered in the image.
    half_width = gray.shape[1] / 2
    center_offset = abs((x + w / 2) - half_width) / half_width
    center_score = 1 - center_offset

    # Squareness of the detected box: 1 when w == h.
    aspect_score = 1 - abs(1 - w / float(h))

    return (aspect_score * 0.9) + (center_score * 0.1)

# === Collect the cluster_* directories under SRC_BASE ===
clusters = []
for entry in os.listdir(SRC_BASE):
    if entry.startswith("cluster_") and os.path.isdir(os.path.join(SRC_BASE, entry)):
        clusters.append(entry)

print(f"ðŸ§  Found {len(clusters)} clusters to process for frontal face selection.\n")

# === For each cluster, copy the single highest-scoring image into OUT_BASE ===
for cluster in tqdm(clusters, desc="Selecting most frontal face"):
    cluster_path = os.path.join(SRC_BASE, cluster)
    candidates = [
        name for name in os.listdir(cluster_path)
        if name.lower().endswith((".jpg", ".png"))
    ]

    # Track the first image that achieves the strictly highest score.
    best_file, best_score = None, -1
    for name in candidates:
        img = cv2.imread(os.path.join(cluster_path, name))
        if img is not None:  # unreadable images are skipped
            score = frontal_score(img)
            if score > best_score:
                best_file, best_score = name, score

    # Nothing is copied for clusters with no readable image.
    if best_file:
        shutil.copy2(
            os.path.join(cluster_path, best_file),
            os.path.join(OUT_BASE, f"{cluster}_{best_file}"),
        )

print(f"\nâœ… Most frontal face per cluster saved to: {OUT_BASE}")


ðŸ§  Found 287 clusters to process for frontal face selection.



Selecting most frontal face: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 287/287 [06:24<00:00,  1.34s/it]


âœ… Most frontal face per cluster saved to: ../data/crops_face/20251105/5_cafe_pos_faces_unique





In [126]:
import os
import re
import pandas as pd
from datetime import datetime

# Directory containing unique faces; the next cell lists it and parses each
# image filename into cluster id, AccoID and timestamp.
# NOTE(review): this points at the 20251107 folder, whereas the frontal-face
# selection cell earlier in this notebook writes to
# ../data/crops_face/20251105/5_cafe_pos_faces_unique — confirm the intended date.
folder = "../data/crops_face/20251107/5_cafe_pos_faces_unique"

In [127]:
# Expected name layout: cluster_<id>_AccoID_<id>_<YYYYMMDD>_<HHMMSS>...
name_pattern = re.compile(r"cluster_(\d+)_AccoID_(\d+)_(\d{8})_(\d{6})")

records = []
for file in os.listdir(folder):
    # Only .jpg / .png entries are considered (case-insensitive).
    if not file.lower().endswith(('.jpg', '.png')):
        continue

    # Names that do not start with the expected layout are skipped.
    parsed = name_pattern.match(file)
    if not parsed:
        continue

    cluster_id, acco_id, date_str, time_str = parsed.groups()
    records.append({
        "image_name": file,
        "cluster_id": int(cluster_id),
        "AccoID": int(acco_id),
        "timestamp": datetime.strptime(date_str + time_str, "%Y%m%d%H%M%S"),
    })

# One row per parsed image.
faces_df = pd.DataFrame(records)
print(faces_df.head())

                                       image_name  cluster_id   AccoID  \
0  cluster_000_AccoID_1170854_20251107_091623.png           0  1170854   
1  cluster_001_AccoID_1171262_20251107_112821.png           1  1171262   
2  cluster_002_AccoID_1171659_20251107_135038.png           2  1171659   
3  cluster_004_AccoID_1171659_20251107_135011.png           4  1171659   
4  cluster_006_AccoID_1171574_20251107_132453.png           6  1171574   

            timestamp  
0 2025-11-07 09:16:23  
1 2025-11-07 11:28:21  
2 2025-11-07 13:50:38  
3 2025-11-07 13:50:11  
4 2025-11-07 13:24:53  


In [128]:
# Load accounting data from the dated CSV export into `acc`.
# Later cells keep a subset of its columns and join it to the face table on "AccoID".
# NOTE(review): dtypes are left to pandas' inference — confirm "AccoID" is read as an
# integer column so it matches the int AccoID parsed from the image filenames.
acc = pd.read_csv("../db/cafe_pos_2025-11-07.csv")

In [129]:
# Columns carried forward from the accounting export; everything else is dropped.
acc_columns = [
    "AccoID",
    "AccoDate",
    "AccoDocNo",
    "AccoAmount",
    "Quantity",
    "Discount",
    "UnitPrice",
    "StockCateDesc",
    "StockName",
    "StockDesciption",
]
acc = acc[acc_columns]

In [130]:
# Attach accounting fields to every face row. how="left" keeps faces whose
# AccoID has no accounting match (their accounting columns are NaN).
# NOTE(review): if `acc` holds several rows per AccoID (e.g. one per line item),
# each face row is repeated once per matching row — confirm this is intended.
merged = faces_df.merge(acc, on="AccoID", how="left")

# Fix the column order of the output file.
merged = merged[[
    "image_name", "cluster_id", "AccoID", "AccoDocNo", "AccoDate",
    "timestamp", "AccoAmount", "Quantity", "Discount", "UnitPrice",
    "StockCateDesc", "StockName", "StockDesciption"
]]

# Save as first draft result CSV.
# to_csv does not create missing parent directories, so make sure the dated
# results folder exists first (same pattern as the other output-writing cells).
RESULTS_CSV = "../results/20251107/results_faces_accounting_20251107.csv"
os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)
merged.to_csv(RESULTS_CSV, index=False)
print("âœ… Stage 1 results saved")

âœ… Stage 1 results saved
