# Proofreading Script

## Libraries

In [56]:
# Import the libraries
import pandas as pd
import math
import re

## Data

In [57]:
# Path to the Excel file
# NOTE(review): absolute, machine-specific path; this cell only runs on a
# machine where this exact file exists.
file_path = "/Users/merveogretmek/Desktop/AL/March/14th/Proofreading/CT-BA-25-001_3132025.xlsx"

# Load the Excel file into a DataFrame
# skiprows=[1,2] skips the two rows directly below the first row of the sheet
# (0-indexed positions 1 and 2); presumably extra header/metadata rows --
# TODO confirm against the workbook.
df = pd.read_excel(file_path, skiprows=[1,2])

# Set display option to show full text in columns
# (None removes the column-width limit, so long cell text is printed untruncated)
pd.set_option('display.max_colwidth', None)

  warn(msg)


## Functions

### Custom Rounding Functions

In [58]:
# --- Custom Rounding for Seat Height (for Product Name & Description) ---
def custom_round_seat_height(attr_value: str) -> str:
    """
    Normalize a seat-height value for the Product Name and Description checks.

    Rules:
      - If the value is between 25 and 26 (exclusive), round up to 26.
      - If the value is between 30 and 31 (exclusive), round down to 30.
      - Otherwise, round to the nearest integer, with exact halves rounded
        up (e.g. 24.5 -> 25).

    Args:
        attr_value: The seat height as text, e.g. "25.5".

    Returns:
        The rounded value as a string. If attr_value is not a finite number
        (non-numeric text, "nan", "inf"), the lowercased input is returned
        instead.
    """
    try:
        val = float(attr_value)
    except ValueError:
        return attr_value.lower()  # fallback if conversion fails
    if not math.isfinite(val):
        # "nan" / "inf" parse as floats but cannot be rounded to an integer;
        # treat them like any other non-numeric text.
        return attr_value.lower()
    if 25 < val < 26:
        normalized_val = 26
    elif 30 < val < 31:
        normalized_val = 30
    else:
        # Built-in round() rounds exact halves to the nearest even integer
        # (round(24.5) == 24); floor(x + 0.5) gives the documented
        # round-half-up behaviour instead.
        normalized_val = math.floor(val + 0.5)
    return str(normalized_val)

In [59]:
# --- Custom Rounding for Dimensions (for Bullet Feature) ---
def custom_round_dimension(val: float) -> int:
    """
    Round a product dimension to the whole number expected in the bullet text.

    Rules:
      - If the value is between 39 and 40 (exclusive), always round up to 40.
      - If the value is between 19 and 20 (exclusive), always round up to 20.
      - Otherwise, round to the nearest integer, with exact halves rounded
        up (e.g. 20.5 -> 21).

    Args:
        val: The dimension as a number.

    Returns:
        The rounded dimension as an int.

    Raises:
        ValueError: If val is NaN.
        OverflowError: If val is infinite.
    """
    if 39 < val < 40:
        return 40
    if 19 < val < 20:
        return 20
    # Built-in round() rounds exact halves to the nearest even integer
    # (round(20.5) == 20); floor(x + 0.5) gives round-half-up instead.
    return math.floor(val + 0.5)

### Product Name Check

In [60]:
# --- Product Name Check ---
# Attributes whose values are expected to appear in the product name.
prod_attributes = [
    "Collection Name",
    "Seat Height 1",
    "Finish Color",
    "Manufactured Woods Use",
    "General Category",
    "Upholstery Color"
]

def check_product_name(row):
    """
    Report which attribute values are missing from 'Individual Product Name'.

    Matching is a case-insensitive substring test against the product name.
    'Seat Height 1' is normalized with custom_round_seat_height before the
    test. Attributes whose cell is missing or empty are skipped.

    Returns a dict mapping each attribute that was not found to its original
    (un-normalized) text value; the dict is empty when everything matches.
    """
    name_cell = row["Individual Product Name"]
    haystack = str(name_cell).lower() if pd.notna(name_cell) else ""

    missing = {}
    for attr in prod_attributes:
        cell = row[attr]
        text = str(cell) if pd.notna(cell) else ""
        if not text:
            continue
        # Seat height is compared in its rounded form; everything else verbatim.
        needle = custom_round_seat_height(text) if attr == "Seat Height 1" else text.lower()
        if needle not in haystack:
            missing[attr] = text
    return missing

# Run the product-name check on every row (axis=1); each cell of the new
# column holds that row's mismatch dict, which is empty when nothing is missing.
df["Product_Name_Mismatches"] = df.apply(check_product_name, axis=1)

### Description Check

In [61]:
# --- Description Check ---
# Attributes whose values are expected to appear in the description text.
desc_attributes = [
    "Collection Name",
    "Seat Height 1",
    "General Category",
    "Upholstery Color",
    "Finish Color",
    "Manufactured Woods Use"
]

def check_description(row):
    """
    Report which attribute values are missing from the 'Description' column.

    Each attribute value is searched for as a case-insensitive substring of
    the description. 'Seat Height 1' is passed through
    custom_round_seat_height first. Missing or empty cells are skipped.

    Returns a dict of {attribute: original text value} for every attribute
    that was not found; empty when all are present.
    """
    desc_cell = row["Description"]
    if pd.notna(desc_cell):
        body = str(desc_cell).lower()
    else:
        body = ""

    not_found = {}
    for attr in desc_attributes:
        cell = row[attr]
        if not pd.notna(cell):
            continue
        text = str(cell)
        if not text:
            continue
        if attr == "Seat Height 1":
            # Compare the rounded seat height, not the raw measurement.
            wanted = custom_round_seat_height(text)
        else:
            wanted = text.lower()
        if wanted not in body:
            not_found[attr] = text
    return not_found

# Run the description check on every row (axis=1); each cell of the new
# column holds that row's mismatch dict, which is empty when nothing is missing.
df["Description_Mismatches"] = df.apply(check_description, axis=1)

### Numeric Comparison with Tolerance (≤ 1)

In [62]:
# --- Helper Function for Comparing Numeric Values with Tolerance ---
def compare_with_tolerance(bullet_val, exp_val, tol = 1):
    """
    Decide whether a parsed bullet value disagrees with the expected value.

    Both values are converted with int() and count as a mismatch when their
    absolute difference exceeds tol. If the numeric comparison raises (for
    example because one value is an empty string), the two values are
    compared for plain equality instead.

    Returns a message of the form "Bullet: <bullet> vs Expected: <expected>"
    on a mismatch, otherwise None.
    """
    try:
        gap = abs(int(bullet_val) - int(exp_val))
        differs = bool(gap > tol)
    except Exception:
        # Not comparable as integers: fall back to an exact comparison.
        differs = bool(bullet_val != exp_val)

    if not differs:
        return None
    return f"Bullet: {bullet_val} vs Expected: {exp_val}"

### Bullet Feature 1 Check

In [63]:
# --- Bullet Feature 1 Parsing ---
def parse_bullet_feature(bullet_str):
    """
    Extract the numeric parts of a Bullet Feature 1 string.

    The text is lowercased and searched for:
      - Dimension: {depth}" x {width}" x {height}"  (whole numbers only)
      - Seat Height: {seat}"                        (whole number only)

    Returns a dictionary with keys Bullet_Depth, Bullet_Width, Bullet_Height
    and Bullet_Seat_Height. Each value is the matched digit string, or ""
    when that part was not found or the input is empty or not a string.
    """
    parsed = {"Bullet_Depth": "", "Bullet_Width": "", "Bullet_Height": "", "Bullet_Seat_Height": ""}
    if not (bullet_str and isinstance(bullet_str, str)):
        return parsed

    text = bullet_str.lower()

    # Dimension part: three quoted integers separated by "x".
    dims = re.search(r'dimension:\s*(\d+)"\s*x\s*(\d+)"\s*x\s*(\d+)"', text)
    if dims:
        depth, width, height = dims.groups()
        parsed["Bullet_Depth"] = depth
        parsed["Bullet_Width"] = width
        parsed["Bullet_Height"] = height

    # Seat height part: a single quoted integer.
    seat = re.search(r'seat height:\s*(\d+)"', text)
    if seat:
        parsed["Bullet_Seat_Height"] = seat.group(1)

    return parsed

# Create temporary bullet feature columns by parsing the Bullet Feature 1 text
bullet_features = df["Bullet Feature 1"].apply(parse_bullet_feature)
# Reuse df's index: concat(axis=1) aligns on index labels, so a fresh 0..n-1
# index would misalign rows (and add NaN rows) whenever df's index is not the
# default range.
bullet_features_df = pd.DataFrame(bullet_features.tolist(), index=df.index)
# Drop parsed columns left over from an earlier run of this cell so that
# re-running it does not create duplicate Bullet_* column labels.
df = pd.concat(
    [df.drop(columns=bullet_features_df.columns, errors="ignore"), bullet_features_df],
    axis=1,
)

def check_bullet_feature_details(row):
    """
    Compare the values parsed from Bullet Feature 1 with the row's source columns.

    Expected values:
      - Depth, Width, Height come from Product Depth/Width/Height 1, rounded
        with custom_round_dimension.
      - Seat Height comes from Seat Height 1, rounded with the built-in round().
    An expected value becomes "" when its source cell cannot be converted
    (ValueError or TypeError).

    Each parsed/expected pair is compared with compare_with_tolerance (tol=1).

    Returns a dict keyed by 'Depth', 'Width', 'Height', 'Seat Height' that
    contains only the parts that mismatch, each mapped to its mismatch message.
    """
    def _expected(source_col, rounder):
        # Expected bullet text for one source column, or "" if unusable.
        try:
            return str(rounder(float(row[source_col])))
        except (ValueError, TypeError):
            return ""

    # (result label, parsed bullet column, source column, rounding rule)
    parts = (
        ("Depth", "Bullet_Depth", "Product Depth 1", custom_round_dimension),
        ("Width", "Bullet_Width", "Product Width 1", custom_round_dimension),
        ("Height", "Bullet_Height", "Product Height 1", custom_round_dimension),
        ("Seat Height", "Bullet_Seat_Height", "Seat Height 1", lambda v: int(round(v))),
    )

    # Compute every expected value first, then read the parsed bullet values.
    expected = [_expected(source_col, rounder) for _, _, source_col, rounder in parts]
    parsed = [str(row.get(bullet_col, "")) for _, bullet_col, _, _ in parts]

    details = {}
    for (label, _, _, _), bullet_val, exp_val in zip(parts, parsed, expected):
        message = compare_with_tolerance(bullet_val, exp_val, tol=1)
        if message:
            details[label] = message
    return details

# Run the bullet-feature comparison on every row (axis=1); each cell of the new
# column holds that row's dict of mismatching parts, which is empty when all parts agree.
df["Bullet_Feature_Details"] = df.apply(check_bullet_feature_details, axis=1)

## Mismatch Results

In [64]:
# Product Name mismatches: rows whose mismatch dict is non-empty
print("Product Name mismatches:")
name_issue_mask = df["Product_Name_Mismatches"].apply(lambda found: len(found) > 0)
print(df.loc[name_issue_mask, ["Individual Product Name", "Product_Name_Mismatches"]])

# Description mismatches: rows whose mismatch dict is non-empty
print("\nDescription mismatches:")
desc_issue_mask = df["Description_Mismatches"].apply(lambda found: len(found) > 0)
print(df.loc[desc_issue_mask, ["Description", "Description_Mismatches"]])

# Bullet Feature mismatches (showing breakdown details)
print("\nBullet Feature Breakdown Mismatches:")
bullet_issue_mask = df["Bullet_Feature_Details"].apply(lambda found: len(found) > 0)
bullet_mismatch_df = df[bullet_issue_mask]
print(bullet_mismatch_df[["Bullet Feature 1", "Bullet_Feature_Details"]])

Product Name mismatches:
                                                 Individual Product Name  \
4  Elle 26 in. Counter Height Rubberwood Barstool with Ivory Fabric Seat   

          Product_Name_Mismatches  
4  {'Finish Color': 'Amber Glow'}  

Description mismatches:
Empty DataFrame
Columns: [Description, Description_Mismatches]
Index: []

Bullet Feature Breakdown Mismatches:
                                Bullet Feature 1  \
4  Dimension: 22" x 21" x 33" - Seat Height: 26"   

                                                                                                   Bullet_Feature_Details  
4  {'Depth': 'Bullet: 22 vs Expected: 20', 'Width': 'Bullet: 21 vs Expected: 18', 'Height': 'Bullet: 33 vs Expected: 40'}  
