# Proofreading Script

In [44]:
# Import necessary libraries
import pandas as pd
import re

In [45]:
# Apply custom rounding rule
def custom_round_seat_height(attr_value: str) -> str:
    """Normalize a seat-height value to the whole-number text to search for.

    Values strictly between 25 and 26 become ``"26"`` and values strictly
    between 30 and 31 become ``"30"``. Every other finite number is rounded
    with the built-in ``round`` (which sends exact .5 ties to the even
    integer, e.g. 24.5 -> 24).

    Args:
        attr_value: Raw attribute text, e.g. ``"25.5"``.

    Returns:
        The normalized height as a string of digits, or ``attr_value``
        lowercased when it is not a finite number.
    """
    try:
        val = float(attr_value)
    except ValueError:
        # Not numeric at all: fall back to a case-insensitive text match.
        return attr_value.lower()

    if 25 < val < 26:
        return "26"
    if 30 < val < 31:
        return "30"

    try:
        return str(round(val))
    except (ValueError, OverflowError):
        # float() parses "nan" / "inf", but round() cannot turn them into an
        # int; treat them like any other non-numeric text instead of crashing.
        return attr_value.lower()

In [46]:
# Column holding the full product name that each attribute is checked against
product_col = "Individual Product Name"

# Attribute columns whose values are looked up inside the product name
attributes = [
    "Collection Name", "Seat Height 1", "Finish Color",
    "Manufactured Woods Use", "General Category", "Upholstery Color",
]

## Functions

In [47]:
# Convert all numeric tokens to a standardized format
def normalize_numbers_in_strings(s: str, precision: int = 2) -> str:
    """Rewrite every unsigned number found in ``s`` with a fixed decimal count.

    Args:
        s: Text that may contain integer or decimal tokens.
        precision: Number of digits to keep after the decimal point.

    Returns:
        ``s`` with each numeric token reformatted; a token that cannot be
        formatted is left exactly as it was.
    """
    number_pattern = r'\d+(?:\.\d+)?'

    def _format_token(found):
        token = found.group()
        try:
            return format(float(token), f".{precision}f")
        except Exception:
            # Best effort: keep the original token rather than fail the call.
            return token

    return re.sub(number_pattern, _format_token, s)

In [48]:
# Check if each attribute's value appears in the product name
def check_attributes(row):
    """Collect the attribute values of ``row`` that are absent from its product name.

    The product name is lowercased and each non-missing, non-empty attribute
    is searched for as a substring. "Seat Height 1" is normalized with
    ``custom_round_seat_height`` first; every other attribute is lowercased.

    Args:
        row: A row providing the ``product_col`` column and every column
            listed in ``attributes``.

    Returns:
        A dict mapping each mismatching attribute name to its value as text;
        empty when every attribute was found.
    """
    raw_name = row[product_col]
    name_lc = "" if pd.isna(raw_name) else str(raw_name).lower()

    not_found = {}
    for column in attributes:
        cell = row[column]
        if pd.isna(cell):
            continue

        text = str(cell)
        if not text:
            continue

        # Seat height goes through the custom rounding rule before the lookup.
        if column == "Seat Height 1":
            needle = custom_round_seat_height(text)
        else:
            needle = text.lower()

        if needle not in name_lc:
            not_found[column] = text

    return not_found

In [49]:
# Load the workbook; file lines 1 and 2 (0-indexed, i.e. the two lines right
# after the header line) are skipped
file_path = "CT-BA-25-001_3132025.xlsx"
rows_to_skip = [1, 2]
df = pd.read_excel(file_path, skiprows=rows_to_skip)

  warn(msg)


In [50]:
# Run the attribute check row by row and store each row's mismatch dict
df['Attribute_Mismatches'] = df.apply(check_attributes, axis="columns")

In [51]:
# Keep only the rows where at least one attribute was flagged
has_mismatch = df['Attribute_Mismatches'].apply(lambda found: len(found) > 0)
mismatch_df = df[has_mismatch]

# Show full cell contents instead of truncating long product names
pd.set_option('display.max_colwidth', None)

report_columns = [product_col, 'Attribute_Mismatches']
print(mismatch_df[report_columns])

                                                 Individual Product Name  \
4  Elle 26 in. Counter Height Rubberwood Barstool with Ivory Fabric Seat   

             Attribute_Mismatches  
4  {'Finish Color': 'Amber Glow'}  
