In [None]:
import pandas as pd
import re
# Load the raw model outputs; the frame carries an `entity_name` and a free-text
# `model_output` column that the later cells turn into predictions.
df = pd.read_csv(filepath_or_buffer="/content/final_output.csv")
df

Unnamed: 0.1,Unnamed: 0,index,entity_name,model_output
0,0,0,height,2.68 cm
1,1,1,width,The width of the item is 15.5 cm.
2,2,2,height,200mm
3,3,3,depth,20cm
4,4,4,depth,10.5 inches
...,...,...,...,...
131182,35182,131283,maximum_weight_recommendation,"1,500 LB"
131183,35183,131284,item_weight,The image does not provide any information abo...
131184,35184,131285,maximum_weight_recommendation,The image does not provide a maximum weight re...
131185,35185,131286,item_weight,The image does not provide any information abo...


In [None]:
# Lookup tables used to normalise the units found in the model output.
#
# unit_mapping    : abbreviation / plural / symbol (lower-case) -> canonical unit name
# entity_unit_map : entity name -> set of canonical units that are valid for it
# default_units   : entity name -> single-element set holding the fallback unit
# allowed_units   : union of every canonical unit in entity_unit_map
unit_mapping = {
    # Length
    'cm': 'centimetre', 'mm': 'millimetre', 'm': 'metre',
    'in': 'inch', 'ft': 'foot', 'yd': 'yard', 'inches': 'inch',
    '"': 'inch',  # a double quote after a number denotes inches
    # Weight
    'g': 'gram', 'kg': 'kilogram', 'mcg': 'microgram', 'mg': 'milligram',
    'oz': 'ounce', 'lb': 'pound', 'lbs': 'pound', 't': 'ton',
    # Voltage
    'kv': 'kilovolt', 'mv': 'millivolt', 'v': 'volt',
    # Wattage
    'kw': 'kilowatt', 'w': 'watt',
    # Volume
    'cl': 'centilitre', 'cuft': 'cubic foot', 'cuin': 'cubic inch', 'cup': 'cup',
    'dl': 'decilitre', 'fl': 'fluid ounce', 'fl oz': 'fluid ounce',
    'gal': 'gallon', 'igal': 'imperial gallon',
    'l': 'litre', 'mcl': 'microlitre', 'ml': 'millilitre', 'pt': 'pint', 'qt': 'quart',
}

# Canonical unit families; copied into entity_unit_map so every entity owns its own set.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
    'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
)

entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': set(_VOLUME_UNITS),
}

# Fallback unit per entity. The values stay single-element sets because the
# consumer reads them with list(default_units[entity])[0].
# The length default is spelled 'metre' so that it matches entity_unit_map
# (the previous 'meter' spelling was not an allowed unit).
default_units = {
    'width': {'metre'},
    'depth': {'metre'},
    'height': {'metre'},
    'item_weight': {'gram'},
    'maximum_weight_recommendation': {'gram'},
    'voltage': {'volt'},
    'wattage': {'watt'},
    'item_volume': {'litre'},
}

allowed_units = set().union(*entity_unit_map.values())

# Guard against the tables drifting apart again.
assert all(default_units[name] <= entity_unit_map[name] for name in entity_unit_map), \
    "every default unit must be an allowed unit for its entity"

def clean_and_normalize_units(row):
    """Turn one row's free-text model output into a "<value> <unit>" string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['entity_name']`` and ``row['model_output']``.

    Returns
    -------
    str
        * ``"<value> <unit>"`` when the first number in the text is followed by
          a unit that, after lookup in ``unit_mapping``, is allowed for the
          entity according to ``entity_unit_map``;
        * ``"<value> <default>"`` (default taken from ``default_units``) when a
          number is found but its unit is missing or not allowed;
        * ``""`` when ``model_output`` is not a string (e.g. NaN from an empty
          CSV cell) or contains no number.
    """
    entity = row['entity_name']
    raw_output = row['model_output']

    # Missing values arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(raw_output, str):
        return ""

    def first_number(text):
        """Return the first number in `text` as a float, or 0 when there is none."""
        found = re.search(r"(\d+\.?\d*)", text)
        return float(found.group(1)) if found else 0

    # Drop everything except digits, letters, '.', '"', '-' and spaces
    # (this also removes thousands separators such as the comma in "1,500").
    cleaned_output = re.sub(r"[^-0-9a-zA-Z.\" ]+", "", raw_output)

    # Treat '-' as a range separator and keep the piece with the largest number.
    if '-' in cleaned_output:
        cleaned_output = max(cleaned_output.split('-'), key=first_number)

    # First number, optionally followed by a unit word or the '"' symbol.
    match = re.search(r"(\d+\.?\d*)\s*([a-zA-Z\"]+)?", cleaned_output)
    if match is None:
        return ""

    value, unit = match.groups()
    if unit:
        unit_key = unit.strip().lower()
        # Unknown spellings fall through unchanged so canonical names still validate.
        normalized_unit = unit_mapping.get(unit_key, unit_key)
        if normalized_unit in entity_unit_map[entity]:
            return f"{value} {normalized_unit}"

    # No usable unit: fall back to the entity's configured default unit.
    default_unit = list(default_units[entity])[0]
    return f"{value} {default_unit}"

# Derive the `prediction` column by running the normaliser over every row
# (axis="columns" passes one row at a time to the function).
df['prediction'] = df.apply(clean_and_normalize_units, axis="columns")

df



Unnamed: 0.1,Unnamed: 0,index,entity_name,model_output,prediction
0,0,0,height,2.68 cm,2.68 centimetre
1,1,1,width,The width of the item is 15.5 cm.,15.5 centimetre
2,2,2,height,200mm,200 millimetre
3,3,3,depth,20cm,20 centimetre
4,4,4,depth,10.5 inches,10.5 inch
...,...,...,...,...,...
131182,35182,131283,maximum_weight_recommendation,"1,500 LB",1500 pound
131183,35183,131284,item_weight,The image does not provide any information abo...,
131184,35184,131285,maximum_weight_recommendation,The image does not provide a maximum weight re...,
131185,35185,131286,item_weight,The image does not provide any information abo...,




In [None]:
# Lookup tables used to normalise the units found in the model output.
#
# unit_mapping    : abbreviation / plural / alternate spelling / symbol (lower-case)
#                   -> canonical unit name
# entity_unit_map : entity name -> set of canonical units that are valid for it
# default_units   : entity name -> single-element set holding the fallback unit
unit_mapping = {
    # Length
    'cm': 'centimetre', 'mm': 'millimetre', 'm': 'metre',
    'in': 'inch', 'ft': 'foot', 'yd': 'yard',
    '"': 'inch',  # a double quote after a number denotes inches
    'inches': 'inch', 'feet': 'foot', 'yards': 'yard',
    'centimetres': 'centimetre', 'millimetres': 'millimetre', 'metres': 'metre',
    'centimeter': 'centimetre', 'centimeters': 'centimetre',
    'millimeter': 'millimetre', 'millimeters': 'millimetre',
    'meter': 'metre', 'meters': 'metre',
    # Weight
    'g': 'gram', 'kg': 'kilogram', 'mcg': 'microgram', 'mg': 'milligram',
    'oz': 'ounce', 'lb': 'pound', 'lbs': 'pound', 't': 'ton',
    'grams': 'gram', 'kilograms': 'kilogram', 'micrograms': 'microgram',
    'milligrams': 'milligram', 'ounces': 'ounce', 'pounds': 'pound', 'tons': 'ton',
    # Voltage
    'kv': 'kilovolt', 'mv': 'millivolt', 'v': 'volt',
    'kilovolts': 'kilovolt', 'millivolts': 'millivolt', 'volts': 'volt',
    # Wattage
    'kw': 'kilowatt', 'w': 'watt',
    'kilowatts': 'kilowatt', 'watts': 'watt',
    # Volume
    'cl': 'centilitre', 'cuft': 'cubic foot', 'cuin': 'cubic inch', 'cup': 'cup',
    'dl': 'decilitre', 'fl': 'fluid ounce', 'gal': 'gallon', 'igal': 'imperial gallon',
    'l': 'litre', 'mcl': 'microlitre', 'ml': 'millilitre', 'pt': 'pint', 'qt': 'quart',
    'cups': 'cup', 'pints': 'pint', 'quarts': 'quart',
    'gallons': 'gallon', 'imperial gallons': 'imperial gallon',
    'cubic feet': 'cubic foot', 'cubic inches': 'cubic inch',
    'fluid ounces': 'fluid ounce',
    'centilitres': 'centilitre', 'decilitres': 'decilitre', 'litres': 'litre',
    'microlitres': 'microlitre', 'millilitres': 'millilitre',
    'liter': 'litre', 'liters': 'litre',
    'milliliter': 'millilitre', 'milliliters': 'millilitre',
}

# Canonical unit families; copied into entity_unit_map so every entity owns its own set.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
    'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart',
)

entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': set(_VOLUME_UNITS),
}

# Fallback unit per entity, kept as single-element sets.
# The length default is spelled 'metre' so that it matches entity_unit_map
# (the previous 'meter' spelling was not an allowed unit).
default_units = {
    'width': {'metre'},
    'depth': {'metre'},
    'height': {'metre'},
    'item_weight': {'gram'},
    'maximum_weight_recommendation': {'gram'},
    'voltage': {'volt'},
    'wattage': {'watt'},
    'item_volume': {'litre'},
}

# Guard against the tables drifting apart again.
assert all(default_units[name] <= entity_unit_map[name] for name in entity_unit_map), \
    "every default unit must be an allowed unit for its entity"

def format_number(number_str):
    """Normalise a numeric string without changing the value it represents.

    ".3" becomes "0.3", "3." and "3.0" become "3", and "10.50" becomes "10.5".
    The value is no longer rounded to one decimal place, so "2.68" stays
    "2.68" instead of turning into "2.7".

    Parameters
    ----------
    number_str : str
        Text that should parse as a float.

    Returns
    -------
    str
        The number in plain positional notation (never scientific notation),
        or the literal string "Invalid" when the input cannot be parsed as a
        finite number.
    """
    import math
    from decimal import Decimal

    try:
        value = float(number_str)
    except (TypeError, ValueError):
        # TypeError covers None and other non-string, non-numeric input.
        return "Invalid"

    if not math.isfinite(value):
        return "Invalid"

    # Whole numbers are written without a decimal part.
    if value.is_integer():
        return str(int(value))

    # repr() gives the shortest string that round-trips the float; routing it
    # through Decimal's "f" format expands tiny values such as 1e-05 to 0.00001.
    return format(Decimal(repr(value)), "f")

def clean_and_normalize_units(row):
    """Turn one row's free-text model output into a "<value> <unit>" string.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['entity_name']`` and ``row['model_output']``.

    Returns
    -------
    str
        ``"<value> <unit>"`` when a number followed by a unit is found and the
        unit, after lookup in ``unit_mapping``, is allowed for the entity in
        ``entity_unit_map``; the value is passed through ``format_number``.
        ``""`` when ``model_output`` is not a string (e.g. NaN from an empty
        CSV cell), when no number-plus-unit pair is found, or when the unit is
        not allowed for the entity.

    Notes
    -----
    * "a-b" ranges: the piece whose first number is largest is kept.
    * "a x b" dimensions: for 'height' and 'depth' the second piece is used,
      for every other entity the piece with the largest number is used.
      An 'x' only counts as a dimension separator when a digit follows it, so
      words that merely contain the letter ("max", "approx", "box") no longer
      split the text.
    """
    entity = row['entity_name']
    raw_output = row['model_output']

    # Missing values arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(raw_output, str):
        return ""

    def first_number(text):
        """Return the first number in `text` as a float, or 0 when there is none."""
        found = re.search(r"(\d+\.?\d*)", text)
        return float(found.group(1)) if found else 0

    # Drop everything except digits, letters, '.', '"', '-' and spaces
    # (this also removes thousands separators such as the comma in "1,500").
    cleaned_output = re.sub(r"[^-0-9a-zA-Z.\"x ]+", "", raw_output)

    # Treat '-' as a range separator and keep the piece with the largest number.
    if '-' in cleaned_output:
        cleaned_output = max(cleaned_output.split('-'), key=first_number)

    # Split "10 x 20", "10cm x 20cm", "10x20" ... but not "20 cm max".
    parts = re.split(r'\s*x\s*(?=\d)', cleaned_output)
    if len(parts) > 1:
        if entity in {'height', 'depth'}:
            # For 'height' or 'depth', use the second value
            cleaned_output = parts[1].strip()
        else:
            # For other entities, use the larger value
            cleaned_output = max(parts, key=first_number)

    # First number that is directly followed by a unit word or the '"' symbol.
    match = re.search(r"(\d+\.?\d*)\s*([a-zA-Z\"]+)", cleaned_output)
    if match:
        value, unit = match.groups()
        formatted_value = format_number(value)

        unit_key = unit.strip().lower()
        # Unknown spellings fall through unchanged so canonical names still validate.
        normalized_unit = unit_mapping.get(unit_key, unit_key)

        # Validate if the unit is allowed for the entity
        if normalized_unit in entity_unit_map[entity]:
            return f"{formatted_value} {normalized_unit}"

    # Nothing usable was found.
    return ""

# Recompute the `prediction` column by running the normaliser over every row
# (axis="columns" passes one row at a time to the function).
df['prediction'] = df.apply(clean_and_normalize_units, axis="columns")

# Show the frame together with the refreshed column
df


Unnamed: 0.1,Unnamed: 0,index,entity_name,model_output,prediction
0,0,0,height,2.68 cm,2.7 centimetre
1,1,1,width,The width of the item is 15.5 cm.,15.5 centimetre
2,2,2,height,200mm,200 millimetre
3,3,3,depth,20cm,20 centimetre
4,4,4,depth,10.5 inches,10.5 inch
...,...,...,...,...,...
131182,35182,131283,maximum_weight_recommendation,"1,500 LB",1500 pound
131183,35183,131284,item_weight,The image does not provide any information abo...,
131184,35184,131285,maximum_weight_recommendation,The image does not provide a maximum weight re...,
131185,35185,131286,item_weight,The image does not provide any information abo...,




In [None]:
# Keep only the row identifier and the normalised prediction, then write them
# out without the pandas index column.
df2 = df.loc[:, ['index', 'prediction']]
df2.to_csv(path_or_buf='prediction3.csv', index=False)