## Checking for absence of units

In [2]:
import requests
import pandas as pd
from PIL import Image
from tqdm import tqdm
from io import BytesIO
import matplotlib.pyplot as plt

# Function to display an image from a URL
def display_image_from_url(url):
    response = requests.get(url)
    img = Image.open(BytesIO(response.content))
    # Display the image
    plt.imshow(img)
    plt.axis('off')  # Hide axes for a cleaner display
    plt.show()

In [44]:
df1 = pd.read_csv("cleaned_test_FINAL_500AM.csv")
df3 = pd.read_csv("1-56.csv")

# Counters
count = 0
unitless = 0
unitless_list = []

df_inuse = df1
for index, (row1, row2) in tqdm(enumerate(zip(df3.itertuples(), df_inuse.itertuples())), total=len(df3), desc = "Calculating Missing Units"):
    # Getting additional info
    entity_name = getattr(row1, 'entity_name')
    current_url = getattr(row1, 'image_link')
    # Getting prediction info
    amazon_index = getattr(row2, 'index')
    prediction = getattr(row2, 'prediction')
    #prediction = getattr(row2, 'entity_value')
    str_pred = str(prediction)

    # Checking if number present in unit i.e. alphabet must be present
    if any(char.isdigit() for char in str_pred):
        if not any(char.isalpha() for char in str_pred):
            #print(f"{amazon_index} | {prediction}")
            unitless_list.append((amazon_index, current_url, entity_name, str_pred))
            unitless += 1

# Unitless
print(f"Unitless: {unitless}")

for pair in unitless_list:
    print(f"{pair[0]} | {pair[1]} | {pair[2]} | {pair[3]}")
    display_image_from_url(pair[1])
    print("")

Calculating Missing Units: 100%|██████████| 131187/131187 [00:00<00:00, 293387.28it/s]

Unitless: 0





## Messed up Units

In [12]:
import re

# Load the CSV file
dirty_rec = pd.read_csv('No_Depth_ItemWeight.csv')

# Define the entity-unit mappings
entity_unit_map = {
    'width': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'depth': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'height': {'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard'},
    'item_weight': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'maximum_weight_recommendation': {'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton'},
    'voltage': {'kilovolt', 'millivolt', 'volt'},
    'wattage': {'kilowatt', 'watt'},
    'item_volume': {'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'}
}

# Flatten the allowed units into a set
allowed_units = {unit for entity in entity_unit_map for unit in entity_unit_map[entity]}

# Handle common mistakes in units
def common_mistake(unit):
    if unit in allowed_units:
        return unit
    if unit.replace('ter', 'tre') in allowed_units:
        return unit.replace('ter', 'tre')
    if unit.replace('feet', 'foot') in allowed_units:
        return unit.replace('feet', 'foot')
    return unit

# Parse a string to extract the numeric value and the unit, and flag dirty entries
def parse_string(s):
    s_stripped = "" if s is None or str(s) == 'nan' else s.strip()
    if s_stripped == "":
        return None, None, False
    # Regex to check if the string follows "number + space + unit"
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        return None, None, True  # Dirty entry due to wrong format
    
    # Split into number and unit parts
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    
    if unit not in allowed_units:
        return number, unit, True  # Dirty entry due to invalid unit
    
    return number, unit, False  # Clean entry

# Apply the parsing function to the 'prediction' column
# Only keep the dirty flag (no need to add other columns)
dirty_rec['is_dirty'] = dirty_rec['prediction'].apply(lambda x: parse_string(x)[2])

# Set the 'prediction' values of dirty entries to blank
dirty_rec.loc[dirty_rec['is_dirty'] == True, 'prediction'] = ''

# Drop the 'is_dirty' column after processing
dirty_rec.drop(columns=['is_dirty'], inplace=True)

# Save the modified DataFrame to a new CSV file
dirty_rec.to_csv('Cleaned_No_Depth_ItemWeight.csv', index=False)

# View the modified DataFrame
print(dirty_rec.head())

   index       prediction
0      0  6.68 centimetre
1      1  42.0 centimetre
2      2              NaN
3      3              NaN
4      4              NaN


## Checking number of entires

In [74]:
df1 = pd.read_csv("No_Width.csv")
count = 0

df_inuse = df1
for index, row in tqdm(df_inuse.iterrows(), total=len(df_inuse), desc="Calculating entries"):
    prediction = row['prediction']
    if any(char.isdigit() for char in str(prediction)): count += 1

print(f"Predictions: {count}/{len(df1)}")
print(f"% Filled: {count/(len(df1)+count)*100}")

Calculating entries: 100%|██████████| 131187/131187 [00:04<00:00, 29522.17it/s]

Predictions: 56530/131187
% Filled: 30.1144808408402





No_Depth
    Predictions: 83355/131187
    % Filled: 38.85253237128394

cleaned_No_Depth_Working 
    Predictions: 83409/131187
    % Filled: 38.867919252921766
    
test_final_wattage_update
    Predictions: 102396/131187
    % Filled: 43.837094309089274

## Removing all of a Quantity

In [73]:
c = True
count = 0
quantity = "width"

csv_nirvan = 'Priyansh/priyansh_1.csv'
df_nirvan = pd.read_csv(csv_nirvan)

# A
#if not c: df = pd.read_csv("test_final_wattage_update.csv")
#if c: df = pd.read_csv("No_Depth.csv")

# B
if not c: df = pd.read_csv("No_Depth.csv")
if c: df = pd.read_csv("No_Width.csv")

result_df = pd.DataFrame(columns=['index', 'prediction'])

for index, (row1, row2) in tqdm(enumerate(zip(df.itertuples(), df_nirvan.itertuples())), total=len(df), desc=f"Removing {quantity}"):

    # Orignial entity_name
    index_amazon = getattr(row2, 'index')
    entity_name = getattr(row2, 'entity_name')
    prediction = getattr(row1, 'prediction')

    # Removing depth
    if entity_name == quantity and any(char.isdigit() for char in str(prediction)):
        if not c: result_df = result_df.append({'index': index_amazon, 'prediction': ""}, ignore_index=True)
        #print(index_amazon)
        count += 1
    elif any(char.isdigit() for char in str(prediction)):
        if not c: result_df = result_df.append({'index': index_amazon, 'prediction': str(prediction)}, ignore_index=True)
    else:
        if not c: result_df = result_df.append({'index': index_amazon, 'prediction': ""}, ignore_index=True)
    
    #if index == 20: break

# Writing to File
if not c: 
    result_df.to_csv('No_Width.csv', index=False)
    print(".csv file written")
print(count)

Removing width: 100%|██████████| 131187/131187 [00:00<00:00, 399027.49it/s]

0



