In [2]:
import pandas as pd
import numpy as np

# Load the two input tables from CSV.
# NOTE(review): the final cell of this notebook writes the cleaned frames back
# to these same two paths, so after a full run these files no longer hold the
# untouched originals.
materials = pd.read_csv("/content/materials.csv")
products = pd.read_csv("/content/products.csv")

In [3]:
# Text columns of `products` whose values get leading/trailing whitespace removed.
str_cols = [
    "product_id",
    "product_name",
    "product_category",
    "fragility_level",
    "temperature_sensitivity",
    "moisture_sensitivity",
    "packaging_format",
    "current_packaging_material",
]

# Strip every listed column in one column-wise pass; each column is handed to
# the lambda as a Series, so the `.str` accessor applies exactly as it would
# in a per-column loop.
products[str_cols] = products[str_cols].apply(lambda column: column.str.strip())

In [4]:
# Normalise the casing of the categorical text columns to Title Case so that
# variants differing only in letter case collapse to a single label.
# NOTE(review): str.title() also lower-cases every letter after the first in
# each word, so all-caps labels would be altered — confirm that is acceptable.
title_case_cols = [
    "fragility_level",
    "temperature_sensitivity",
    "moisture_sensitivity",
    "product_category",
]
for column in title_case_cols:
    products[column] = products[column].str.title()

In [5]:
# Rename the CO2 emission column to `co2_emission_per_kg`.
# `rename` returns a new frame; rebinding the name (instead of `inplace=True`)
# avoids mutating the loaded frame in place and keeps the call chainable.
# Labels absent from the frame are ignored by `rename`'s default settings, so
# re-running this cell after the column has already been renamed is a no-op.
materials = materials.rename(
    columns={"co2_emission_kg_per_kg": "co2_emission_per_kg"}
)

In [6]:
# Display the column labels of `materials` to check the result of the rename.
materials.columns

Index(['material_id', 'material_type', 'strength_mpa', 'weight_capacity',
       'co2_emission_per_kg', 'biodegradability_score', 'recyclability_pct',
       'cost_inr_per_kg', 'material_category'],
      dtype='object')

In [7]:
def flag_outliers(series, lower_q=0.01, upper_q=0.99):
    """Return a boolean mask marking values outside a quantile band.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to examine.
    lower_q : float, default 0.01
        Quantile whose value is the lower cutoff.
    upper_q : float, default 0.99
        Quantile whose value is the upper cutoff.

    Returns
    -------
    pandas.Series
        Boolean series on the same index as ``series``; ``True`` where the
        value is strictly below the lower cutoff or strictly above the upper
        cutoff. Values equal to a cutoff are not flagged.
    """
    # One quantile call yields both cutoffs, in the order requested.
    cutoffs = series.quantile([lower_q, upper_q])
    floor, ceiling = cutoffs.iloc[0], cutoffs.iloc[1]

    too_small = series.lt(floor)
    too_large = series.gt(ceiling)
    return too_small | too_large

# Attach one boolean outlier-flag column per numeric measure, using the
# default quantile cutoffs of flag_outliers. Keys are the new flag columns,
# values are the columns they are computed from.
outlier_targets = {
    "flag_weight_outlier": "product_weight_g",
    "flag_volume_outlier": "product_volume_cm3",
    "flag_price_outlier": "price_inr",
}
for flag_name, source_col in outlier_targets.items():
    products[flag_name] = flag_outliers(products[source_col])

In [8]:
# Count how many products are flagged / not flagged as weight outliers.
products["flag_weight_outlier"].value_counts()

Unnamed: 0_level_0,count
flag_weight_outlier,Unnamed: 1_level_1
False,450
True,7


In [9]:
# Count how many products are flagged / not flagged as price outliers.
products["flag_price_outlier"].value_counts()

Unnamed: 0_level_0,count
flag_price_outlier,Unnamed: 1_level_1
False,453
True,4


In [10]:
# Show the name and weight of the first five products flagged as weight
# outliers. The flag only marks values outside the quantile cutoffs, so this
# is a way to see which rows were caught before deciding how to treat them.
products.loc[products["flag_weight_outlier"],
             ["product_name", "product_weight_g"]].head(5)

Unnamed: 0,product_name,product_weight_g
53,Lakmé Lipstick 5 g,5.0
148,Everest Rice 10000 g,10000.0
171,Lakmé Lipstick 5 g v2,5.0
342,Saffola Flour 10000 g,10000.0
367,Fortune Rice 10000 g,10000.0


In [11]:
# Write both frames back to disk without the index column.
# NOTE(review): these destinations are the same paths the notebook reads its
# inputs from, so the input files are overwritten by this cell — confirm that
# is intended, or keep a copy of the originals elsewhere.
for frame, destination in (
    (materials, "/content/materials.csv"),
    (products, "/content/products.csv"),
):
    frame.to_csv(destination, index=False)

print("Data Cleaning completed successfully.")

Data Cleaning completed successfully.
