In [2]:
import pandas as pd

# Forward slashes work on Windows and POSIX alike and avoid the invalid "\p"
# escape sequence that the backslash spelling produced.
RAW_PRODUCTS_CSV = "data/product.csv"
# The cleaned table goes to its own file so the raw input is never overwritten
# and this cell gives the same result every time it is re-run.
CLEANED_PRODUCTS_CSV = "data/products_cleaned.csv"


def clean_products(raw_df):
    """Return a cleaned copy of a products table; ``raw_df`` is not modified.

    Steps, in order:
      1. Normalise column names: strip surrounding whitespace, lower-case,
         replace spaces with underscores, remove parentheses.
      2. Drop fully duplicated rows.
      3. Drop rows whose ``product_type`` is missing (if the column exists).
      4. Cast ``weight``, ``weight_kg`` and ``fragility`` to float (those that exist).
      5. Fill missing values: column mean for int64/float64 columns,
         the string "unknown" for object columns.
      6. Title-case ``product_type`` and ``industry_type`` (those that exist).

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Table as loaded from the raw CSV. Column labels must be strings.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned DataFrame. The original row index labels are kept.

    Raises
    ------
    ValueError
        If one of the columns cast in step 4 holds a value that cannot be
        converted to float.
    """
    cleaned = raw_df.copy()

    # regex=False: "(" and ")" must be treated as literal characters. As
    # regular expressions they are invalid patterns, and the default value of
    # `regex` differs between pandas versions.
    cleaned.columns = (
        cleaned.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_", regex=False)
        .str.replace("(", "", regex=False)
        .str.replace(")", "", regex=False)
    )

    cleaned = cleaned.drop_duplicates()
    if "product_type" in cleaned.columns:
        cleaned = cleaned.dropna(subset=["product_type"])
    # Take an explicit copy after row filtering so the column assignments
    # below do not trigger SettingWithCopyWarning.
    cleaned = cleaned.copy()

    # Convert numeric columns ONLY if they exist
    for col in ("weight", "weight_kg", "fragility"):
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(float)

    # Fill missing values: mean for numeric columns, "unknown" for text columns
    numeric_cols = cleaned.select_dtypes(include=["int64", "float64"]).columns
    if len(numeric_cols) > 0:
        cleaned[numeric_cols] = cleaned[numeric_cols].fillna(cleaned[numeric_cols].mean())

    text_cols = cleaned.select_dtypes(include=["object"]).columns
    if len(text_cols) > 0:
        cleaned[text_cols] = cleaned[text_cols].fillna("unknown")

    # Standardize text
    for col in ("product_type", "industry_type"):
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].str.title()

    return cleaned


# In a notebook kernel __name__ is "__main__", so this always runs there; the
# guard only lets clean_products be imported without touching the filesystem.
if __name__ == "__main__":
    df = clean_products(pd.read_csv(RAW_PRODUCTS_CSV))
    print("Columns:", df.columns)

    # Save cleaned file (separate from the raw input)
    df.to_csv(CLEANED_PRODUCTS_CSV, index=False)
    print("products_cleaned.csv created")



Columns: Index(['product_id', 'product_name', 'sector', 'main_packaging_material',
       'strength_score', 'weight_capacity_score', 'barrier_score',
       'biodegradability_score', 'co2_emission_score', 'recyclability_percent',
       'cost_score', 'reuse_potential_score'],
      dtype='object')
products_cleaned.csv created


In [3]:
import os

# Absolute path of the "data" folder under the current working directory;
# the folder is created if it does not exist yet.
data_root = os.path.join(os.getcwd(), "data")
base_dir = os.path.abspath(data_root)
os.makedirs(base_dir, exist_ok=True)

# Write the DataFrame `df` from the earlier cleaning cell, without the index
# column, and report the full destination path.
cleaned_name = "products_cleaned.csv"
output_path = os.path.join(base_dir, cleaned_name)
df.to_csv(output_path, index=False)

print("products_cleaned.csv created at:", output_path)


products_cleaned.csv created at: c:\Users\dell\Downloads\EcopackAI\data\products_cleaned.csv
