In [None]:
import pandas as pd
import json
df = pd.read_csv("..\data\ecommerce.csv")
df.drop_duplicates(inplace=True)
df.fillna({
    "customer_phone": "N/A",
    "geoip.region_name": "Unknown"
}, inplace=True)
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
orders_list = []
for idx, row in df.iterrows():
    products_str = row['products']
    products_str = products_str.replace('""', '"').strip('"')
    try:
        products = json.loads(products_str)
        for p in products:
            orders_list.append({
                "order_id": row['order_id'],
                "customer_id": row['customer_id'],
                "product_id": p['product_id'],
                "product_name": p['product_name'],
                "category": p['category'],
                "price": p['price'],
                "quantity": p['quantity'],
                "order_date": row['order_date']
            })
    except:
        continue
df_products = pd.DataFrame(orders_list)
df_products.to_csv("../data/products_cleaned.csv", index=False)

df_customers = df[['customer_id','customer_first_name','customer_last_name','customer_gender','email','customer_phone']].drop_duplicates()
df_customers.to_csv("../data/customers_cleaned.csv", index=False)

df_orders = df[['order_id','customer_id','order_date','total_quantity','taxful_total_price']].drop_duplicates()
df_orders.rename(columns={'taxful_total_price':'total_price'}, inplace=True)
df_orders.to_csv("../data/orders_cleaned.csv", index=False)





In [6]:
import pandas as pd
import json

# -------------------------
# قراءة البيانات الأصلية
# -------------------------
df = pd.read_csv("../data/ecommerce.csv")

# -------------------------
# Customers
# -------------------------
df_customers = df[['customer_id','customer_first_name','customer_last_name','customer_gender','email','customer_phone']].copy()
df_customers['customer_phone'].fillna("N/A", inplace=True)
df_customers.drop_duplicates(subset=['customer_id'], inplace=True)  # حسب customer_id فقط
df_customers.to_csv("../data/customers_cleaned.csv", index=False)

# -------------------------
# Orders
# -------------------------
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
df_orders = df[['order_id','customer_id','order_date','total_quantity','taxful_total_price']].copy()
df_orders.rename(columns={'taxful_total_price':'total_price'}, inplace=True)
# إزالة $ وتحويل العمود total_price لرقم
df_orders['total_price'] = df_orders['total_price'].replace('[\$,]', '', regex=True).astype(float)
df_orders.drop_duplicates(subset=['order_id'], inplace=True)  # حسب order_id فقط
df_orders.to_csv("../data/orders_cleaned.csv", index=False)

# -------------------------
# Order Items
# -------------------------
orders_list = []
for idx, row in df.iterrows():
    products_str = row['products']
    products_str = products_str.replace('""', '"').strip('"')
    try:
        products = json.loads(products_str)
        for p in products:
            orders_list.append({
                "order_id": row['order_id'],
                "product_id": int(p['product_id']),
                "quantity": int(p['quantity']),
                "price": float(str(p['price']).replace('$',''))  # إزالة $ وتحويل لـ float
            })
    except:
        continue

df_order_items = pd.DataFrame(orders_list)
df_order_items.drop_duplicates(subset=['order_id','product_id'], inplace=True)

# التأكد إن كل order_id موجود في orders
df_order_items = df_order_items[df_order_items['order_id'].isin(df_orders['order_id'])]

df_order_items.to_csv("../data/order_items_cleaned.csv", index=False)

# -------------------------
# Products
# -------------------------
df_products = df[['products']].copy()
products_list = []

for idx, row in df.iterrows():
    products_str = row['products']
    products_str = products_str.replace('""', '"').strip('"')
    try:
        products = json.loads(products_str)
        for p in products:
            products_list.append({
                "product_id": int(p['product_id']),
                "product_name": p['product_name'],
                "category": p['category'],
                "price": float(str(p['price']).replace('$',''))
            })
    except:
        continue

df_products = pd.DataFrame(products_list).drop_duplicates(subset=['product_id'])
df_products.to_csv("../data/products_cleaned.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_customers['customer_phone'].fillna("N/A", inplace=True)
  df_customers['customer_phone'].fillna("N/A", inplace=True)
