In [5]:
import pandas as pd 
import numpy as np 
import phonenumbers
from datetime import datetime

In [3]:
!pip install phonenumbers



In [25]:
def read_raw_data(file_name, sep=None):
    missing_col = []
    df = pd.read_csv(file_name)
    return df

In [31]:
customers = pd.read_csv("Data-Architecture-design/customers_raw.csv")
products  = pd.read_csv("Data-Architecture-design/products_raw.csv")
sales     = pd.read_csv("Data-Architecture-design/sales_raw.csv")

print(customers.shape, products.shape, sales.shape)

(27, 7) (20, 5) (41, 7)


In [24]:
def find_treat_missing_val(df):
    missing_col = []
    
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    
    for col, cnt in df.isna().sum().items():
        if cnt > 0:
            missing_col.append(f"{col} is having {cnt} missing rows")
            
            if col in numeric_cols:
                df[col] = df[col].fillna(df[col].median())
            else:
                df.dropna(subset=[col], inplace=True)
                
    return df, missing_col

In [22]:
def extract_data():
    customers = pd.read_csv(r"C:\Users\paul\Documents\Data-Architecture-design\customers_raw.csv")
    products = pd.read_csv(r"C:\Users\paul\Documents\Data-Architecture-design\products_raw.csv")
    sales = pd.read_csv(r"C:\Users\paul\Documents\Data-Architecture-design\sales_raw.csv")
    return customers, products, sales

In [35]:
def remove_duplicates(df):
    before = len(df)
    df = df.drop_duplicates()
    after = len(df)
    return df, before - after

customers, cust_dup = remove_duplicates(customers)
products, prod_dup  = remove_duplicates(products)
sales, sales_dup    = remove_duplicates(sales)

cust_dup, prod_dup, sales_dup

(1, 0, 1)

In [36]:
def handle_missing(df):
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if df[col].dtype in ["int64", "float64"]:
                df[col].fillna(df[col].median(), inplace=True)
            else:
                df.dropna(subset=[col], inplace=True)
    return df

customers = handle_missing(customers)
products  = handle_missing(products)
sales     = handle_missing(sales)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [8]:
def clean_customers(df):
    df["email"] = df["email"].fillna("unknown@email.com")
    df = df.dropna(subset=["phone"])
    return df

In [9]:
def clean_products(df):
    df["price"] = df["price"].fillna(df["price"].mean())
    df["stock"] = df["stock"].fillna(0)
    return df

In [37]:
def format_phone(phone):
    try:
        parsed = phonenumbers.parse(str(phone), "IN")
        return phonenumbers.format_number(
            parsed,
            phonenumbers.PhoneNumberFormat.E164
        )
    except:
        return None
    
customers["phone"] = customers["phone"].apply(format_phone)
customers = customers.dropna(subset=["phone"])

In [38]:
products["category"] = products["category"].str.strip().str.title()
products["category"].unique()

array(['Electronics', 'Fashion', 'Groceries'], dtype=object)

In [42]:
sales["transaction_date"] = pd.to_datetime(
    sales["transaction_date"], errors="coerce"
)

sales = sales.dropna(subset=["transaction_date"])
sales["transaction_date"] = sales["transaction_date"].dt.strftime("%Y-%m-%d")

In [44]:
# Customers ID
if "customer_id" not in customers.columns:
    customers.insert(0, "customer_id", range(1, len(customers) + 1))

# Products ID
if "product_id" not in products.columns:
    products.insert(0, "product_id", range(1, len(products) + 1))

# Orders ID
if "order_id" not in sales.columns:
    sales.insert(0, "order_id", range(1, len(sales) + 1))

In [45]:
report = {
    "customers_records": len(customers),
    "products_records": len(products),
    "sales_records": len(sales),
    "customer_duplicates_removed": cust_dup,
    "product_duplicates_removed": prod_dup,
    "sales_duplicates_removed": sales_dup
}

report

{'customers_records': 20,
 'products_records': 20,
 'sales_records': 25,
 'customer_duplicates_removed': 1,
 'product_duplicates_removed': 0,
 'sales_duplicates_removed': 1}

In [46]:
customers.to_csv("customers_clean.csv", index=False)
products.to_csv("products_clean.csv", index=False)
sales.to_csv("sales_clean.csv", index=False)