In [None]:
import duckdb
from sqlalchemy import create_engine
import pandas as pd

# Load the `%sql` magic (JupySQL) and configure it for this notebook.
%load_ext sql
# autopandas: have %sql return query results as pandas DataFrames.
%config SqlMagic.autopandas = True
# feedback: turn off the magic's feedback messages.
%config SqlMagic.feedback = False

# Create DuckDB connection (in-memory database, opened through SQLAlchemy).
# NOTE: this needs the `duckdb-engine` package, which provides the SQLAlchemy
# dialect for DuckDB. Without it the line fails with
# "MissingPackageError: Can't load plugin: sqlalchemy.dialects:duckdb";
# install it with `%pip install duckdb-engine --quiet`.
%sql duckdb:///:memory:

MissingPackageError: Can't load plugin: sqlalchemy.dialects:duckdb

To fix it, run this in your notebook: %pip install duckdb-engine --quiet

For more details, see: https://jupysql.ploomber.io/en/latest/howto/db-drivers.html#duckdb


Cek apakah data CSV dan data Excel sama atau tidak.

In [6]:
# Read the CSV version of the dataset with default parsing and display it.
csv_data = pd.read_csv(filepath_or_buffer="dataset/data.csv")
csv_data

Unnamed: 0,Date,BranchID,BranchName,ProductCategory,ProductName,Quantity,Price,TotalSales,Salesperson
0,2025-01-01,BR_02,Jakarta South,Electronics,Laptop ASUS Vivobook,3,8500000,25500000,Citra
1,2025-01-01,BR_04,Bandung Main,Electronics,Smartwatch Xiaomi,3,1200000,3600000,Dewi
2,2025-01-01,BR_06,Surabaya East,Fashion,Sepatu Sneakers,3,850000,2550000,Fitri
3,2025-01-01,BR_05,Surabaya Central,Electronics,Tablet iPad Air,2,9500000,19000000,Eko
4,2025-01-01,BR_03,Jakarta West,Beauty,Parfum EDT 100ml,3,450000,1350000,Eko
...,...,...,...,...,...,...,...,...,...
5140,2025-11-10,BR_02,Jakarta South,Electronics,Headphone Sony WH-1000,2,3200000,6400000,Eko
5141,2025-02-24,BR_03,Jakarta West,Electronics,Smartphone Samsung A54,1,4500000,4500000,Joko
5142,2025-01-01,BR_07,Semarang,Electronics,Smartwatch Xiaomi,4,1200000,4800000,Dewi
5143,2025-04-09,BR_05,Surabaya Central,Home & Living,Lampu LED Smart,5,250000,1250000,Andi


In [9]:
# Read the Excel version of the dataset and display it.
# pandas delegates .xlsx reading to openpyxl, so a compatible version must be installed.
excel_data = pd.read_excel(io="dataset/data.xlsx")
excel_data

ImportError: Pandas requires version '3.1.0' or newer of 'openpyxl' (version '3.0.10' currently installed).

## Fix: upgrade openpyxl

In [None]:
# pandas refused the installed openpyxl 3.0.10 and asks for 3.1.0 or newer;
# `%pip` installs into the environment of the running kernel.
# NOTE(review): if openpyxl was already imported in this session, a kernel
# restart may be needed before the new version is picked up — confirm.
%pip install "openpyxl>=3.1.0" --quiet

In [None]:
# Read the workbook again, report its dimensions, and preview the first rows.
excel_data = pd.read_excel(io="dataset/data.xlsx")
print("Excel shape:", excel_data.shape)
excel_data.head(5)

---
## Challenge 1: Data Cleaning

### 1.1 Import & Initial Inspection

In [None]:
# Load the CSV for the analysis, asking pandas to parse "Date" as datetimes.
df = pd.read_csv(filepath_or_buffer="dataset/data.csv", parse_dates=["Date"])

# First look: dimensions, per-column dtypes, then the leading rows.
print("Shape:", df.shape)
print("\nDtypes:", df.dtypes, sep="\n")
print("\nSample:")
df.head(5)

In [None]:
# Structural overview of the raw frame; df.info() prints its report directly.
print("=== BASIC INFO ===")
df.info()
# Summary statistics for every column; include="all" also covers non-numeric columns.
print("\n=== DESCRIBE ===")
df.describe(include="all")

### 1.2 Date Filter: 2025-01-01 to 2026-03-31

In [None]:
# Analysis window; both bounds are inclusive.
start_date = pd.Timestamp("2025-01-01")
end_date   = pd.Timestamp("2026-03-31")

# Flag the rows dated inside the window (rows with a missing date fail the test),
# then take an independent copy so later edits never touch `df`.
in_window = df["Date"].between(start_date, end_date)
before_filter = len(df)
df_filtered = df.loc[in_window].copy()
after_filter = len(df_filtered)

# Report how many rows the window kept and the date span that remains.
print(f"Rows before date filter : {before_filter}")
print(f"Rows after  date filter : {after_filter}")
print(f"Rows removed            : {before_filter - after_filter}")
print(f"Date range in filtered  : {df_filtered['Date'].min()} to {df_filtered['Date'].max()}")

### 1.3 Anomaly Detection

In [None]:
# Tally of anomaly label -> number of affected rows; each check below adds one entry.
anomaly_log = {}

# 1. Duplicate rows (exact match)
# duplicated() marks every repeat after the first occurrence, i.e. the rows
# that dropping duplicates would remove.
dup_flags = df_filtered.duplicated()
n_dups = dup_flags.sum()
anomaly_log["Duplicate rows (exact match)"] = int(n_dups)
print(f"Duplicate rows to remove : {n_dups}")

# Preview: keep=False marks all members of each duplicate group, first occurrence included.
df_filtered.loc[df_filtered.duplicated(keep=False)].head(6)

In [None]:
# 2. Negative values (Quantity, Price, TotalSales)
# A row is flagged when at least one of the three amount columns is below zero.
neg_mask = df_filtered[["Quantity", "Price", "TotalSales"]].lt(0).any(axis=1)
neg_rows = df_filtered.loc[neg_mask]

anomaly_log["Negative values (Quantity/Price/TotalSales)"] = len(neg_rows)
print(f"Rows with negative values : {len(neg_rows)}")
neg_rows

In [None]:
# 3. Future dates (Date > today = 2026-02-21)
# `today` is a fixed reference date, not the system clock.
today = pd.Timestamp("2026-02-21")
future_mask = df_filtered["Date"].gt(today)
future_rows = df_filtered.loc[future_mask]

anomaly_log["Future dates (> 2026-02-21)"] = len(future_rows)
print(f"Rows with future dates : {len(future_rows)}")
future_rows.sort_values(by="Date")

In [None]:
# 4. Price = 0 or TotalSales = 0 while Quantity > 0
# Something was sold (positive quantity) yet the unit price or the total is zero.
has_zero_amount = df_filtered["Price"].eq(0) | df_filtered["TotalSales"].eq(0)
zero_mask = has_zero_amount & df_filtered["Quantity"].gt(0)
zero_rows = df_filtered.loc[zero_mask]

anomaly_log["Price/TotalSales = 0 with transaction"] = len(zero_rows)
print(f"Rows with zero price/sales : {len(zero_rows)}")
zero_rows

In [None]:
# 5. Date gaps (missing dates in the range)
# Every calendar day from start_date to end_date that should be present ...
all_dates = pd.date_range(start=start_date, end=end_date, freq="D")
# ... versus the distinct days that actually occur (time of day stripped).
present_dates = df_filtered["Date"].dt.normalize().unique()
missing_dates = all_dates.difference(present_dates)

n_missing = len(missing_dates)
anomaly_log["Date gaps (missing dates)"] = n_missing
print(f"Missing dates in range : {len(missing_dates)}")
print(missing_dates.tolist())

In [None]:
# 6. TotalSales mismatch (Price * Quantity != TotalSales)
# Recompute the expected total per row and flag rows whose stored total differs.
expected_total = df_filtered["Price"].mul(df_filtered["Quantity"])
mismatch_mask = expected_total.ne(df_filtered["TotalSales"])
mismatch_rows = df_filtered.loc[mismatch_mask]

anomaly_log["TotalSales mismatch (Price x Qty != TotalSales)"] = len(mismatch_rows)
print(f"Rows with TotalSales mismatch : {len(mismatch_rows)}")
mismatch_rows.head(5)

### 1.4 Anomaly Summary Table

In [None]:
# Turn the anomaly tally into a two-column table, one row per anomaly type,
# in the order the checks were logged.
anomaly_summary = pd.DataFrame(
    {
        "Anomaly Type": list(anomaly_log.keys()),
        "Count": list(anomaly_log.values()),
    }
)
print("=== ANOMALY DOCUMENTATION ===")
anomaly_summary

### 1.5 Data Cleaning — Remove Anomalies

In [None]:
# Build the cleaned dataset from the date-filtered frame by removing, in order:
# exact duplicates, rows with negative amounts, future-dated rows, and rows with
# a zero price/total on a real transaction. Each step prints the remaining row
# count and how many rows that step removed.
#
# NOTE(review): rows flagged by the "TotalSales mismatch" check in section 1.3
# are logged there but are NOT removed here — confirm this is intended.
df_clean = df_filtered.copy()
original_len = len(df_clean)

# Step 1: Remove duplicates (the first occurrence of each row is kept)
df_clean = df_clean.drop_duplicates()
print(f"After removing duplicates : {len(df_clean)} rows (removed {original_len - len(df_clean)})")

# Step 2: Remove negative values
neg_mask2 = (
    (df_clean["Quantity"]   < 0) |
    (df_clean["Price"]      < 0) |
    (df_clean["TotalSales"] < 0)
)
before = len(df_clean)
df_clean = df_clean.loc[~neg_mask2]
print(f"After removing negatives  : {len(df_clean)} rows (removed {before - len(df_clean)})")

# Step 3: Remove future dates (relative to the fixed reference date, not the system clock)
today = pd.Timestamp("2026-02-21")
before = len(df_clean)
df_clean = df_clean.loc[df_clean["Date"] <= today]
print(f"After removing futures    : {len(df_clean)} rows (removed {before - len(df_clean)})")

# Step 4: Remove zero-price/sales rows
zero_mask2 = (
    ((df_clean["Price"] == 0) | (df_clean["TotalSales"] == 0))
    & (df_clean["Quantity"] > 0)
)
before = len(df_clean)
# Finish with an explicit copy: later cells add columns to df_clean, and assigning
# to the result of a boolean filter raises SettingWithCopyWarning on pandas
# versions without copy-on-write.
df_clean = df_clean.loc[~zero_mask2].copy()
print(f"After removing zero price : {len(df_clean)} rows (removed {before - len(df_clean)})")

# Sanity checks: each anomaly class removed above must be absent from the result.
assert not df_clean.duplicated().any(), "duplicate rows remain"
assert not (df_clean[["Quantity", "Price", "TotalSales"]] < 0).any().any(), "negative values remain"
assert (df_clean["Date"] <= today).all(), "future-dated rows remain"
assert not (
    ((df_clean["Price"] == 0) | (df_clean["TotalSales"] == 0)) & (df_clean["Quantity"] > 0)
).any(), "zero price/sales rows remain"

print(f"\nFinal clean dataset: {len(df_clean)} rows")
df_clean.head()

---
## Challenge 2: Trend Analysis

### 2.1 Most Profitable Branch

In [None]:
# Total revenue per branch, ordered from highest to lowest.
branch_sales = (
    df_clean.groupby(["BranchID", "BranchName"])
    .agg(TotalRevenue=("TotalSales", "sum"))
    .reset_index()
    .sort_values(by="TotalRevenue", ascending=False)
)

# Position in the sorted table (1 = highest revenue) and a Rupiah-formatted label.
branch_sales["Rank"] = range(1, len(branch_sales) + 1)
branch_sales["TotalRevenue_fmt"] = branch_sales["TotalRevenue"].map(
    lambda x: f"Rp {x:,.0f}"
)

print("=== BRANCH PROFITABILITY ===")
# First and last rows of the sorted table are the best and worst branch.
best_branch = branch_sales.iloc[0]
print(f"Top    : {best_branch['BranchName']} — {best_branch['TotalRevenue_fmt']}")
worst_branch = branch_sales.iloc[-1]
print(f"Worst  : {worst_branch['BranchName']} — {worst_branch['TotalRevenue_fmt']}")
branch_sales[["Rank","BranchName","TotalRevenue_fmt"]]

### 2.2 Best-Selling Products

In [None]:
# Units sold and revenue per product (within its category), highest revenue first.
product_totals = (
    df_clean.groupby(["ProductCategory", "ProductName"])
    .agg(TotalQty=("Quantity","sum"), TotalRev=("TotalSales","sum"))
    .reset_index()
)
product_sales = product_totals.sort_values(by="TotalRev", ascending=False)
product_sales["TotalRev_fmt"] = product_sales["TotalRev"].map(lambda x: f"Rp {x:,.0f}")

print("=== TOP 10 PRODUCTS BY REVENUE ===")
product_sales[["ProductCategory","ProductName","TotalQty","TotalRev_fmt"]].head(10)

In [None]:
# Revenue per product category, largest first, with a fresh 0..n-1 index.
category_totals = df_clean.groupby("ProductCategory")["TotalSales"].sum()
cat_sales = category_totals.sort_values(ascending=False).reset_index()
cat_sales["TotalSales_fmt"] = cat_sales["TotalSales"].map(lambda x: f"Rp {x:,.0f}")

print("=== SALES BY PRODUCT CATEGORY ===")
cat_sales[["ProductCategory","TotalSales_fmt"]]

### 2.3 Seasonal & Time Trends

In [None]:
# Tag each row with its calendar month (monthly Period), then total sales per month.
df_clean["YearMonth"] = df_clean["Date"].dt.to_period("M")
monthly = (
    df_clean.groupby("YearMonth")
    .agg(TotalSales=("TotalSales", "sum"))
    .reset_index()
)
monthly["TotalSales_fmt"] = monthly["TotalSales"].map(lambda x: f"Rp {x:,.0f}")

print("=== MONTHLY SALES TREND ===")
monthly[["YearMonth","TotalSales_fmt"]]

In [None]:
# Indonesian weekday names keyed by pandas' weekday number (0 = Monday ... 6 = Sunday).
day_map = dict(enumerate(["Senin", "Selasa", "Rabu", "Kamis", "Jumat", "Sabtu", "Minggu"]))
df_clean["DayOfWeek"] = df_clean["Date"].dt.weekday
df_clean["DayName"]   = df_clean["DayOfWeek"].map(day_map)

# Total sales per weekday, listed Monday through Sunday.
dow_sales = (
    df_clean.groupby(["DayOfWeek","DayName"])
    .agg(TotalSales=("TotalSales", "sum"))
    .reset_index()
    .sort_values(by="DayOfWeek")
)
dow_sales["TotalSales_fmt"] = dow_sales["TotalSales"].map(lambda x: f"Rp {x:,.0f}")

print("=== SALES BY DAY OF WEEK ===")
# Look up the row labels of the extreme totals, then read off the weekday names.
best_day_idx = dow_sales["TotalSales"].idxmax()
print(f"Highest: {dow_sales.loc[best_day_idx, 'DayName']}")
worst_day_idx = dow_sales["TotalSales"].idxmin()
print(f"Lowest : {dow_sales.loc[worst_day_idx, 'DayName']}")
dow_sales[["DayName","TotalSales_fmt"]]

In [None]:
# Total sales credited to each salesperson, best performer first.
salesperson_totals = df_clean.groupby("Salesperson")["TotalSales"].sum().reset_index()
sp_sales = salesperson_totals.sort_values(by="TotalSales", ascending=False)
sp_sales["TotalSales_fmt"] = sp_sales["TotalSales"].map(lambda x: f"Rp {x:,.0f}")

print("=== SALESPERSON PERFORMANCE ===")
sp_sales[["Salesperson","TotalSales_fmt"]]

### 2.4 Insights
- Perhatikan bulan dengan penjualan tertinggi/terendah
- Cek apakah hari kerja vs weekend mempengaruhi penjualan
- Identifikasi produk/kategori yang mendominasi revenue

---
## Challenge 3: Final Numbers

In [None]:
# Angka kunci setelah data cleaning — harus EXACT MATCH
total_quantity    = int(df_clean["Quantity"].sum())
total_sales       = int(df_clean["TotalSales"].sum())
total_salesperson = int(df_clean["Salesperson"].nunique())

print("=" * 45)
print(f"  Quantity Total     : {total_quantity:,}")
print(f"  TotalSales Total   : Rp {total_sales:,.0f}")
print(f"  Salesperson Total  : {total_salesperson}")
print("=" * 45)